Diffstat (limited to 'drivers/xen')
50 files changed, 4397 insertions, 1802 deletions
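A large part of this series reworks the event channel core into an ABI-neutral layer that dispatches through a per-ABI ops table; the 2-level backend in events/events_2l.c registers one such table (evtchn_ops_2l) near the end of this section. As a rough, simplified sketch of that dispatch pattern only -- the field list and signatures here are assumptions, the real definitions live in events/events_internal.h, which is not shown in this diff:

	/* Illustrative sketch, not part of the patch. */
	struct evtchn_ops {
		unsigned (*max_channels)(void);
		unsigned (*nr_channels)(void);
		void (*handle_events)(unsigned cpu);
		void (*mask)(unsigned port);
		void (*unmask)(unsigned port);
	};

	extern const struct evtchn_ops *evtchn_ops;

	/* Callers stay ABI-agnostic; the backend (2-level here, FIFO later)
	 * is selected once at init time, e.g. by xen_evtchn_2l_init(). */
	static inline void xen_evtchn_handle_events(unsigned cpu)
	{
		evtchn_ops->handle_events(cpu);
	}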
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index d4dffcd5287..38fb36e1c59 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -18,11 +18,10 @@ config XEN_SELFBALLOONING by the current usage of anonymous memory ("committed AS") and controlled by various sysfs-settable parameters. Configuring FRONTSWAP is highly recommended; if it is not configured, self- - ballooning is disabled by default but can be enabled with the - 'selfballooning' kernel boot parameter. If FRONTSWAP is configured, + ballooning is disabled by default. If FRONTSWAP is configured, frontswap-selfshrinking is enabled by default but can be disabled - with the 'noselfshrink' kernel boot parameter; and self-ballooning - is enabled by default but can be disabled with the 'noselfballooning' + with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning + is enabled by default but can be disabled with the 'tmem.selfballooning=0' kernel boot parameter. Note that systems without a sufficiently large swap device should not enable self-ballooning. @@ -140,12 +139,12 @@ config XEN_GRANT_DEV_ALLOC config SWIOTLB_XEN def_bool y - depends on PCI select SWIOTLB config XEN_TMEM - bool - default y if (CLEANCACHE || FRONTSWAP) + tristate + depends on !ARM && !ARM64 + default m if (CLEANCACHE || FRONTSWAP) help Shim to interface in-kernel Transcendent Memory hooks (e.g. cleancache and frontswap) to Xen tmem hypercalls. @@ -178,6 +177,40 @@ config XEN_PRIVCMD depends on XEN default m +config XEN_STUB + bool "Xen stub drivers" + depends on XEN && X86_64 && BROKEN + default n + help + Allow kernel to install stub drivers, to reserve space for Xen drivers, + i.e. memory hotplug and cpu hotplug, and to block native drivers loaded, + so that real Xen drivers can be modular. + + To enable Xen features like cpu and memory hotplug, select Y here. + +config XEN_ACPI_HOTPLUG_MEMORY + tristate "Xen ACPI memory hotplug" + depends on XEN_DOM0 && XEN_STUB && ACPI + default n + help + This is Xen ACPI memory hotplug. + + Currently Xen only support ACPI memory hot-add. If you want + to hot-add memory at runtime (the hot-added memory cannot be + removed until machine stop), select Y/M here, otherwise select N. + +config XEN_ACPI_HOTPLUG_CPU + tristate "Xen ACPI cpu hotplug" + depends on XEN_DOM0 && XEN_STUB && ACPI + select ACPI_CONTAINER + default n + help + Xen ACPI cpu enumerating and hotplugging + + For hotplugging, currently Xen only support ACPI cpu hotadd. + If you want to hotadd cpu at runtime (the hotadded cpu cannot + be removed until machine stop), select Y/M here. + config XEN_ACPI_PROCESSOR tristate "Xen ACPI processor" depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ @@ -188,7 +221,7 @@ config XEN_ACPI_PROCESSOR To do that the driver parses the Power Management data and uploads said information to the Xen hypervisor. Then the Xen hypervisor can - select the proper Cx and Pxx states. It also registers itslef as the + select the proper Cx and Pxx states. It also registers itself as the SMM so that other drivers (such as ACPI cpufreq scaling driver) will not load. 
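The help text above refers to 'tmem.selfballooning=0' and 'tmem.selfshrink=0'; with XEN_TMEM now tristate, these are ordinary module parameters of the tmem module, which the kernel command line addresses with the "module.parameter=value" syntax when the code is built in. A minimal, hypothetical sketch of how such boolean parameters are typically declared (variable names assumed, not the actual drivers/xen/tmem.c code):

	/* Illustrative sketch, not part of the patch. */
	#include <linux/module.h>

	static bool selfballooning = true;	/* tmem.selfballooning=0 disables it */
	module_param(selfballooning, bool, S_IRUGO);

	static bool selfshrinking = true;	/* tmem.selfshrink=0 disables it */
	module_param_named(selfshrink, selfshrinking, bool, S_IRUGO);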
@@ -204,4 +237,7 @@ config XEN_MCE_LOG Allow kernel fetching MCE error from Xen platform and converting it into Linux mcelog format for mcelog tools +config XEN_HAVE_PVMMU + bool + endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 0e863703545..45e00afa7f2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,8 +1,9 @@ -ifneq ($(CONFIG_ARM),y) -obj-y += manage.o balloon.o +ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o endif -obj-y += grant-table.o features.o events.o +obj-$(CONFIG_X86) += fallback.o +obj-y += grant-table.o features.o balloon.o manage.o +obj-y += events/ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) @@ -10,11 +11,11 @@ CFLAGS_features.o := $(nostackp) dom0-$(CONFIG_PCI) += pci.o dom0-$(CONFIG_USB_SUPPORT) += dbgp.o -dom0-$(CONFIG_ACPI) += acpi.o +dom0-$(CONFIG_ACPI) += acpi.o $(xen-pad-y) +xen-pad-$(CONFIG_X86) += xen-acpi-pad.o dom0-$(CONFIG_X86) += pcpu.o obj-$(CONFIG_XEN_DOM0) += $(dom0-y) obj-$(CONFIG_BLOCK) += biomerge.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o @@ -28,6 +29,9 @@ obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o +obj-$(CONFIG_XEN_STUB) += xen-stub.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c index 119d42a2bf5..90307c0b630 100644 --- a/drivers/xen/acpi.c +++ b/drivers/xen/acpi.c @@ -35,28 +35,43 @@ #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> -int xen_acpi_notify_hypervisor_state(u8 sleep_state, - u32 pm1a_cnt, u32 pm1b_cnt) +static int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 val_a, u32 val_b, + bool extended) { + unsigned int bits = extended ? 8 : 16; + struct xen_platform_op op = { .cmd = XENPF_enter_acpi_sleep, .interface_version = XENPF_INTERFACE_VERSION, - .u = { - .enter_acpi_sleep = { - .pm1a_cnt_val = (u16)pm1a_cnt, - .pm1b_cnt_val = (u16)pm1b_cnt, - .sleep_state = sleep_state, - }, + .u.enter_acpi_sleep = { + .val_a = (u16)val_a, + .val_b = (u16)val_b, + .sleep_state = sleep_state, + .flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0, }, }; - if ((pm1a_cnt & 0xffff0000) || (pm1b_cnt & 0xffff0000)) { - WARN(1, "Using more than 16bits of PM1A/B 0x%x/0x%x!" - "Email xen-devel@lists.xensource.com Thank you.\n", \ - pm1a_cnt, pm1b_cnt); + if (WARN((val_a & (~0 << bits)) || (val_b & (~0 << bits)), + "Using more than %u bits of sleep control values %#x/%#x!" + "Email xen-devel@lists.xen.org - Thank you.\n", \ + bits, val_a, val_b)) return -1; - } HYPERVISOR_dom0_op(&op); return 1; } + +int xen_acpi_notify_hypervisor_sleep(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnt) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt, + pm1b_cnt, false); +} + +int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state, + u32 val_a, u32 val_b) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, val_a, + val_b, true); +} diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 31ab82fda38..5c660c77f03 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -36,6 +36,9 @@ * IN THE SOFTWARE. 
*/ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/errno.h> @@ -50,12 +53,12 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/percpu-defs.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> -#include <asm/e820.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -88,15 +91,9 @@ struct balloon_stats balloon_stats; EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ -static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static DEFINE_PER_CPU(struct page *, balloon_scratch_page); -#ifdef CONFIG_HIGHMEM -#define inc_totalhigh_pages() (totalhigh_pages++) -#define dec_totalhigh_pages() (totalhigh_pages--) -#else -#define inc_totalhigh_pages() do {} while (0) -#define dec_totalhigh_pages() do {} while (0) -#endif /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); @@ -133,9 +130,7 @@ static void __balloon_append(struct page *page) static void balloon_append(struct page *page) { __balloon_append(page); - if (PageHighMem(page)) - dec_totalhigh_pages(); - totalram_pages--; + adjust_managed_page_count(page, -1); } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ @@ -152,24 +147,16 @@ static struct page *balloon_retrieve(bool prefer_highmem) page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); - if (PageHighMem(page)) { + if (PageHighMem(page)) balloon_stats.balloon_high--; - inc_totalhigh_pages(); - } else + else balloon_stats.balloon_low--; - totalram_pages++; + adjust_managed_page_count(page, 1); return page; } -static struct page *balloon_first_page(void) -{ - if (list_empty(&ballooned_pages)) - return NULL; - return list_entry(ballooned_pages.next, struct page, lru); -} - static struct page *balloon_next_page(struct page *page) { struct list_head *next = page->lru.next; @@ -243,7 +230,7 @@ static enum bp_state reserve_additional_memory(long credit) rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); if (rc) { - pr_info("xen_balloon: %s: add_memory() failed: %i\n", __func__, rc); + pr_info("%s: add_memory() failed: %i\n", __func__, rc); return BP_EAGAIN; } @@ -334,7 +321,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - page = balloon_first_page(); + page = list_first_entry_or_null(&ballooned_pages, struct page, lru); for (i = 0; i < nr_pages; i++) { if (!page) { nr_pages = i; @@ -355,25 +342,25 @@ static enum bp_state increase_reservation(unsigned long nr_pages) BUG_ON(page == NULL); pfn = page_to_pfn(page); - BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && - phys_to_machine_mapping_valid(pfn)); - - set_phys_to_machine(pfn, frame_list[i]); - - /* Link back into the page tables if not highmem. */ - if (xen_pv_domain() && !PageHighMem(page)) { - int ret; - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(frame_list[i], PAGE_KERNEL), - 0); - BUG_ON(ret); + +#ifdef CONFIG_XEN_HAVE_PVMMU + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_phys_to_machine(pfn, frame_list[i]); + + /* Link back into the page tables if not highmem. 
*/ + if (!PageHighMem(page)) { + int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(frame_list[i], PAGE_KERNEL), + 0); + BUG_ON(ret); + } } +#endif /* Relinquish the page back to the allocator. */ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } balloon_stats.current_pages += rc; @@ -406,37 +393,59 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_page(gfp)) == NULL) { + page = alloc_page(gfp); + if (page == NULL) { nr_pages = i; state = BP_EAGAIN; break; } - - pfn = page_to_pfn(page); - frame_list[i] = pfn_to_mfn(pfn); - scrub_page(page); - if (xen_pv_domain() && !PageHighMem(page)) { - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); - BUG_ON(ret); - } - + frame_list[i] = page_to_pfn(page); } - /* Ensure that ballooned highmem pages don't have kmaps. */ + /* + * Ensure that ballooned highmem pages don't have kmaps. + * + * Do this before changing the p2m as kmap_flush_unused() + * reads PTEs to obtain pages (and hence needs the original + * p2m entry). + */ kmap_flush_unused(); - flush_tlb_all(); - /* No more mappings: invalidate P2M and add to balloon. */ + /* Update direct mapping, invalidate P2M, and add to balloon. */ for (i = 0; i < nr_pages; i++) { - pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - balloon_append(pfn_to_page(pfn)); + pfn = frame_list[i]; + frame_list[i] = pfn_to_mfn(pfn); + page = pfn_to_page(pfn); + +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * Ballooned out frames are effectively replaced with + * a scratch frame. Ensure direct mappings and the + * p2m are consistent. + */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (!PageHighMem(page)) { + struct page *scratch_page = get_balloon_scratch_page(); + + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); + BUG_ON(ret); + + put_balloon_scratch_page(); + } + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } +#endif + + balloon_append(page); } + flush_tlb_all(); + set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); @@ -488,6 +497,18 @@ static void balloon_process(struct work_struct *work) mutex_unlock(&balloon_mutex); } +struct page *get_balloon_scratch_page(void) +{ + struct page *ret = get_cpu_var(balloon_scratch_page); + BUG_ON(ret == NULL); + return ret; +} + +void put_balloon_scratch_page(void) +{ + put_cpu_var(balloon_scratch_page); +} + /* Resets the Xen limit, sets new target, and kicks off processing. 
*/ void balloon_set_new_target(unsigned long target) { @@ -581,18 +602,66 @@ static void __init balloon_add_region(unsigned long start_pfn, } } +static int alloc_balloon_scratch_page(int cpu) +{ + if (per_cpu(balloon_scratch_page, cpu) != NULL) + return 0; + + per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); + if (per_cpu(balloon_scratch_page, cpu) == NULL) { + pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); + return -ENOMEM; + } + + return 0; +} + + +static int balloon_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + if (alloc_balloon_scratch_page(cpu)) + return NOTIFY_BAD; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block balloon_cpu_notifier = { + .notifier_call = balloon_cpu_notify, +}; + static int __init balloon_init(void) { - int i; + int i, cpu; if (!xen_domain()) return -ENODEV; - pr_info("xen/balloon: Initialising balloon driver.\n"); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + register_cpu_notifier(&balloon_cpu_notifier); + + get_online_cpus(); + for_each_online_cpu(cpu) { + if (alloc_balloon_scratch_page(cpu)) { + put_online_cpus(); + unregister_cpu_notifier(&balloon_cpu_notifier); + return -ENOMEM; + } + } + put_online_cpus(); + } + + pr_info("Initialising balloon driver\n"); balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) - : max_pfn; + : get_num_physpages(); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; @@ -624,4 +693,15 @@ static int __init balloon_init(void) subsys_initcall(balloon_init); +static int __init balloon_clear(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(balloon_scratch_page, cpu) = NULL; + + return 0; +} +early_initcall(balloon_clear); + MODULE_LICENSE("GPL"); diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 4dcfced107f..cc6513a176b 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/notifier.h> #include <xen/xen.h> @@ -25,13 +27,13 @@ static void disable_hotplug_cpu(int cpu) static int vcpu_online(unsigned int cpu) { int err; - char dir[32], state[32]; + char dir[16], state[16]; sprintf(dir, "cpu/%u", cpu); - err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state); if (err != 1) { if (!xen_initial_domain()) - printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); + pr_err("Unable to read cpu state\n"); return err; } @@ -40,7 +42,7 @@ static int vcpu_online(unsigned int cpu) else if (strcmp(state, "offline") == 0) return 0; - printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu); + pr_err("unknown state(%s) on CPU%d\n", state, cpu); return -EINVAL; } static void vcpu_hotplug(unsigned int cpu) diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c index 42569c77ccc..8145a59fd9f 100644 --- a/drivers/xen/dbgp.c +++ b/drivers/xen/dbgp.c @@ -8,7 +8,9 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op) { +#ifdef CONFIG_PCI const struct device *ctrlr = hcd_to_bus(hcd)->controller; +#endif struct physdev_dbgp_op dbgp; if (!xen_initial_domain()) @@ -17,7 +19,7 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op) dbgp.op = op; #ifdef CONFIG_PCI - if (ctrlr->bus == &pci_bus_type) { + if (dev_is_pci(ctrlr)) { const struct pci_dev 
*pdev = to_pci_dev(ctrlr); dbgp.u.pci.seg = pci_domain_nr(pdev->bus); diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 00000000000..62be55cd981 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 00000000000..5db43fc100a --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,365 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). + */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], + cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ + return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); + set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. 
+ */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* Timer interrupt has highest priority. */ + irq = irq_from_virq(cpu, VIRQ_TIMER); + if (irq != -1) { + unsigned int evtchn = evtchn_from_irq(irq); + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) + generic_handle_irq(irq); + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. + */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. 
+ */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + int port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + irq = get_evtchn_to_irq(port); + + if (irq != -1) + generic_handle_irq(irq); + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, + get_evtchn_to_irq(i), + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? 
"" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + +static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .test_and_set_mask = evtchn_2l_test_and_set_mask, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events.c b/drivers/xen/events/events_base.c index 59e10a1286d..c919d3d5c84 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events/events_base.c @@ -21,6 +21,8 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/linkage.h> #include <linux/interrupt.h> #include <linux/irq.h> @@ -54,8 +56,13 @@ #include <xen/interface/hvm/params.h> #include <xen/interface/physdev.h> #include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> #include <asm/hw_irq.h> +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; + /* * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. @@ -70,56 +77,15 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -/* Interrupt types. */ -enum xen_irq_type { - IRQT_UNBOUND = 0, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM - * guest, or GSI (real passthrough IRQ) of the device. - * VIRQ - virq number - * IPI - IPI vector - * EVTCHN - - */ -struct irq_info { - struct list_head list; - int refcnt; - enum xen_irq_type type; /* type */ - unsigned irq; - unsigned short evtchn; /* event channel */ - unsigned short cpu; /* cpu bound */ - - union { - unsigned short virq; - enum ipi_vector ipi; - struct { - unsigned short pirq; - unsigned short gsi; - unsigned char vector; - unsigned char flags; - uint16_t domid; - } pirq; - } u; -}; -#define PIRQ_NEEDS_EOI (1 << 0) -#define PIRQ_SHAREABLE (1 << 1) - -static int *evtchn_to_irq; +int **evtchn_to_irq; +#ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; +#endif static bool (*pirq_needs_eoi)(unsigned irq); -static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], - cpu_evtchn_mask); +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) /* Xen will never allocate port zero for any purpose. 
*/ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -130,19 +96,75 @@ static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data); +static void clear_evtchn_to_irq_row(unsigned row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(row); + } +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ + unsigned row; + unsigned col; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); + if (evtchn_to_irq[row] == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(row); + } + + evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; + return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ + if (evtchn >= xen_evtchn_max_channels()) + return -1; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return -1; + return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} + /* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) +struct irq_info *info_for_irq(unsigned irq) { return irq_get_handler_data(irq); } /* Constructors for packed IRQ information. */ -static void xen_irq_info_common_init(struct irq_info *info, +static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, enum xen_irq_type type, - unsigned short evtchn, + unsigned evtchn, unsigned short cpu) { + int ret; BUG_ON(info->type != IRQT_UNBOUND && info->type != type); @@ -151,68 +173,78 @@ static void xen_irq_info_common_init(struct irq_info *info, info->evtchn = evtchn; info->cpu = cpu; - evtchn_to_irq[evtchn] = irq; + ret = set_evtchn_to_irq(evtchn, irq); + if (ret < 0) + return ret; + + irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + + return xen_evtchn_port_setup(info); } -static void xen_irq_info_evtchn_init(unsigned irq, - unsigned short evtchn) +static int xen_irq_info_evtchn_setup(unsigned irq, + unsigned evtchn) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); + return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); } -static void xen_irq_info_ipi_init(unsigned cpu, +static int xen_irq_info_ipi_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, + unsigned evtchn, enum ipi_vector ipi) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); - info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } -static void xen_irq_info_virq_init(unsigned cpu, +static int xen_irq_info_virq_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, - unsigned short virq) + unsigned evtchn, + unsigned virq) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); - info->u.virq = virq; per_cpu(virq_to_irq, cpu)[virq] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); } -static void xen_irq_info_pirq_init(unsigned irq, - unsigned short evtchn, - unsigned short pirq, - unsigned short gsi, - unsigned short vector, +static int 
xen_irq_info_pirq_setup(unsigned irq, + unsigned evtchn, + unsigned pirq, + unsigned gsi, uint16_t domid, unsigned char flags) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; - info->u.pirq.vector = vector; info->u.pirq.domid = domid; info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + info->evtchn = 0; } /* * Accessors for packed IRQ information. */ -static unsigned int evtchn_from_irq(unsigned irq) +unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) return 0; @@ -222,10 +254,15 @@ static unsigned int evtchn_from_irq(unsigned irq) unsigned irq_from_evtchn(unsigned int evtchn) { - return evtchn_to_irq[evtchn]; + return get_evtchn_to_irq(evtchn); } EXPORT_SYMBOL_GPL(irq_from_evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ + return per_cpu(virq_to_irq, cpu)[virq]; +} + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -261,14 +298,14 @@ static enum xen_irq_type type_from_irq(unsigned irq) return info_for_irq(irq)->type; } -static unsigned cpu_from_irq(unsigned irq) +unsigned cpu_from_irq(unsigned irq) { return info_for_irq(irq)->cpu; } -static unsigned int cpu_from_evtchn(unsigned int evtchn) +unsigned int cpu_from_evtchn(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); unsigned ret = 0; if (irq != -1) @@ -277,10 +314,12 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) return ret; } +#ifdef CONFIG_X86 static bool pirq_check_eoi_map(unsigned irq) { return test_bit(pirq_from_irq(irq), pirq_eoi_map); } +#endif static bool pirq_needs_eoi_flag(unsigned irq) { @@ -290,67 +329,28 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return sh->evtchn_pending[idx] & - per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]; -} - static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { - int irq = evtchn_to_irq[chn]; + int irq = get_evtchn_to_irq(chn); + struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); #ifdef CONFIG_SMP - cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu)); #endif + xen_evtchn_port_bind_to_cpu(info, cpu); - clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_from_irq(irq))); - set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); - - info_for_irq(irq)->cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ - int i; -#ifdef CONFIG_SMP - struct irq_info *info; - - /* By default all event channels notify CPU#0. */ - list_for_each_entry(info, &xen_irq_list_head, list) { - struct irq_desc *desc = irq_to_desc(info->irq); - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); - } -#endif - - for_each_possible_cpu(i) - memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? 
~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); + info->cpu = cpu; } -static inline void set_evtchn(int port) +static void xen_evtchn_mask_all(void) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} + unsigned int evtchn; -static inline int test_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, &s->evtchn_pending[0]); + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); } - /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -368,61 +368,12 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - int do_hypercall = 0, evtchn_pending = 0; - - BUG_ON(!irqs_disabled()); - - if (unlikely((cpu != cpu_from_evtchn(port)))) - do_hypercall = 1; - else - evtchn_pending = sync_test_bit(port, &s->evtchn_pending[0]); - - if (unlikely(evtchn_pending && xen_hvm_domain())) - do_hypercall = 1; - - /* Slow path (hypercall) if this is a non-local port or if this is - * an hvm domain and an event is pending (hvm domains don't have - * their own implementation of irq_enable). */ - if (do_hypercall) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (evtchn_pending && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - static void xen_irq_init(unsigned irq) { struct irq_info *info; #ifdef CONFIG_SMP - struct irq_desc *desc = irq_to_desc(irq); - /* By default all event channels notify CPU#0. */ - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0)); #endif info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -437,29 +388,22 @@ static void xen_irq_init(unsigned irq) list_add_tail(&info->list, &xen_irq_list_head); } -static int __must_check xen_allocate_irq_dynamic(void) +static int __must_check xen_allocate_irqs_dynamic(int nvec) { - int first = 0; - int irq; + int i, irq = irq_alloc_descs(-1, 0, nvec, -1); -#ifdef CONFIG_X86_IO_APIC - /* - * For an HVM guest or domain 0 which see "real" (emulated or - * actual respectively) GSIs we allocate dynamic IRQs - * e.g. those corresponding to event channels or MSIs - * etc. from the range above those "real" GSIs to avoid - * collisions. 
- */ - if (xen_initial_domain() || xen_hvm_domain()) - first = get_nr_irqs_gsi(); -#endif + if (irq >= 0) { + for (i = 0; i < nvec; i++) + xen_irq_init(irq + i); + } - irq = irq_alloc_desc_from(first, -1); + return irq; +} - if (irq >= 0) - xen_irq_init(irq); +static inline int __must_check xen_allocate_irq_dynamic(void) +{ - return irq; + return xen_allocate_irqs_dynamic(1); } static int __must_check xen_allocate_irq_gsi(unsigned gsi) @@ -490,6 +434,9 @@ static void xen_free_irq(unsigned irq) { struct irq_info *info = irq_get_handler_data(irq); + if (WARN_ON(!info)) + return; + list_del(&info->list); irq_set_handler_data(irq, NULL); @@ -505,6 +452,15 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } +static void xen_evtchn_close(unsigned int port) +{ + struct evtchn_close close; + + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); +} + static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -521,13 +477,6 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static bool probing_irq(int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return desc && desc->action == NULL; -} - static void eoi_pirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -569,16 +518,20 @@ static unsigned int __startup_pirq(unsigned int irq) BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - if (!probing_irq(irq)) - printk(KERN_INFO "Failed to obtain physical IRQ %d\n", - irq); + pr_warn("Failed to obtain physical IRQ %d\n", irq); return 0; } evtchn = bind_pirq.port; pirq_query_unmask(irq); - evtchn_to_irq[evtchn] = irq; + rc = set_evtchn_to_irq(evtchn, irq); + if (rc != 0) { + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", + irq, rc); + xen_evtchn_close(evtchn); + return 0; + } bind_evtchn_to_cpu(evtchn, 0); info->evtchn = evtchn; @@ -596,10 +549,9 @@ static unsigned int startup_pirq(struct irq_data *data) static void shutdown_pirq(struct irq_data *data) { - struct evtchn_close close; unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); + unsigned evtchn = evtchn_from_irq(irq); BUG_ON(info->type != IRQT_PIRQ); @@ -607,14 +559,8 @@ static void shutdown_pirq(struct irq_data *data) return; mask_evtchn(evtchn); - - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - info->evtchn = 0; + xen_evtchn_close(evtchn); + xen_irq_info_cleanup(info); } static void enable_pirq(struct irq_data *data) @@ -643,6 +589,41 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); +static void __unbind_from_irq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + struct irq_info *info = irq_get_handler_data(irq); + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + if (VALID_EVTCHN(evtchn)) { + unsigned int cpu = cpu_from_irq(irq); + + xen_evtchn_close(evtchn); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + break; + default: + break; + } + + xen_irq_info_cleanup(info); + } + + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + + xen_free_irq(irq); +} + /* * Do not make any assumptions regarding the relationship between the * IRQ number returned here and the Xen pirq 
argument. @@ -658,13 +639,14 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, { int irq = -1; struct physdev_irq irq_op; + int ret; mutex_lock(&irq_mapping_update_lock); irq = xen_irq_from_gsi(gsi); if (irq != -1) { - printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", - irq, gsi); + pr_info("%s: returning irq %d for gsi %u\n", + __func__, irq, gsi); goto out; } @@ -685,8 +667,13 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, DOMID_SELF, + ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } pirq_query_unmask(irq); /* We try to use the handler with the appropriate semantic for the @@ -733,21 +720,25 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, int vector, const char *name, - domid_t domid) + int pirq, int nvec, const char *name, domid_t domid) { - int irq, ret; + int i, irq, ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irq_dynamic(); + irq = xen_allocate_irqs_dynamic(nvec); if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, - name); + for (i = 0; i < nvec; i++) { + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + + ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + i == 0 ? 0 : PIRQ_MSI_GROUP); + if (ret < 0) + goto error_irq; + } - xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, domid, 0); ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -755,26 +746,27 @@ out: mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: + for (; i >= 0; i--) + __unbind_from_irq(irq + i); mutex_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); return ret; } #endif int xen_destroy_irq(int irq) { - struct irq_desc *desc; struct physdev_unmap_pirq unmap_irq; struct irq_info *info = info_for_irq(irq); int rc = -ENOENT; mutex_lock(&irq_mapping_update_lock); - desc = irq_to_desc(irq); - if (!desc) - goto out; - - if (xen_initial_domain()) { + /* + * If trying to remove a vector in a MSI group different + * than the first one skip the PIRQ unmap unless this vector + * is the first one in the group. + */ + if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { unmap_irq.pirq = info->u.pirq.pirq; unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); @@ -783,10 +775,10 @@ int xen_destroy_irq(int irq) * (free_domain_pirqs). 
*/ if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) - printk(KERN_INFO "domain %d does not have %d anymore\n", + pr_info("domain %d does not have %d anymore\n", info->u.pirq.domid, info->u.pirq.pirq); else if (rc) { - printk(KERN_WARNING "unmap irq failed %d\n", rc); + pr_warn("unmap irq failed %d\n", rc); goto out; } } @@ -826,28 +818,39 @@ int xen_pirq_from_irq(unsigned irq) return pirq_from_irq(irq); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + int bind_evtchn_to_irq(unsigned int evtchn) { int irq; + int ret; + + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) { irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, handle_edge_irq, "event"); - xen_irq_info_evtchn_init(irq, evtchn); + ret = xen_irq_info_evtchn_setup(irq, evtchn); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + /* New interdomain events are bound to VCPU 0. */ + bind_evtchn_to_cpu(evtchn, 0); } else { struct irq_info *info = info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_EVTCHN); } - irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); out: mutex_unlock(&irq_mapping_update_lock); @@ -860,6 +863,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; int evtchn, irq; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -879,8 +883,12 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - + ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { struct irq_info *info = info_for_irq(irq); @@ -913,7 +921,7 @@ static int find_virq(unsigned int virq, unsigned int cpu) int port, rc = -ENOENT; memset(&status, 0, sizeof(status)); - for (port = 0; port <= NR_EVENT_CHANNELS; port++) { + for (port = 0; port < xen_evtchn_max_channels(); port++) { status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -929,6 +937,19 @@ static int find_virq(unsigned int virq, unsigned int cpu) return rc; } +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. 
+ */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; @@ -940,7 +961,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) if (irq == -1) { irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_percpu_chip, @@ -959,7 +980,12 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) evtchn = ret; } - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { @@ -975,47 +1001,8 @@ out: static void unbind_from_irq(unsigned int irq) { - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_handler_data(irq); - mutex_lock(&irq_mapping_update_lock); - - if (info->refcnt > 0) { - info->refcnt--; - if (info->refcnt != 0) - goto done; - } - - if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; - break; - case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - } - - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - - xen_free_irq(irq); - - done: + __unbind_from_irq(irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1106,14 +1093,35 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, void unbind_from_irqhandler(unsigned int irq, void *dev_id) { + struct irq_info *info = irq_get_handler_data(irq); + + if (WARN_ON(!info)) + return; free_irq(irq, dev_id); unbind_from_irq(irq); } EXPORT_SYMBOL_GPL(unbind_from_irqhandler); +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. 
+ */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + int evtchn_make_refcounted(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); struct irq_info *info; if (irq == -1) @@ -1138,12 +1146,12 @@ int evtchn_get(unsigned int evtchn) struct irq_info *info; int err = -ENOENT; - if (evtchn >= NR_EVENT_CHANNELS) + if (evtchn >= xen_evtchn_max_channels()) return -EINVAL; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) goto done; @@ -1167,7 +1175,7 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); if (WARN_ON(irq == -1)) return; unbind_from_irq(irq); @@ -1176,205 +1184,36 @@ EXPORT_SYMBOL_GPL(evtchn_put); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) { - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - struct vcpu_info *v; - - spin_lock_irqsave(&debug_lock, flags); - - printk("\nvcpu %d\n ", cpu); - - for_each_online_cpu(i) { - int pending; - v = per_cpu(xen_vcpu, i); - pending = (get_irq_regs() && i == cpu) - ? xen_irqs_disabled(get_irq_regs()) - : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, - pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); - } - v = per_cpu(xen_vcpu, cpu); - - printk("\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*lx%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) - printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), - cpu_evtchn[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - unsigned long pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] - & cpu_evtchn[i]; - printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), - pending, i % 8 == 0 ? "\n " : " "); - } + int irq; - printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, sh->evtchn_pending)) { - int word_idx = i / BITS_PER_LONG; - printk(" %d: event %d -> irq %d%s%s%s\n", - cpu_from_evtchn(i), i, - evtchn_to_irq[i], - sync_test_bit(word_idx, &v->evtchn_pending_sel) - ? "" : " l2-clear", - !sync_test_bit(i, sh->evtchn_mask) - ? "" : " globally-masked", - sync_test_bit(i, cpu_evtchn) - ? 
"" : " locally-masked"); - } +#ifdef CONFIG_X86 + if (unlikely(vector == XEN_NMI_VECTOR)) { + int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL); + if (rc < 0) + printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc); + return; } - - spin_unlock_irqrestore(&debug_lock, flags); - - return IRQ_HANDLED; +#endif + irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); } static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); - -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~0UL) << i)) -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ static void __xen_evtchn_do_upcall(void) { - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; - int i; - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = get_cpu(); unsigned count; do { - unsigned long pending_words; - vcpu_info->evtchn_upcall_pending = 0; if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out; -#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ - /* Clear master flag /before/ clearing selector flag. */ - wmb(); -#endif - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; - - for (i = 0; pending_words != 0; i++) { - unsigned long pending_bits; - unsigned long words; - - words = MASK_LSBS(pending_words, word_idx); - - /* - * If we masked out all events, wrap to beginning. - */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = __ffs(words); - - pending_bits = active_evtchns(cpu, s, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - if (word_idx == start_word_idx) { - /* We scan the starting word in two parts */ - if (i == 0) - /* 1st time: start in the middle */ - bit_idx = start_bit_idx; - else - /* 2nd time: mask bits done already */ - bit_idx &= (1UL << start_bit_idx) - 1; - } - - do { - unsigned long bits; - int port, irq; - struct irq_desc *desc; - - bits = MASK_LSBS(pending_bits, bit_idx); - - /* If we masked out all events, move on. */ - if (bits == 0) - break; - - bit_idx = __ffs(bits); - - /* Process port. */ - port = (word_idx * BITS_PER_LONG) + bit_idx; - irq = evtchn_to_irq[port]; - - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - - bit_idx = (bit_idx + 1) % BITS_PER_LONG; - - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? word_idx : - (word_idx+1) % BITS_PER_LONG); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); - - /* Scan start_l1i twice; all others once. 
*/ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); - - word_idx = (word_idx + 1) % BITS_PER_LONG; - } + xen_evtchn_handle_events(cpu); BUG_ON(!irqs_disabled()); @@ -1391,10 +1230,11 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); + irq_enter(); #ifdef CONFIG_X86 exit_idle(); + inc_irq_stat(irq_hv_callback_count); #endif - irq_enter(); __xen_evtchn_do_upcall(); @@ -1413,6 +1253,9 @@ void rebind_evtchn_irq(int evtchn, int irq) { struct irq_info *info = info_for_irq(irq); + if (WARN_ON(!info)) + return; + /* Make sure the irq is masked, since the new event channel will also be masked. */ disable_irq(irq); @@ -1420,12 +1263,12 @@ void rebind_evtchn_irq(int evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(evtchn_to_irq[evtchn] != -1); + BUG_ON(get_evtchn_to_irq(evtchn) != -1); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - xen_irq_info_evtchn_init(irq, evtchn); + (void)xen_irq_info_evtchn_setup(irq, evtchn); mutex_unlock(&irq_mapping_update_lock); @@ -1441,6 +1284,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); + int masked; if (!VALID_EVTCHN(evtchn)) return -1; @@ -1457,6 +1301,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) bind_vcpu.vcpu = tcpu; /* + * Mask the event while changing the VCPU binding to prevent + * it being delivered on an unexpected VCPU. + */ + masked = test_and_set_mask(evtchn); + + /* * If this fails, it usually just indicates that we're dealing with a * virq or IPI channel, which don't actually need to be rebound. Ignore * it, but don't do the xenlinux-level rebind in that case. 
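Editorial aside (not part of the patch): the hunk above and the one below together make rebind_irq_to_cpu() hold the event channel masked across the EVTCHNOP_bind_vcpu hypercall, so a pending event cannot be delivered on an unexpected VCPU half-way through the move. A minimal sketch of the combined flow, reusing the helpers this series introduces (test_and_set_mask(), unmask_evtchn(), bind_evtchn_to_cpu()); the function name and the struct initializer below are illustrative, not code from the patch:

static int rebind_evtchn_to_vcpu(unsigned int evtchn, unsigned int tcpu)
{
	struct evtchn_bind_vcpu bind_vcpu = {
		.port = evtchn,		/* assumed field layout, as used in the hunks */
		.vcpu = tcpu,
	};
	int masked;

	/* Mask first; remember whether it was already masked. */
	masked = test_and_set_mask(evtchn);

	/* A failure usually just means a virq/IPI channel - nothing to rebind. */
	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
		bind_evtchn_to_cpu(evtchn, tcpu);

	/* Only drop the mask if we were the ones who set it. */
	if (!masked)
		unmask_evtchn(evtchn);

	return 0;
}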
@@ -1464,33 +1314,20 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); + if (!masked) + unmask_evtchn(evtchn); + return 0; } static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, bool force) { - unsigned tcpu = cpumask_first(dest); + unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); return rebind_irq_to_cpu(data->irq, tcpu); } -int resend_irq_on_evtchn(unsigned int irq) -{ - int masked, evtchn = evtchn_from_irq(irq); - struct shared_info *s = HYPERVISOR_shared_info; - - if (!VALID_EVTCHN(evtchn)) - return 1; - - masked = sync_test_and_set_bit(evtchn, s->evtchn_mask); - sync_set_bit(evtchn, s->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - - return 1; -} - static void enable_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -1525,21 +1362,18 @@ static void mask_ack_dynirq(struct irq_data *data) static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(data->irq); - struct shared_info *sh = HYPERVISOR_shared_info; - int ret = 0; + unsigned int evtchn = evtchn_from_irq(data->irq); + int masked; - if (VALID_EVTCHN(evtchn)) { - int masked; + if (!VALID_EVTCHN(evtchn)) + return 0; - masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask); - sync_set_bit(evtchn, sh->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - ret = 1; - } + masked = test_and_set_mask(evtchn); + set_evtchn(evtchn); + if (!masked) + unmask_evtchn(evtchn); - return ret; + return 1; } static void restore_pirqs(void) @@ -1568,8 +1402,8 @@ static void restore_pirqs(void) rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (rc) { - printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", - gsi, irq, pirq, rc); + pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", + gsi, irq, pirq, rc); xen_free_irq(irq); continue; } @@ -1600,7 +1434,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1624,7 +1458,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); + (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1686,7 +1520,12 @@ void xen_poll_irq(int irq) int xen_test_irq_shared(int irq) { struct irq_info *info = info_for_irq(irq); - struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq }; + struct physdev_irq_status_query irq_status; + + if (WARN_ON(!info)) + return -ENOENT; + + irq_status.irq = info->u.pirq.pirq; if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) return 0; @@ -1696,21 +1535,18 @@ EXPORT_SYMBOL_GPL(xen_test_irq_shared); void xen_irq_resume(void) { - unsigned int cpu, evtchn; + unsigned int cpu; struct irq_info *info; - init_evtchn_cpu_bindings(); - /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); + xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. 
*/ list_for_each_entry(info, &xen_irq_list_head, list) info->evtchn = 0; /* zap event-channel binding */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - evtchn_to_irq[evtchn] = -1; + clear_evtchn_to_irq_all(); for_each_possible_cpu(cpu) { restore_cpu_virqs(cpu); @@ -1783,46 +1619,58 @@ void xen_callback_vector(void) int rc; uint64_t callback_via; if (xen_have_vector_callback) { - callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); rc = xen_set_callback_via(callback_via); if (rc) { - printk(KERN_ERR "Request for Xen HVM callback vector" - " failed.\n"); + pr_err("Request for Xen HVM callback vector failed\n"); xen_have_vector_callback = 0; return; } - printk(KERN_INFO "Xen HVM callback vector for event delivery is " - "enabled\n"); + pr_info("Xen HVM callback vector for event delivery is enabled\n"); /* in the restore case the vector has already been allocated */ - if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) - alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); } } #else void xen_callback_vector(void) {} #endif +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + void __init xen_init_IRQ(void) { - int i; + int ret = -EINVAL; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), - GFP_KERNEL); - BUG_ON(!evtchn_to_irq); - for (i = 0; i < NR_EVENT_CHANNELS; i++) - evtchn_to_irq[i] = -1; + if (fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) + xen_evtchn_2l_init(); - init_evtchn_cpu_bindings(); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; #ifdef CONFIG_X86 - if (xen_hvm_domain()) { + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_callback_vector(); + + if (xen_hvm_domain()) { native_init_IRQ(); /* pci_xen_hvm_init must be called after native_init_IRQ so that * __acpi_register_gsi can point at the right function */ @@ -1831,13 +1679,10 @@ void __init xen_init_IRQ(void) int rc; struct physdev_pirq_eoi_gmfn eoi_gmfn; - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - pci_xen_initial_domain(); - pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ if (rc != 0) { free_page((unsigned long) pirq_eoi_map); pirq_eoi_map = NULL; diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 00000000000..84b4bfb8434 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,443 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned on non-x86 + * platforms. + */ +#if !defined(CONFIG_X86) && BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ + (((unsigned long)w & 0x4UL) ? 
(EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(unsigned port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ + unsigned port = info->evtchn; + unsigned new_array_pages; + int ret; + + new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) { + ret = -ENOMEM; + goto error; + } + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. */ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_mfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED, spinning if BUSY is set. 
+ */ +static void clear_masked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + w = sync_cmpxchg(word, old, new); + } while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + clear_masked(word); + if (evtchn_fifo_is_pending(port)) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w; + new = (w & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while ((w = sync_cmpxchg(word, old, new)) != old); + + return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ + int irq; + + irq = get_evtchn_to_irq(port); + if (irq != -1) + generic_handle_irq(irq); +} + +static void consume_one_event(unsigned cpu, + struct evtchn_fifo_control_block *control_block, + unsigned priority, unsigned long *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + unsigned port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. + */ + if (head == 0) + clear_bit(priority, ready); + + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) + handle_irq_for_port(port); + + q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ + struct evtchn_fifo_control_block *control_block; + unsigned long ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + struct evtchn_init_control init_control; + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + init_control.control_gfn = virt_to_mfn(control_block); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, + &init_control); + if (ret < 0) + BUG(); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. 
+ */ + event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, +}; + +static int evtchn_fifo_init_control_block(unsigned cpu) +{ + struct page *control_block = NULL; + struct evtchn_init_control init_control; + int ret = -ENOMEM; + + control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (control_block == NULL) + goto error; + + init_control.control_gfn = virt_to_mfn(page_address(control_block)); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = page_address(control_block); + + return 0; + + error: + __free_page(control_block); + return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_control_block, cpu)) + ret = evtchn_fifo_init_control_block(cpu); + break; + default: + break; + } + return ret < 0 ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { + .notifier_call = evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = get_cpu(); + int ret; + + ret = evtchn_fifo_init_control_block(cpu); + if (ret < 0) + goto out; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: + put_cpu(); + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 00000000000..50c2050a1e3 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,151 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later. See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. 
+ * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + int refcnt; + enum xen_irq_type type; /* type */ + unsigned irq; + unsigned int evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) + +struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + + int (*setup)(struct irq_info *info); + void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + + void (*clear_pending)(unsigned port); + void (*set_pending)(unsigned port); + bool (*is_pending)(unsigned port); + bool (*test_and_set_mask)(unsigned port); + void (*mask)(unsigned port); + void (*unmask)(unsigned port); + + void (*handle_events)(unsigned cpu); + void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. + */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(info); + return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, + unsigned cpu) +{ + evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ + evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ + return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ + return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ + return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index b1f60a0c0be..00f40f051d9 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -55,6 +57,7 @@ struct per_user_data { struct mutex bind_mutex; /* serialize bind/unbind operations */ + struct rb_root evtchns; /* Notification ring, accessed via /dev/xen/evtchn. */ #define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) @@ -62,6 +65,7 @@ struct per_user_data { evtchn_port_t *ring; unsigned int ring_cons, ring_prod, ring_overflow; struct mutex ring_cons_mutex; /* protect against concurrent readers */ + spinlock_t ring_prod_lock; /* product against concurrent interrupts */ /* Processes wait on this queue when ring is empty. 
*/ wait_queue_head_t evtchn_wait; @@ -69,54 +73,79 @@ struct per_user_data { const char *name; }; -/* - * Who's bound to each port? This is logically an array of struct - * per_user_data *, but we encode the current enabled-state in bit 0. - */ -static unsigned long *port_user; -static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ +struct user_evtchn { + struct rb_node node; + struct per_user_data *user; + unsigned port; + bool enabled; +}; -static inline struct per_user_data *get_port_user(unsigned port) +static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return (struct per_user_data *)(port_user[port] & ~1); -} + struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; -static inline void set_port_user(unsigned port, struct per_user_data *u) -{ - port_user[port] = (unsigned long)u; + while (*new) { + struct user_evtchn *this; + + this = container_of(*new, struct user_evtchn, node); + + parent = *new; + if (this->port < evtchn->port) + new = &((*new)->rb_left); + else if (this->port > evtchn->port) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&evtchn->node, parent, new); + rb_insert_color(&evtchn->node, &u->evtchns); + + return 0; } -static inline bool get_port_enabled(unsigned port) +static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return port_user[port] & 1; + rb_erase(&evtchn->node, &u->evtchns); + kfree(evtchn); } -static inline void set_port_enabled(unsigned port, bool enabled) +static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port) { - if (enabled) - port_user[port] |= 1; - else - port_user[port] &= ~1; + struct rb_node *node = u->evtchns.rb_node; + + while (node) { + struct user_evtchn *evtchn; + + evtchn = container_of(node, struct user_evtchn, node); + + if (evtchn->port < port) + node = node->rb_left; + else if (evtchn->port > port) + node = node->rb_right; + else + return evtchn; + } + return NULL; } static irqreturn_t evtchn_interrupt(int irq, void *data) { - unsigned int port = (unsigned long)data; - struct per_user_data *u; - - spin_lock(&port_user_lock); - - u = get_port_user(port); + struct user_evtchn *evtchn = data; + struct per_user_data *u = evtchn->user; - WARN(!get_port_enabled(port), + WARN(!evtchn->enabled, "Interrupt for port %d, but apparently not enabled; per-user %p\n", - port, u); + evtchn->port, u); disable_irq_nosync(irq); - set_port_enabled(port, false); + evtchn->enabled = false; + + spin_lock(&u->ring_prod_lock); if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port; wmb(); /* Ensure ring contents visible */ if (u->ring_cons == u->ring_prod++) { wake_up_interruptible(&u->evtchn_wait); @@ -126,7 +155,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) } else u->ring_overflow = 1; - spin_unlock(&port_user_lock); + spin_unlock(&u->ring_prod_lock); return IRQ_HANDLED; } @@ -227,20 +256,20 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, if (copy_from_user(kbuf, buf, count) != 0) goto out; - spin_lock_irq(&port_user_lock); + mutex_lock(&u->bind_mutex); for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { unsigned port = kbuf[i]; + struct user_evtchn *evtchn; - if (port < NR_EVENT_CHANNELS && - get_port_user(port) == u && - !get_port_enabled(port)) { - set_port_enabled(port, true); + evtchn = find_evtchn(u, port); + if (evtchn && 
!evtchn->enabled) { + evtchn->enabled = true; enable_irq(irq_from_evtchn(port)); } } - spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->bind_mutex); rc = count; @@ -251,6 +280,8 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, static int evtchn_bind_to_user(struct per_user_data *u, int port) { + struct user_evtchn *evtchn; + struct evtchn_close close; int rc = 0; /* @@ -261,25 +292,46 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) * interrupt handler yet, and our caller has already * serialized bind operations.) */ - BUG_ON(get_port_user(port) != NULL); - set_port_user(port, u); - set_port_enabled(port, true); /* start enabled */ - rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, - u->name, (void *)(unsigned long)port); - if (rc >= 0) - rc = evtchn_make_refcounted(port); + evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL); + if (!evtchn) + return -ENOMEM; + + evtchn->user = u; + evtchn->port = port; + evtchn->enabled = true; /* start enabled */ + rc = add_evtchn(u, evtchn); + if (rc < 0) + goto err; + + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, + u->name, evtchn); + if (rc < 0) + goto err; + + rc = evtchn_make_refcounted(port); + return rc; + +err: + /* bind failed, should close the port now */ + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + del_evtchn(u, evtchn); return rc; } -static void evtchn_unbind_from_user(struct per_user_data *u, int port) +static void evtchn_unbind_from_user(struct per_user_data *u, + struct user_evtchn *evtchn) { - int irq = irq_from_evtchn(port); + int irq = irq_from_evtchn(evtchn->port); + + BUG_ON(irq < 0); - unbind_from_irqhandler(irq, (void *)(unsigned long)port); + unbind_from_irqhandler(irq, evtchn); - set_port_user(port, NULL); + del_evtchn(u, evtchn); } static long evtchn_ioctl(struct file *file, @@ -358,45 +410,38 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_UNBIND: { struct ioctl_evtchn_unbind unbind; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(&unbind, uarg, sizeof(unbind))) break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_evtchn_nr_channels()) break; - spin_lock_irq(&port_user_lock); - rc = -ENOTCONN; - if (get_port_user(unbind.port) != u) { - spin_unlock_irq(&port_user_lock); + evtchn = find_evtchn(u, unbind.port); + if (!evtchn) break; - } disable_irq(irq_from_evtchn(unbind.port)); - - spin_unlock_irq(&port_user_lock); - - evtchn_unbind_from_user(u, unbind.port); - + evtchn_unbind_from_user(u, evtchn); rc = 0; break; } case IOCTL_EVTCHN_NOTIFY: { struct ioctl_evtchn_notify notify; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(¬ify, uarg, sizeof(notify))) break; - if (notify.port >= NR_EVENT_CHANNELS) { - rc = -EINVAL; - } else if (get_port_user(notify.port) != u) { - rc = -ENOTCONN; - } else { + rc = -ENOTCONN; + evtchn = find_evtchn(u, notify.port); + if (evtchn) { notify_remote_via_evtchn(notify.port); rc = 0; } @@ -406,9 +451,9 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_RESET: { /* Initialise the ring to empty. Clear errors. 
*/ mutex_lock(&u->ring_cons_mutex); - spin_lock_irq(&port_user_lock); + spin_lock_irq(&u->ring_prod_lock); u->ring_cons = u->ring_prod = u->ring_overflow = 0; - spin_unlock_irq(&port_user_lock); + spin_unlock_irq(&u->ring_prod_lock); mutex_unlock(&u->ring_cons_mutex); rc = 0; break; @@ -467,6 +512,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); + spin_lock_init(&u->ring_prod_lock); filp->private_data = u; @@ -475,29 +521,18 @@ static int evtchn_open(struct inode *inode, struct file *filp) static int evtchn_release(struct inode *inode, struct file *filp) { - int i; struct per_user_data *u = filp->private_data; + struct rb_node *node; - spin_lock_irq(&port_user_lock); + while ((node = u->evtchns.rb_node)) { + struct user_evtchn *evtchn; - free_page((unsigned long)u->ring); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - disable_irq(irq_from_evtchn(i)); - } - - spin_unlock_irq(&port_user_lock); - - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; - - evtchn_unbind_from_user(get_port_user(i), i); + evtchn = rb_entry(node, struct user_evtchn, node); + disable_irq(irq_from_evtchn(evtchn->port)); + evtchn_unbind_from_user(u, evtchn); } + free_page((unsigned long)u->ring); kfree(u->name); kfree(u); @@ -528,29 +563,20 @@ static int __init evtchn_init(void) if (!xen_domain()) return -ENODEV; - port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); - if (port_user == NULL) - return -ENOMEM; - - spin_lock_init(&port_user_lock); - - /* Create '/dev/misc/evtchn'. */ + /* Create '/dev/xen/evtchn'. */ err = misc_register(&evtchn_miscdev); if (err != 0) { - printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + pr_err("Could not register /dev/xen/evtchn\n"); return err; } - printk(KERN_INFO "Event-channel device installed.\n"); + pr_info("Event-channel device installed\n"); return 0; } static void __exit evtchn_cleanup(void) { - kfree(port_user); - port_user = NULL; - misc_deregister(&evtchn_miscdev); } diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c new file mode 100644 index 00000000000..b04fb64c5a9 --- /dev/null +++ b/drivers/xen/fallback.c @@ -0,0 +1,81 @@ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <asm/hypervisor.h> +#include <asm/xen/hypercall.h> + +int xen_event_channel_op_compat(int cmd, void *arg) +{ + struct evtchn_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + + switch (cmd) { + case EVTCHNOP_close: + case EVTCHNOP_send: + case EVTCHNOP_bind_vcpu: + case EVTCHNOP_unmask: + /* no output */ + break; + +#define COPY_BACK(eop) \ + case EVTCHNOP_##eop: \ + memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \ + break + + COPY_BACK(bind_interdomain); + COPY_BACK(bind_virq); + COPY_BACK(bind_pirq); + COPY_BACK(status); + COPY_BACK(alloc_unbound); + COPY_BACK(bind_ipi); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_event_channel_op_compat); + +int xen_physdev_op_compat(int cmd, void *arg) +{ + struct physdev_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + + switch (cmd) { + case PHYSDEVOP_IRQ_UNMASK_NOTIFY: + case PHYSDEVOP_set_iopl: + case PHYSDEVOP_set_iobitmap: + case PHYSDEVOP_apic_write: + /* no output */ + break; + +#define 
COPY_BACK(pop, fld) \ + case PHYSDEVOP_##pop: \ + memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \ + break + + COPY_BACK(irq_status_query, irq_status_query); + COPY_BACK(apic_read, apic_op); + COPY_BACK(ASSIGN_VECTOR, irq_op); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_physdev_op_compat); diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 4097987b330..787d1794541 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -48,6 +48,8 @@ * grant operation. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/atomic.h> #include <linux/module.h> #include <linux/miscdevice.h> @@ -507,7 +509,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) int rv, i; if (!(vma->vm_flags & VM_SHARED)) { - printk(KERN_ERR "%s: Mapping must be shared.\n", __func__); + pr_err("%s: Mapping must be shared\n", __func__); return -EINVAL; } @@ -584,7 +586,7 @@ static int __init gntalloc_init(void) err = misc_register(&gntalloc_miscdev); if (err != 0) { - printk(KERN_ERR "Could not register misc gntalloc device\n"); + pr_err("Could not register misc gntalloc device\n"); return err; } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 610bfc6be17..073b4a19a8b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -19,6 +19,8 @@ #undef DEBUG +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> @@ -56,10 +58,15 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " static atomic_t pages_mapped = ATOMIC_INIT(0); static int use_ptemod; +#define populate_freeable_maps use_ptemod struct gntdev_priv { + /* maps with visible offsets in the file descriptor */ struct list_head maps; - /* lock protects maps from concurrent changes */ + /* maps that are not visible; will be freed on munmap. 
+ * Only populated if populate_freeable_maps == 1 */ + struct list_head freeable_maps; + /* lock protects maps and freeable_maps */ spinlock_t lock; struct mm_struct *mm; struct mmu_notifier mn; @@ -105,6 +112,21 @@ static void gntdev_print_maps(struct gntdev_priv *priv, #endif } +static void gntdev_free_map(struct grant_map *map) +{ + if (map == NULL) + return; + + if (map->pages) + free_xenballooned_pages(map->count, map->pages); + kfree(map->pages); + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map->kmap_ops); + kfree(map); +} + static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) { struct grant_map *add; @@ -142,12 +164,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) return add; err: - kfree(add->pages); - kfree(add->grants); - kfree(add->map_ops); - kfree(add->unmap_ops); - kfree(add->kmap_ops); - kfree(add); + gntdev_free_map(add); return NULL; } @@ -183,7 +200,7 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, return NULL; } -static void gntdev_put_map(struct grant_map *map) +static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) { if (!map) return; @@ -198,17 +215,15 @@ static void gntdev_put_map(struct grant_map *map) evtchn_put(map->notify.event); } - if (map->pages) { - if (!use_ptemod) - unmap_grant_pages(map, 0, map->count); - - free_xenballooned_pages(map->count, map->pages); + if (populate_freeable_maps && priv) { + spin_lock(&priv->lock); + list_del(&map->next); + spin_unlock(&priv->lock); } - kfree(map->pages); - kfree(map->grants); - kfree(map->map_ops); - kfree(map->unmap_ops); - kfree(map); + + if (map->pages && !use_ptemod) + unmap_grant_pages(map, 0, map->count); + gntdev_free_map(map); } /* ------------------------------------------------------------------ */ @@ -257,19 +272,12 @@ static int map_grant_pages(struct grant_map *map) * with find_grant_ptes. 
*/ for (i = 0; i < map->count; i++) { - unsigned level; unsigned long address = (unsigned long) pfn_to_kaddr(page_to_pfn(map->pages[i])); - pte_t *ptep; - u64 pte_maddr = 0; BUG_ON(PageHighMem(map->pages[i])); - ptep = lookup_address(address, &level); - pte_maddr = arbitrary_virt_to_machine(ptep).maddr; - gnttab_set_map_op(&map->kmap_ops[i], pte_maddr, - map->flags | - GNTMAP_host_map | - GNTMAP_contains_pte, + gnttab_set_map_op(&map->kmap_ops[i], address, + map->flags | GNTMAP_host_map, map->grants[i].ref, map->grants[i].domid); } @@ -299,17 +307,10 @@ static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { int pgno = (map->notify.addr >> PAGE_SHIFT); - if (pgno >= offset && pgno < offset + pages && use_ptemod) { - void __user *tmp = (void __user *) - map->vma->vm_start + map->notify.addr; - err = copy_to_user(tmp, &err, 1); - if (err) - return -EFAULT; - map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; - } else if (pgno >= offset && pgno < offset + pages) { - uint8_t *tmp = kmap(map->pages[pgno]); + if (pgno >= offset && pgno < offset + pages) { + /* No need for kmap, pages are in lowmem */ + uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; - kunmap(map->pages[pgno]); map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; } } @@ -374,11 +375,24 @@ static void gntdev_vma_open(struct vm_area_struct *vma) static void gntdev_vma_close(struct vm_area_struct *vma) { struct grant_map *map = vma->vm_private_data; + struct file *file = vma->vm_file; + struct gntdev_priv *priv = file->private_data; pr_debug("gntdev_vma_close %p\n", vma); - map->vma = NULL; + if (use_ptemod) { + /* It is possible that an mmu notifier could be running + * concurrently, so take priv->lock to ensure that the vma won't + * vanishing during the unmap_grant_pages call, since we will + * spin here until that completes. Such a concurrent call will + * not do any unmapping, since that has been done prior to + * closing the vma, but it may still iterate the unmap_ops list. 
+ */ + spin_lock(&priv->lock); + map->vma = NULL; + spin_unlock(&priv->lock); + } vma->vm_private_data = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); } static struct vm_operations_struct gntdev_vmops = { @@ -388,33 +402,43 @@ static struct vm_operations_struct gntdev_vmops = { /* ------------------------------------------------------------------ */ +static void unmap_if_in_range(struct grant_map *map, + unsigned long start, unsigned long end) +{ + unsigned long mstart, mend; + int err; + + if (!map->vma) + return; + if (map->vma->vm_start >= end) + return; + if (map->vma->vm_end <= start) + return; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); +} + static void mn_invl_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct grant_map *map; - unsigned long mstart, mend; - int err; spin_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - if (map->vma->vm_start >= end) - continue; - if (map->vma->vm_end <= start) - continue; - mstart = max(start, map->vma->vm_start); - mend = min(end, map->vma->vm_end); - pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", - map->index, map->count, - map->vma->vm_start, map->vma->vm_end, - start, end, mstart, mend); - err = unmap_grant_pages(map, - (mstart - map->vma->vm_start) >> PAGE_SHIFT, - (mend - mstart) >> PAGE_SHIFT); - WARN_ON(err); + unmap_if_in_range(map, start, end); + } + list_for_each_entry(map, &priv->freeable_maps, next) { + unmap_if_in_range(map, start, end); } spin_unlock(&priv->lock); } @@ -443,6 +467,15 @@ static void mn_release(struct mmu_notifier *mn, err = unmap_grant_pages(map, /* offset */ 0, map->count); WARN_ON(err); } + list_for_each_entry(map, &priv->freeable_maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } spin_unlock(&priv->lock); } @@ -464,6 +497,7 @@ static int gntdev_open(struct inode *inode, struct file *flip) return -ENOMEM; INIT_LIST_HEAD(&priv->maps); + INIT_LIST_HEAD(&priv->freeable_maps); spin_lock_init(&priv->lock); if (use_ptemod) { @@ -498,8 +532,9 @@ static int gntdev_release(struct inode *inode, struct file *flip) while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); list_del(&map->next); - gntdev_put_map(map); + gntdev_put_map(NULL /* already removed */, map); } + WARN_ON(!list_empty(&priv->freeable_maps)); if (use_ptemod) mmu_notifier_unregister(&priv->mn, priv->mm); @@ -527,14 +562,14 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { pr_debug("can't map: over limit\n"); - gntdev_put_map(map); + gntdev_put_map(NULL, map); return err; } if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { - gntdev_put_map(map); - return err; + gntdev_put_map(NULL, map); + return -EFAULT; } spin_lock(&priv->lock); @@ -563,11 +598,13 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv 
*priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); + if (populate_freeable_maps) + list_add_tail(&map->next, &priv->freeable_maps); err = 0; } spin_unlock(&priv->lock); if (map) - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } @@ -577,25 +614,31 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, struct ioctl_gntdev_get_offset_for_vaddr op; struct vm_area_struct *vma; struct grant_map *map; + int rv = -EINVAL; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, op.vaddr); if (!vma || vma->vm_ops != &gntdev_vmops) - return -EINVAL; + goto out_unlock; map = vma->vm_private_data; if (!map) - return -EINVAL; + goto out_unlock; op.offset = map->index << PAGE_SHIFT; op.count = map->count; + rv = 0; - if (copy_to_user(u, &op, sizeof(op)) != 0) + out_unlock: + up_read(¤t->mm->mmap_sem); + + if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; - return 0; + return rv; } static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) @@ -712,7 +755,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (use_ptemod && map->vma) goto unlock_out; if (use_ptemod && priv->mm != vma->vm_mm) { - printk(KERN_WARNING "Huh? Other mm?\n"); + pr_warn("Huh? Other mm?\n"); goto unlock_out; } @@ -747,7 +790,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_end - vma->vm_start, find_grant_ptes, map); if (err) { - printk(KERN_WARNING "find_grant_ptes() failure.\n"); + pr_warn("find_grant_ptes() failure.\n"); goto out_put_map; } } @@ -776,7 +819,7 @@ out_unlock_put: out_put_map: if (use_ptemod) map->vma = NULL; - gntdev_put_map(map); + gntdev_put_map(priv, map); return err; } @@ -803,11 +846,11 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); err = misc_register(&gntdev_miscdev); if (err != 0) { - printk(KERN_ERR "Could not register gntdev device\n"); + pr_err("Could not register gntdev device\n"); return err; } return 0; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index b2b0a375b34..eeba7544f0c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/sched.h> #include <linux/mm.h> @@ -47,6 +49,7 @@ #include <xen/grant_table.h> #include <xen/interface/memory.h> #include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h> #include <asm/xen/hypercall.h> #include <asm/xen/interface.h> @@ -56,19 +59,13 @@ /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 #define GNTTAB_LIST_END 0xffffffff -#define GREFS_PER_GRANT_FRAME \ -(grant_table_version == 1 ? 
\ -(PAGE_SIZE / sizeof(struct grant_entry_v1)) : \ -(PAGE_SIZE / sizeof(union grant_entry_v2))) static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; static union { struct grant_entry_v1 *v1; @@ -84,7 +81,7 @@ struct gnttab_ops { * nr_gframes is the number of frames to map grant table. Returning * GNTST_okay means success and negative value means failure. */ - int (*map_frames)(unsigned long *frames, unsigned int nr_gframes); + int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes); /* * Release a list of frames which are mapped in map_frames for grant * entry status. @@ -154,6 +151,7 @@ static struct gnttab_ops *gnttab_interface; static grant_status_t *grstatus; static int grant_table_version; +static int grefs_per_grant_frame; static struct gnttab_free_callback *gnttab_free_callback_list; @@ -511,8 +509,7 @@ static void gnttab_handle_deferred(unsigned long unused) entry = NULL; } else { if (!--entry->warn_delay) - pr_info("g.e. %#x still pending\n", - entry->ref); + pr_info("g.e. %#x still pending\n", entry->ref); if (!first) first = entry; } @@ -732,9 +729,18 @@ void gnttab_request_free_callback(struct gnttab_free_callback *callback, void (*fn)(void *), void *arg, u16 count) { unsigned long flags; + struct gnttab_free_callback *cb; + spin_lock_irqsave(&gnttab_list_lock, flags); - if (callback->next) - goto out; + + /* Check if the callback is already on the list */ + cb = gnttab_free_callback_list; + while (cb) { + if (cb == callback) + goto out; + cb = cb->next; + } + callback->fn = fn; callback->arg = arg; callback->count = count; @@ -767,12 +773,14 @@ static int grow_gnttab_list(unsigned int more_frames) unsigned int new_nr_grant_frames, extra_entries, i; unsigned int nr_glist_frames, new_nr_glist_frames; + BUG_ON(grefs_per_grant_frame == 0); + new_nr_grant_frames = nr_grant_frames + more_frames; - extra_entries = more_frames * GREFS_PER_GRANT_FRAME; + extra_entries = more_frames * grefs_per_grant_frame; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; new_nr_glist_frames = - (new_nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); if (!gnttab_list[i]) @@ -780,12 +788,12 @@ static int grow_gnttab_list(unsigned int more_frames) } - for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames; - i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++) + for (i = grefs_per_grant_frame * nr_grant_frames; + i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) gnttab_entry(i) = i + 1; gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames; + gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; gnttab_free_count += extra_entries; nr_grant_frames = new_nr_grant_frames; @@ -817,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void) unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. 
*/ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); if (xen_max > boot_max_nr_grant_frames) return boot_max_nr_grant_frames; @@ -824,6 +837,51 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", + &addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + /* Handling of paged out grant targets (GNTST_eagain) */ #define MAX_DELAY 256 static inline void @@ -839,7 +897,7 @@ gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status, } while ((*status == GNTST_eagain) && (delay < MAX_DELAY)); if (delay >= MAX_DELAY) { - printk(KERN_ERR "%s: %s eagain grant\n", func, current->comm); + pr_err("%s: %s eagain grant\n", func, current->comm); *status = GNTST_bad_page; } } @@ -875,9 +933,6 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct page **pages, unsigned int count) { int i, ret; - bool lazy = false; - pte_t *pte; - unsigned long mfn; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) @@ -889,36 +944,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, &map_ops[i].status, __func__); - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - /* Do not add to override if the map failed. */ - if (map_ops[i].status) - continue; - - if (map_ops[i].flags & GNTMAP_contains_pte) { - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - } else { - mfn = PFN_DOWN(map_ops[i].dev_bus_addr); - } - ret = m2p_add_override(mfn, pages[i], kmap_ops ? 
- &kmap_ops[i] : NULL); - if (ret) - return ret; - } - - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_map_refs); @@ -926,41 +952,23 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { - int i, ret; - bool lazy = false; + int ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i], kmap_ops ? - &kmap_ops[i] : NULL); - if (ret) - return ret; - } - - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); static unsigned nr_status_frames(unsigned nr_grant_frames) { - return (nr_grant_frames * GREFS_PER_GRANT_FRAME + SPP - 1) / SPP; + BUG_ON(grefs_per_grant_frame == 0); + return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; } -static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes) +static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) { int rc; @@ -977,7 +985,7 @@ static void gnttab_unmap_frames_v1(void) arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); } -static int gnttab_map_frames_v2(unsigned long *frames, unsigned int nr_gframes) +static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) { uint64_t *sframes; unsigned int nr_sframes; @@ -1029,14 +1037,15 @@ static void gnttab_unmap_frames_v2(void) static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; - unsigned long *frames; + xen_pfn_t *frames; unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. 
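Editorial aside (not part of the patch): the gpfn programmed into XENMEM_add_to_physmap in the hunk below now comes from xen_auto_xlat_grant_frames, the table filled in by gnttab_setup_auto_xlat_frames() earlier in this patch. A rough sketch of how an auto-translated (HVM/PVH or ARM) setup path might prepare that table before gnttab_init() runs; the caller name and the source of the physical address are hypothetical:

/* Hypothetical early-setup helper: 'gnttab_base' is wherever the platform
 * says the grant-table frames should live in guest-physical address space. */
static int example_prepare_grant_frames(phys_addr_t gnttab_base)
{
	int rc;

	/* Remaps the region and records one PFN per possible grant frame;
	 * gnttab_map() below then hands those PFNs to XENMEM_add_to_physmap. */
	rc = gnttab_setup_auto_xlat_frames(gnttab_base);
	if (rc)
		pr_warn("cannot set up auto-translated grant frames: %d\n", rc);

	return rc;
}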
@@ -1045,11 +1054,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { - printk(KERN_WARNING - "grant table add_to_physmap failed, err=%d\n", rc); + pr_warn("grant table add_to_physmap failed, err=%d\n", + rc); break; } } while (i-- > start_idx); @@ -1108,13 +1117,12 @@ static void gnttab_request_version(void) int rc; struct gnttab_set_version gsv; - if (xen_hvm_domain()) - gsv.version = 1; - else - gsv.version = 2; + gsv.version = 1; + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); if (rc == 0 && gsv.version == 2) { grant_table_version = 2; + grefs_per_grant_frame = PAGE_SIZE / sizeof(union grant_entry_v2); gnttab_interface = &gnttab_v2_ops; } else if (grant_table_version == 2) { /* @@ -1127,42 +1135,41 @@ static void gnttab_request_version(void) panic("we need grant tables version 2, but only version 1 is available"); } else { grant_table_version = 1; + grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); gnttab_interface = &gnttab_v1_ops; } - printk(KERN_INFO "Grant tables using version %d layout.\n", - grant_table_version); + pr_info("Grant tables using version %d layout\n", grant_table_version); } -int gnttab_resume(void) +static int gnttab_setup(void) { unsigned int max_nr_gframes; - gnttab_request_version(); max_nr_gframes = gnttab_max_grant_frames(); if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_pv_domain()) - return gnttab_map(0, nr_grant_frames - 1); - - if (gnttab_shared.addr == NULL) { - gnttab_shared.addr = ioremap(xen_hvm_resume_frames, - PAGE_SIZE * max_nr_gframes); + if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - printk(KERN_WARNING - "Failed to ioremap gnttab share frames!"); + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); return -ENOMEM; } } + return gnttab_map(0, nr_grant_frames - 1); +} - gnttab_map(0, nr_grant_frames - 1); - - return 0; +int gnttab_resume(void) +{ + gnttab_request_version(); + return gnttab_setup(); } int gnttab_suspend(void) { - gnttab_interface->unmap_frames(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + gnttab_interface->unmap_frames(); return 0; } @@ -1171,9 +1178,10 @@ static int gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; + BUG_ON(grefs_per_grant_frame == 0); cur = nr_grant_frames; - extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / - GREFS_PER_GRANT_FRAME); + extra = ((req_entries + (grefs_per_grant_frame-1)) / + grefs_per_grant_frame); if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; @@ -1187,25 +1195,28 @@ static int gnttab_expand(unsigned int req_entries) int gnttab_init(void) { int i; + unsigned long max_nr_grant_frames; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; int ret; + gnttab_request_version(); + max_nr_grant_frames = gnttab_max_grant_frames(); nr_grant_frames = 1; - boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. 
*/ - max_nr_glist_frames = (boot_max_nr_grant_frames * - GREFS_PER_GRANT_FRAME / RPP); + BUG_ON(grefs_per_grant_frame == 0); + max_nr_glist_frames = (max_nr_grant_frames * + grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), GFP_KERNEL); if (gnttab_list == NULL) return -ENOMEM; - nr_glist_frames = (nr_grant_frames * GREFS_PER_GRANT_FRAME + RPP - 1) / RPP; + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; for (i = 0; i < nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); if (gnttab_list[i] == NULL) { @@ -1214,12 +1225,17 @@ int gnttab_init(void) } } - if (gnttab_resume() < 0) { + ret = arch_gnttab_init(max_nr_grant_frames, + nr_status_frames(max_nr_grant_frames)); + if (ret < 0) + goto ini_nomem; + + if (gnttab_setup() < 0) { ret = -ENODEV; goto ini_nomem; } - nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME; + nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) gnttab_entry(i) = i + 1; @@ -1239,7 +1255,7 @@ int gnttab_init(void) } EXPORT_SYMBOL_GPL(gnttab_init); -static int __devinit __gnttab_init(void) +static int __gnttab_init(void) { /* Delay grant-table initialization in the PV on HVM case */ if (xen_hvm_domain()) @@ -1250,5 +1266,6 @@ static int __devinit __gnttab_init(void) return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 412b96cc530..5f1e1f3cd18 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -1,6 +1,9 @@ /* * Handle extern requests for shutdown, reboot and sysrq */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/err.h> #include <linux/slab.h> @@ -38,30 +41,21 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; - unsigned long arg; /* extra hypercall argument */ - void (*pre)(void); - void (*post)(int cancelled); }; -static void xen_hvm_post_suspend(int cancelled) -{ - xen_arch_hvm_post_suspend(cancelled); - gnttab_resume(); -} +static RAW_NOTIFIER_HEAD(xen_resume_notifier); -static void xen_pre_suspend(void) +void xen_resume_notifier_register(struct notifier_block *nb) { - xen_mm_pin_all(); - gnttab_suspend(); - xen_arch_pre_suspend(); + raw_notifier_chain_register(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); -static void xen_post_suspend(int cancelled) +void xen_resume_notifier_unregister(struct notifier_block *nb) { - xen_arch_post_suspend(cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + raw_notifier_chain_unregister(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); #ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) @@ -73,27 +67,27 @@ static int xen_suspend(void *data) err = syscore_suspend(); if (err) { - printk(KERN_ERR "xen_suspend: system core suspend failed: %d\n", - err); + pr_err("%s: system core suspend failed: %d\n", __func__, err); return err; } - if (si->pre) - si->pre(); + gnttab_suspend(); + xen_arch_pre_suspend(); /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. 
*/ - si->cancelled = HYPERVISOR_suspend(si->arg); + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_mfn(xen_start_info) + : 0); - if (si->post) - si->post(si->cancelled); + xen_arch_post_suspend(si->cancelled); + gnttab_resume(); if (!si->cancelled) { xen_irq_resume(); - xen_console_resume(); xen_timer_resume(); } @@ -115,14 +109,14 @@ static void do_suspend(void) during suspend. */ err = freeze_processes(); if (err) { - printk(KERN_ERR "xen suspend: freeze failed %d\n", err); + pr_err("%s: freeze failed %d\n", __func__, err); goto out; } #endif err = dpm_suspend_start(PMSG_FREEZE); if (err) { - printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); + pr_err("%s: dpm_suspend_start %d\n", __func__, err); goto out_thaw; } @@ -131,29 +125,25 @@ static void do_suspend(void) err = dpm_suspend_end(PMSG_FREEZE); if (err) { - printk(KERN_ERR "dpm_suspend_end failed: %d\n", err); + pr_err("dpm_suspend_end failed: %d\n", err); si.cancelled = 0; goto out_resume; } si.cancelled = 1; - if (xen_hvm_domain()) { - si.arg = 0UL; - si.pre = NULL; - si.post = &xen_hvm_post_suspend; - } else { - si.arg = virt_to_mfn(xen_start_info); - si.pre = &xen_pre_suspend; - si.post = &xen_post_suspend; - } - err = stop_machine(xen_suspend, &si, cpumask_of(0)); + /* Resume console as early as possible. */ + if (!si.cancelled) + xen_console_resume(); + + raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { - printk(KERN_ERR "failed to start xen_suspend: %d\n", err); + pr_err("failed to start xen_suspend: %d\n", err); si.cancelled = 1; } @@ -166,9 +156,6 @@ out_resume: dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE); - /* Make sure timer events get retriggered on all CPUs */ - clock_was_set(); - out_thaw: #ifdef CONFIG_PREEMPT thaw_processes(); @@ -183,10 +170,32 @@ struct shutdown_handler { void (*cb)(void); }; +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ + switch (code) { + case SYS_DOWN: + case SYS_HALT: + case SYS_POWER_OFF: + shutting_down = SHUTDOWN_POWEROFF; + default: + break; + } + return NOTIFY_DONE; +} static void do_poweroff(void) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); + switch (system_state) { + case SYSTEM_BOOTING: + orderly_poweroff(true); + break; + case SYSTEM_RUNNING: + orderly_poweroff(false); + break; + default: + /* Don't do it when we are halting/rebooting. 
*/ + pr_info("Ignoring Xen toolstack shutdown.\n"); + break; + } } static void do_reboot(void) @@ -245,7 +254,7 @@ static void shutdown_handler(struct xenbus_watch *watch, if (handler->cb) { handler->cb(); } else { - printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + pr_info("Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; } @@ -265,8 +274,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, if (err) return; if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - printk(KERN_ERR "Unable to read sysrq code in " - "control/sysrq\n"); + pr_err("Unable to read sysrq code in control/sysrq\n"); xenbus_transaction_end(xbt, 1); return; } @@ -293,20 +301,25 @@ static struct xenbus_watch shutdown_watch = { .callback = shutdown_handler }; +static struct notifier_block xen_reboot_nb = { + .notifier_call = poweroff_nb, +}; + static int setup_shutdown_watcher(void) { int err; err = register_xenbus_watch(&shutdown_watch); if (err) { - printk(KERN_ERR "Failed to set shutdown watcher\n"); + pr_err("Failed to set shutdown watcher\n"); return err; } + #ifdef CONFIG_MAGIC_SYSRQ err = register_xenbus_watch(&sysrq_watch); if (err) { - printk(KERN_ERR "Failed to set sysrq watcher\n"); + pr_err("Failed to set sysrq watcher\n"); return err; } #endif @@ -331,6 +344,7 @@ int xen_setup_shutdown_event(void) if (!xen_domain()) return -ENODEV; register_xenstore_notifier(&xenstore_notifier); + register_reboot_notifier(&xen_reboot_nb); return 0; } diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c index 8feee08bcb4..6ab6a79c38a 100644 --- a/drivers/xen/mcelog.c +++ b/drivers/xen/mcelog.c @@ -32,6 +32,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen_mcelog: " fmt + #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> @@ -51,8 +53,6 @@ #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> -#define XEN_MCELOG "xen_mcelog: " - static struct mc_info g_mi; static struct mcinfo_logical_cpu *g_physinfo; static uint32_t ncpus; @@ -227,7 +227,7 @@ static int convert_log(struct mc_info *mi) mic = NULL; x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); if (unlikely(!mic)) { - pr_warning(XEN_MCELOG "Failed to find global error info\n"); + pr_warn("Failed to find global error info\n"); return -ENODEV; } @@ -241,8 +241,7 @@ static int convert_log(struct mc_info *mi) if (g_physinfo[i].mc_apicid == m.apicid) break; if (unlikely(i == ncpus)) { - pr_warning(XEN_MCELOG "Failed to match cpu with apicid %d\n", - m.apicid); + pr_warn("Failed to match cpu with apicid %d\n", m.apicid); return -ENODEV; } @@ -254,7 +253,7 @@ static int convert_log(struct mc_info *mi) mic = NULL; x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); if (unlikely(!mic)) { - pr_warning(XEN_MCELOG "Fail to find bank error info\n"); + pr_warn("Fail to find bank error info\n"); return -ENODEV; } @@ -295,9 +294,8 @@ static int mc_queue_handle(uint32_t flags) mc_op.u.mc_fetch.flags = flags; ret = HYPERVISOR_mca(&mc_op); if (ret) { - pr_err(XEN_MCELOG "Failed to fetch %s error log\n", - (flags == XEN_MC_URGENT) ? - "urgnet" : "nonurgent"); + pr_err("Failed to fetch %surgent error log\n", + flags == XEN_MC_URGENT ? 
"" : "non"); break; } @@ -307,15 +305,12 @@ static int mc_queue_handle(uint32_t flags) else { ret = convert_log(&g_mi); if (ret) - pr_warning(XEN_MCELOG - "Failed to convert this error log, " - "continue acking it anyway\n"); + pr_warn("Failed to convert this error log, continue acking it anyway\n"); mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK; ret = HYPERVISOR_mca(&mc_op); if (ret) { - pr_err(XEN_MCELOG - "Failed to ack previous error log\n"); + pr_err("Failed to ack previous error log\n"); break; } } @@ -334,15 +329,12 @@ static void xen_mce_work_fn(struct work_struct *work) /* urgent mc_info */ err = mc_queue_handle(XEN_MC_URGENT); if (err) - pr_err(XEN_MCELOG - "Failed to handle urgent mc_info queue, " - "continue handling nonurgent mc_info queue anyway.\n"); + pr_err("Failed to handle urgent mc_info queue, continue handling nonurgent mc_info queue anyway\n"); /* nonurgent mc_info */ err = mc_queue_handle(XEN_MC_NONURGENT); if (err) - pr_err(XEN_MCELOG - "Failed to handle nonurgent mc_info queue.\n"); + pr_err("Failed to handle nonurgent mc_info queue\n"); /* wake processes polling /dev/mcelog */ wake_up_interruptible(&xen_mce_chrdev_wait); @@ -370,7 +362,7 @@ static int bind_virq_for_mce(void) set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); ret = HYPERVISOR_mca(&mc_op); if (ret) { - pr_err(XEN_MCELOG "Failed to get CPU numbers\n"); + pr_err("Failed to get CPU numbers\n"); return ret; } @@ -383,7 +375,7 @@ static int bind_virq_for_mce(void) set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); ret = HYPERVISOR_mca(&mc_op); if (ret) { - pr_err(XEN_MCELOG "Failed to get CPU info\n"); + pr_err("Failed to get CPU info\n"); kfree(g_physinfo); return ret; } @@ -391,7 +383,7 @@ static int bind_virq_for_mce(void) ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, xen_mce_interrupt, 0, "mce", NULL); if (ret < 0) { - pr_err(XEN_MCELOG "Failed to bind virq\n"); + pr_err("Failed to bind virq\n"); kfree(g_physinfo); return ret; } diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 18fff88254e..dd9c249ea31 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -26,6 +26,9 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> +#endif static bool __read_mostly pci_seg_supported = true; @@ -58,12 +61,12 @@ static int xen_add_device(struct device *dev) add.flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI - handle = DEVICE_ACPI_HANDLE(&pci_dev->dev); + handle = ACPI_HANDLE(&pci_dev->dev); if (!handle && pci_dev->bus->bridge) - handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge); + handle = ACPI_HANDLE(pci_dev->bus->bridge); #ifdef CONFIG_PCI_IOV if (!handle && pci_dev->is_virtfn) - handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge); + handle = ACPI_HANDLE(physfn->bus->bridge); #endif if (handle) { acpi_status status; @@ -192,3 +195,49 @@ static int __init register_xen_pci_notifier(void) } arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (!xen_initial_domain()) + return 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return 0; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + /* Check whether they are in the right area. 
*/ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + struct physdev_pci_mmcfg_reserved r; + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + switch (rc) { + case 0: + case -ENOSYS: + continue; + + default: + pr_warn("Failed to report MMCONFIG reservation" + " state for %s to hypervisor" + " (%d)\n", + cfg->name, rc); + } + } + return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. + */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index 067fcfa1723..0aac403d53f 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) "xen_cpu: " fmt + #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/cpu.h> @@ -38,13 +40,13 @@ #include <linux/capability.h> #include <xen/xen.h> +#include <xen/acpi.h> #include <xen/xenbus.h> #include <xen/events.h> #include <xen/interface/platform.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> -#define XEN_PCPU "xen_cpu: " /* * @cpu_id: Xen physical cpu logic number @@ -242,8 +244,7 @@ static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) err = register_pcpu(pcpu); if (err) { - pr_warning(XEN_PCPU "Failed to register pcpu%u\n", - info->xen_cpuid); + pr_warn("Failed to register pcpu%u\n", info->xen_cpuid); return ERR_PTR(-ENOENT); } @@ -278,8 +279,7 @@ static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) * Only those at cpu present map has its sys interface. */ if (info->flags & XEN_PCPU_FLAGS_INVALID) { - if (pcpu) - unregister_and_remove_pcpu(pcpu); + unregister_and_remove_pcpu(pcpu); return 0; } @@ -333,6 +333,41 @@ static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +/* Sync with Xen hypervisor after cpu hotadded */ +void xen_pcpu_hotplug_sync(void) +{ + schedule_work(&xen_pcpu_work); +} +EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync); + +/* + * For hypervisor presented cpu, return logic cpu id; + * For hypervisor non-presented cpu, return -ENODEV. 
+ */ +int xen_pcpu_id(uint32_t acpi_id) +{ + int cpu_id = 0, max_id = 0; + struct xen_platform_op op; + + op.cmd = XENPF_get_cpuinfo; + while (cpu_id <= max_id) { + op.u.pcpu_info.xen_cpuid = cpu_id; + if (HYPERVISOR_dom0_op(&op)) { + cpu_id++; + continue; + } + + if (acpi_id == op.u.pcpu_info.acpi_id) + return cpu_id; + if (op.u.pcpu_info.max_present > max_id) + max_id = op.u.pcpu_info.max_present; + cpu_id++; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(xen_pcpu_id); + static int __init xen_pcpu_init(void) { int irq, ret; @@ -344,19 +379,19 @@ static int __init xen_pcpu_init(void) xen_pcpu_interrupt, 0, "xen-pcpu", NULL); if (irq < 0) { - pr_warning(XEN_PCPU "Failed to bind pcpu virq\n"); + pr_warn("Failed to bind pcpu virq\n"); return irq; } ret = subsys_system_register(&xen_pcpu_subsys, NULL); if (ret) { - pr_warning(XEN_PCPU "Failed to register pcpu subsys\n"); + pr_warn("Failed to register pcpu subsys\n"); goto err1; } ret = xen_sync_pcpus(); if (ret) { - pr_warning(XEN_PCPU "Failed to sync pcpu info\n"); + pr_warn("Failed to sync pcpu info\n"); goto err2; } diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 97ca359ae2b..3454973dc3b 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len) { unsigned long addr; @@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) static int xen_allocate_irq(struct pci_dev *pdev) { return request_irq(pdev->irq, do_hvm_evtchn_intr, - IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + IRQF_NOBALANCING | IRQF_TRIGGER_RISING, "xen-platform-pci", pdev); } @@ -101,13 +101,14 @@ static int platform_pci_resume(struct pci_dev *pdev) return 0; } -static int __devinit platform_pci_init(struct pci_dev *pdev, - const struct pci_device_id *ent) +static int platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) { int i, ret; long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; if (!xen_domain()) return -ENODEV; @@ -154,13 +155,17 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); - ret = gnttab_init(); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_setup_auto_xlat_frames(grant_frames); if (ret) goto out; + ret = gnttab_init(); + if (ret) + goto grant_out; xenbus_probe(NULL); return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); out: pci_release_region(pdev, 0); mem_out: @@ -170,7 +175,7 @@ pci_out: return ret; } -static struct pci_device_id platform_pci_tbl[] __devinitdata = { +static struct pci_device_id platform_pci_tbl[] = { {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 8adb9cc267f..569a13b9e85 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -6,6 +6,8 @@ * Copyright (c) 2002-2004, K A Fraser, B Dragovic */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> @@ -33,14 +35,18 @@ #include <xen/features.h> #include <xen/page.h> #include <xen/xen-ops.h> +#include <xen/balloon.h> #include "privcmd.h" MODULE_LICENSE("GPL"); -#ifndef 
HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); -#endif +#define PRIV_VMA_LOCKED ((void *)1) + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages); static long privcmd_ioctl_hypercall(void __user *udata) { @@ -178,7 +184,7 @@ static int mmap_mfn_range(void *data, void *state) msg->va & PAGE_MASK, msg->mfn, msg->npages, vma->vm_page_prot, - st->domain); + st->domain, NULL); if (rc < 0) return rc; @@ -196,8 +202,9 @@ static long privcmd_ioctl_mmap(void __user *udata) LIST_HEAD(pagelist); struct mmap_mfn_state state; - if (!xen_initial_domain()) - return -EPERM; + /* We only support privcmd_ioctl_mmap_batch for auto translated. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) return -EFAULT; @@ -219,9 +226,9 @@ static long privcmd_ioctl_mmap(void __user *udata) vma = find_vma(mm, msg->va); rc = -EINVAL; - if (!vma || (msg->va != vma->vm_start) || - !privcmd_enforce_singleshot_mapping(vma)) + if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) goto out_up; + vma->vm_private_data = PRIV_VMA_LOCKED; } state.va = vma->vm_start; @@ -246,6 +253,7 @@ struct mmap_batch_state { domid_t domain; unsigned long va; struct vm_area_struct *vma; + int index; /* A tristate: * 0 for no errors * 1 if at least one error has happened (and no @@ -253,24 +261,47 @@ struct mmap_batch_state { * -ENOENT if at least 1 -ENOENT has happened. */ int global_error; - /* An array for individual errors */ - int *err; + int version; /* User-space mfn array to store errors in the second pass for V1. */ xen_pfn_t __user *user_mfn; + /* User-space int array to store errors in the second pass for V2. */ + int __user *user_err; }; +/* auto translated dom0 note: if domU being created is PV, then mfn is + * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP). + */ static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + struct vm_area_struct *vma = st->vma; + struct page **pages = vma->vm_private_data; + struct page *cur_page = NULL; int ret; + if (xen_feature(XENFEAT_auto_translated_physmap)) + cur_page = pages[st->index++]; + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain); + st->vma->vm_page_prot, st->domain, + &cur_page); /* Store error code for second pass. */ - *(st->err++) = ret; + if (st->version == 1) { + if (ret < 0) { + /* + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + */ + *mfnp |= (ret == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + } + } else { /* st->version == 2 */ + *((int *) mfnp) = ret; + } /* And see if it affects the global_error. */ if (ret < 0) { @@ -287,20 +318,51 @@ static int mmap_batch_fn(void *data, void *state) return 0; } -static int mmap_return_errors_v1(void *data, void *state) +static int mmap_return_errors(void *data, void *state) { - xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; - int err = *(st->err++); - /* - * V1 encodes the error codes in the 32bit top nibble of the - * mfn (with its known limitations vis-a-vis 64 bit callers). - */ - *mfnp |= (err == -ENOENT) ? 
- PRIVCMD_MMAPBATCH_PAGED_ERROR : - PRIVCMD_MMAPBATCH_MFN_ERROR; - return __put_user(*mfnp, st->user_mfn++); + if (st->version == 1) { + xen_pfn_t mfnp = *((xen_pfn_t *) data); + if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR) + return __put_user(mfnp, st->user_mfn++); + else + st->user_mfn++; + } else { /* st->version == 2 */ + int err = *((int *) data); + if (err) + return __put_user(err, st->user_err++); + else + st->user_err++; + } + + return 0; +} + +/* Allocate pfns that are then mapped with gmfns from foreign domid. Update + * the vma with the page info to use later. + * Returns: 0 if success, otherwise -errno + */ +static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) +{ + int rc; + struct page **pages; + + pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + rc = alloc_xenballooned_pages(numpgs, pages, 0); + if (rc != 0) { + pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, + numpgs, rc); + kfree(pages); + return -ENOMEM; + } + BUG_ON(vma->vm_private_data != NULL); + vma->vm_private_data = pages; + + return 0; } static struct vm_operations_struct privcmd_vm_ops; @@ -313,12 +375,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) struct vm_area_struct *vma; unsigned long nr_pages; LIST_HEAD(pagelist); - int *err_array = NULL; struct mmap_batch_state state; - if (!xen_initial_domain()) - return -EPERM; - switch (version) { case 1: if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) @@ -352,30 +410,64 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) goto out; } - err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); - if (err_array == NULL) { - ret = -ENOMEM; - goto out; + if (version == 2) { + /* Zero error array now to only copy back actual errors. */ + if (clear_user(m.err, sizeof(int) * m.num)) { + ret = -EFAULT; + goto out; + } } down_write(&mm->mmap_sem); vma = find_vma(mm, m.addr); - ret = -EINVAL; if (!vma || - vma->vm_ops != &privcmd_vm_ops || - (m.addr != vma->vm_start) || - ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || - !privcmd_enforce_singleshot_mapping(vma)) { - up_write(&mm->mmap_sem); - goto out; + vma->vm_ops != &privcmd_vm_ops) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Caller must either: + * + * Map the whole VMA range, which will also allocate all the + * pages required for the auto_translated_physmap case. + * + * Or + * + * Map unmapped holes left from a previous map attempt (e.g., + * because those foreign frames were previously paged out). 
+ */ + if (vma->vm_private_data == NULL) { + if (m.addr != vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (xen_feature(XENFEAT_auto_translated_physmap)) { + ret = alloc_empty_pages(vma, m.num); + if (ret < 0) + goto out_unlock; + } else + vma->vm_private_data = PRIV_VMA_LOCKED; + } else { + if (m.addr < vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { + ret = -EINVAL; + goto out_unlock; + } } state.domain = m.dom; state.vma = vma; state.va = m.addr; + state.index = 0; state.global_error = 0; - state.err = err_array; + state.version = version; /* mmap_batch_fn guarantees ret == 0 */ BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), @@ -383,17 +475,14 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) up_write(&mm->mmap_sem); - if (state.global_error && (version == 1)) { + if (state.global_error) { /* Write back errors in second pass. */ state.user_mfn = (xen_pfn_t *)m.arr; - state.err = err_array; + state.user_err = m.err; ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_return_errors_v1, &state); - } else if (version == 2) { - ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); - if (ret) - ret = -EFAULT; - } + &pagelist, mmap_return_errors, &state); + } else + ret = 0; /* If we have not had any EFAULT-like global errors then set the global * error to -ENOENT if necessary. */ @@ -401,10 +490,12 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) ret = -ENOENT; out: - kfree(err_array); free_page_list(&pagelist); - return ret; + +out_unlock: + up_write(&mm->mmap_sem); + goto out; } static long privcmd_ioctl(struct file *file, @@ -438,6 +529,24 @@ static long privcmd_ioctl(struct file *file, return ret; } +static void privcmd_close(struct vm_area_struct *vma) +{ + struct page **pages = vma->vm_private_data; + int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rc; + + if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + return; + + rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); + if (rc == 0) + free_xenballooned_pages(numpgs, pages); + else + pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", + numpgs, rc); + kfree(pages); +} + static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", @@ -448,6 +557,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static struct vm_operations_struct privcmd_vm_ops = { + .close = privcmd_close, .fault = privcmd_fault }; @@ -463,9 +573,24 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +/* + * For MMAPBATCH*. This allows asserting the singleshot mapping + * on a per pfn/pte basis. Mapping calls that fail with ENOENT + * can be then retried until success. + */ +static int is_mapped_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + return pte_none(*pte) ? 
0 : -EBUSY; +} + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages) { - return (xchg(&vma->vm_private_data, (void *)1) == NULL); + return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, + is_mapped_fn, NULL) != 0; } const struct file_operations xen_privcmd_fops = { @@ -490,7 +615,7 @@ static int __init privcmd_init(void) err = misc_register(&privcmd_dev); if (err != 0) { - printk(KERN_ERR "Could not register Xen privcmd device\n"); + pr_err("Could not register Xen privcmd device\n"); return err; } return 0; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 58db6df866e..ebd8f218a78 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -33,6 +33,8 @@ * */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + #include <linux/bootmem.h> #include <linux/dma-mapping.h> #include <linux/export.h> @@ -40,12 +42,31 @@ #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h> /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, + gfp_t gfp) +{ + unsigned long dma_mask = 0; + + dma_mask = dev->coherent_dma_mask; + if (!dma_mask) + dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + + return dma_mask; +} +#endif + static char *xen_io_tlb_start, *xen_io_tlb_end; static unsigned long xen_io_tlb_nslabs; /* @@ -54,17 +75,35 @@ static unsigned long xen_io_tlb_nslabs; static u64 start_dma_addr; -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +/* + * Both of these functions should avoid PFN_PHYS because phys_addr_t + * can be 32bit when dma_addr_t is 64bit leading to a loss in + * information if the shift is done before casting to 64bit. 
+ */ +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - return phys_to_machine(XPADDR(paddr)).maddr; + unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr)); + dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT; + + dma |= paddr & ~PAGE_MASK; + + return dma; } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { - return machine_to_phys(XMADDR(baddr)).paddr; + unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr)); + dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; + phys_addr_t paddr = dma; + + BUG_ON(paddr != dma); /* truncation has occurred, should never happen */ + + paddr |= baddr & ~PAGE_MASK; + + return paddr; } -static dma_addr_t xen_virt_to_bus(void *address) +static inline dma_addr_t xen_virt_to_bus(void *address) { return xen_phys_to_bus(virt_to_phys(address)); } @@ -87,7 +126,7 @@ static int check_pages_physically_contiguous(unsigned long pfn, return 1; } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long pfn = PFN_DOWN(p); unsigned int offset = p & ~PAGE_MASK; @@ -124,6 +163,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { int i, rc; int dma_bits; + dma_addr_t dma_handle; + phys_addr_t p = virt_to_phys(buf); dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; @@ -133,9 +174,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) do { rc = xen_create_contiguous_region( - (unsigned long)buf + (i << IO_TLB_SHIFT), + p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), - dma_bits); + dma_bits, &dma_handle); } while (rc && dma_bits++ < max_dma_bits); if (rc) return rc; @@ -202,8 +243,8 @@ retry: order--; } if (order != get_order(bytes)) { - pr_warn("Warning: only able to allocate %ld MB " - "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", + (PAGE_SIZE << order) >> 20); xen_io_tlb_nslabs = SLABS_PER_PAGE << order; bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; } @@ -231,7 +272,9 @@ retry: } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); if (early) { - swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); + if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, + verbose)) + panic("Cannot allocate SWIOTLB buffer"); rc = 0; } else rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); @@ -240,11 +283,11 @@ error: if (repeat--) { xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ (xen_io_tlb_nslabs >> 1)); - printk(KERN_INFO "Xen-SWIOTLB: Lowering to %luMB\n", - (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); + pr_info("Lowering to %luMB\n", + (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); goto retry; } - pr_err("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); if (early) panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); else @@ -259,7 +302,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, void *ret; int order = get_order(size); u64 dma_mask = DMA_BIT_MASK(32); - unsigned long vstart; phys_addr_t phys; dma_addr_t dev_addr; @@ -274,8 +316,12 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) return ret; - vstart = __get_free_pages(flags, order); - ret = (void *)vstart; + /* On ARM this function returns an ioremap'ped virtual address for + * which virt_to_phys doesn't return the corresponding physical + * address. 
In fact on ARM virt_to_phys only works for kernel direct + * mapped RAM memory. Also see comment below. + */ + ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); if (!ret) return ret; @@ -283,18 +329,21 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (hwdev && hwdev->coherent_dma_mask) dma_mask = dma_alloc_coherent_mask(hwdev, flags); - phys = virt_to_phys(ret); + /* At this point dma_handle is the physical address, next we are + * going to set it to the machine address. + * Do not use virt_to_phys(ret) because on ARM it doesn't correspond + * to *dma_handle. */ + phys = *dma_handle; dev_addr = xen_phys_to_bus(phys); if (((dev_addr + size - 1 <= dma_mask)) && !range_straddles_page_boundary(phys, size)) *dma_handle = dev_addr; else { - if (xen_create_contiguous_region(vstart, order, - fls64(dma_mask)) != 0) { - free_pages(vstart, order); + if (xen_create_contiguous_region(phys, order, + fls64(dma_mask), dma_handle) != 0) { + xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); return NULL; } - *dma_handle = virt_to_machine(ret).maddr; } memset(ret, 0, size); return ret; @@ -315,13 +364,15 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (hwdev && hwdev->coherent_dma_mask) dma_mask = hwdev->coherent_dma_mask; - phys = virt_to_phys(vaddr); + /* do not use virt_to_phys because on ARM it doesn't return you the + * physical address */ + phys = xen_bus_to_phys(dev_addr); if (((dev_addr + size - 1 > dma_mask)) || range_straddles_page_boundary(phys, size)) - xen_destroy_contiguous_region((unsigned long)vaddr, order); + xen_destroy_contiguous_region(phys, order); - free_pages((unsigned long)vaddr, order); + xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -338,9 +389,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t phys = page_to_phys(page) + offset; + phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = xen_phys_to_bus(phys); - void *map; BUG_ON(dir == DMA_NONE); /* @@ -349,17 +399,26 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * buffering it. */ if (dma_capable(dev, dev_addr, size) && - !range_straddles_page_boundary(phys, size) && !swiotlb_force) + !range_straddles_page_boundary(phys, size) && !swiotlb_force) { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(dev, page, offset, size, dir, attrs); return dev_addr; + } /* * Oh well, have to allocate and map a bounce buffer. */ + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); - if (!map) + if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; - dev_addr = xen_virt_to_bus(map); + xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, size, dir, attrs); + dev_addr = xen_phys_to_bus(map); /* * Ensure that the address returned is DMA'ble @@ -381,15 +440,18 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); * whatever the device wrote there. 
*/ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { phys_addr_t paddr = xen_bus_to_phys(dev_addr); BUG_ON(dir == DMA_NONE); + xen_dma_unmap_page(hwdev, paddr, size, dir, attrs); + /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); return; } @@ -409,7 +471,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - xen_unmap_single(hwdev, dev_addr, size, dir); + xen_unmap_single(hwdev, dev_addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); @@ -432,12 +494,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); + if (target == SYNC_FOR_CPU) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); + /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, - target); - return; - } + if (is_xen_swiotlb_buffer(dev_addr)) + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + + if (target == SYNC_FOR_DEVICE) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); if (dir != DMA_FROM_DEVICE) return; @@ -494,22 +559,38 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { - void *map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, dir); - if (!map) { + phys_addr_t map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, + dir); + if (map == SWIOTLB_MAP_ERROR) { + dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. */ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); - sgl[0].dma_length = 0; - return DMA_ERROR_CODE; + sg_dma_len(sgl) = 0; + return 0; } - sg->dma_address = xen_virt_to_bus(map); - } else + xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, + sg->length, + dir, + attrs); + sg->dma_address = xen_phys_to_bus(map); + } else { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. 
*/ + xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), + paddr & ~PAGE_MASK, + sg->length, + dir, + attrs); sg->dma_address = dev_addr; - sg->dma_length = sg->length; + } + sg_dma_len(sg) = sg->length; } return nelems; } @@ -530,7 +611,7 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); @@ -552,7 +633,7 @@ xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, for_each_sg(sgl, sg, nelems, i) xen_swiotlb_sync_single(hwdev, sg->dma_address, - sg->dma_length, dir, target); + sg_dma_len(sg), dir, target); } void @@ -590,3 +671,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; } EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ + if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 5e5ad7e2885..96453f8a85c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kobject.h> +#include <linux/err.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -284,7 +285,8 @@ static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms); if (!ret) - ret = sprintf(buffer, "%lx\n", parms->virt_start); + ret = sprintf(buffer, "%"PRI_xen_ulong"\n", + parms->virt_start); kfree(parms); } diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 144564e5eb2..83b5c53bec6 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -5,16 +5,15 @@ * Author: Dan Magenheimer */ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/cleancache.h> - -/* temporary ifdef until include/linux/frontswap.h is upstream */ -#ifdef CONFIG_FRONTSWAP #include <linux/frontswap.h> -#endif #include <xen/xen.h> #include <xen/interface/xen.h> @@ -23,6 +22,36 @@ #include <asm/xen/hypervisor.h> #include <xen/tmem.h> +#ifndef CONFIG_XEN_TMEM_MODULE +bool __read_mostly tmem_enabled = false; + +static int __init enable_tmem(char *s) +{ + tmem_enabled = true; + return 1; +} +__setup("tmem", enable_tmem); +#endif + +#ifdef CONFIG_CLEANCACHE +static bool cleancache __read_mostly = true; +module_param(cleancache, bool, S_IRUGO); +static bool selfballooning __read_mostly = true; +module_param(selfballooning, bool, S_IRUGO); +#endif /* CONFIG_CLEANCACHE */ + +#ifdef CONFIG_FRONTSWAP +static bool frontswap __read_mostly = true; +module_param(frontswap, bool, S_IRUGO); +#else /* CONFIG_FRONTSWAP */ +#define frontswap (0) +#endif /* CONFIG_FRONTSWAP */ + +#ifdef CONFIG_XEN_SELFBALLOONING +static bool selfshrinking __read_mostly = true; +module_param(selfshrinking, bool, S_IRUGO); +#endif /* CONFIG_XEN_SELFBALLOONING */ + #define TMEM_CONTROL 0 #define TMEM_NEW_POOL 1 #define TMEM_DESTROY_POOL 2 @@ -128,14 +157,6 @@ static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) return 
xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); } -bool __read_mostly tmem_enabled = false; - -static int __init enable_tmem(char *s) -{ - tmem_enabled = true; - return 1; -} -__setup("tmem", enable_tmem); #ifdef CONFIG_CLEANCACHE static int xen_tmem_destroy_pool(u32 pool_id) @@ -227,16 +248,7 @@ static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); } -static bool __initdata use_cleancache = true; - -static int __init no_cleancache(char *s) -{ - use_cleancache = false; - return 1; -} -__setup("nocleancache", no_cleancache); - -static struct cleancache_ops __initdata tmem_cleancache_ops = { +static struct cleancache_ops tmem_cleancache_ops = { .put_page = tmem_cleancache_put_page, .get_page = tmem_cleancache_get_page, .invalidate_page = tmem_cleancache_flush_page, @@ -353,16 +365,7 @@ static void tmem_frontswap_init(unsigned ignored) xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); } -static bool __initdata use_frontswap = true; - -static int __init no_frontswap(char *s) -{ - use_frontswap = false; - return 1; -} -__setup("nofrontswap", no_frontswap); - -static struct frontswap_ops __initdata tmem_frontswap_ops = { +static struct frontswap_ops tmem_frontswap_ops = { .store = tmem_frontswap_store, .load = tmem_frontswap_load, .invalidate_page = tmem_frontswap_flush_page, @@ -371,36 +374,53 @@ static struct frontswap_ops __initdata tmem_frontswap_ops = { }; #endif -static int __init xen_tmem_init(void) +static int xen_tmem_init(void) { if (!xen_domain()) return 0; #ifdef CONFIG_FRONTSWAP - if (tmem_enabled && use_frontswap) { + if (tmem_enabled && frontswap) { char *s = ""; - struct frontswap_ops old_ops = - frontswap_register_ops(&tmem_frontswap_ops); + struct frontswap_ops *old_ops; tmem_frontswap_poolid = -1; - if (old_ops.init != NULL) + old_ops = frontswap_register_ops(&tmem_frontswap_ops); + if (IS_ERR(old_ops) || old_ops) { + if (IS_ERR(old_ops)) + return PTR_ERR(old_ops); s = " (WARNING: frontswap_ops overridden)"; - printk(KERN_INFO "frontswap enabled, RAM provided by " - "Xen Transcendent Memory\n"); + } + pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", + s); } #endif #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && use_cleancache) { + if (tmem_enabled && cleancache) { char *s = ""; - struct cleancache_ops old_ops = + struct cleancache_ops *old_ops = cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops.init_fs != NULL) + if (old_ops) s = " (WARNING: cleancache_ops overridden)"; - printk(KERN_INFO "cleancache enabled, RAM provided by " - "Xen Transcendent Memory%s\n", s); + pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", + s); + } +#endif +#ifdef CONFIG_XEN_SELFBALLOONING + /* + * There is no point of driving pages to the swap system if they + * aren't going anywhere in tmem universe. 
+ */ + if (!frontswap) { + selfshrinking = false; + selfballooning = false; } + xen_selfballoon_init(selfballooning, selfshrinking); #endif return 0; } module_init(xen_tmem_init) +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); +MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c new file mode 100644 index 00000000000..3e62ee4b3b6 --- /dev/null +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/acpi.h> +#include <linux/uaccess.h> +#include <acpi/processor.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_cpu_hotplug:" + +#define INSTALL_NOTIFY_HANDLER 0 +#define UNINSTALL_NOTIFY_HANDLER 1 + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr); + +/* -------------------------------------------------------------------------- + Driver Interface +-------------------------------------------------------------------------- */ + +static int xen_acpi_processor_enable(struct acpi_device *device) +{ + acpi_status status = 0; + unsigned long long value; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + struct acpi_processor *pr; + + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Cannot find driver data\n"); + return -EINVAL; + } + + if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { + /* Declared with "Processor" statement; match ProcessorID */ + status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor object\n"); + return -ENODEV; + } + + pr->acpi_id = object.processor.proc_id; + } else { + /* Declared with "Device" statement; match _UID */ + status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, + NULL, &value); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor _UID\n"); + return -ENODEV; + } + + pr->acpi_id = value; + } + + pr->id = xen_pcpu_id(pr->acpi_id); + + if ((int)pr->id < 0) + /* This cpu is not presented at hypervisor, try to hotadd it */ + if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) { + pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n", + pr->acpi_id); + return -ENODEV; + } + + return 0; +} + +static int xen_acpi_processor_add(struct acpi_device *device) +{ + int ret; + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (!pr) + return -ENOMEM; + + pr->handle = device->handle; + strcpy(acpi_device_name(device), 
ACPI_PROCESSOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); + device->driver_data = pr; + + ret = xen_acpi_processor_enable(device); + if (ret) + pr_err(PREFIX "Error when enabling Xen processor\n"); + + return ret; +} + +static int xen_acpi_processor_remove(struct acpi_device *device) +{ + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = acpi_driver_data(device); + if (!pr) + return -EINVAL; + + kfree(pr); + return 0; +} + +/*-------------------------------------------------------------- + Acpi processor hotplug support +--------------------------------------------------------------*/ + +static int is_processor_present(acpi_handle handle) +{ + acpi_status status; + unsigned long long sta = 0; + + + status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); + + if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) + return 1; + + /* + * _STA is mandatory for a processor that supports hot plug + */ + if (status == AE_NOT_FOUND) + pr_info(PREFIX "Processor does not support hot plug\n"); + else + pr_info(PREFIX "Processor Device is not present"); + return 0; +} + +static int xen_apic_id(acpi_handle handle) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_madt_local_apic *lapic; + int apic_id; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lapic)) { + kfree(buffer.pointer); + return -EINVAL; + } + + lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; + + if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || + !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { + kfree(buffer.pointer); + return -EINVAL; + } + + apic_id = (uint32_t)lapic->id; + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + return apic_id; +} + +static int xen_hotadd_cpu(struct acpi_processor *pr) +{ + int cpu_id, apic_id, pxm; + struct xen_platform_op op; + + apic_id = xen_apic_id(pr->handle); + if (apic_id < 0) { + pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n", + pr->acpi_id); + return -ENODEV; + } + + pxm = xen_acpi_get_pxm(pr->handle); + if (pxm < 0) { + pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n", + pr->acpi_id); + return pxm; + } + + op.cmd = XENPF_cpu_hotadd; + op.u.cpu_add.apic_id = apic_id; + op.u.cpu_add.acpi_id = pr->acpi_id; + op.u.cpu_add.pxm = pxm; + + cpu_id = HYPERVISOR_dom0_op(&op); + if (cpu_id < 0) + pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", + pr->acpi_id); + + return cpu_id; +} + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr) +{ + if (!is_processor_present(pr->handle)) + return AE_ERROR; + + pr->id = xen_hotadd_cpu(pr); + if ((int)pr->id < 0) + return AE_ERROR; + + /* + * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX + * interface after cpu hotadded. 
+ */ + xen_pcpu_hotplug_sync(); + + return AE_OK; +} + +static int acpi_processor_device_remove(struct acpi_device *device) +{ + pr_debug(PREFIX "Xen does not support CPU hotremove\n"); + + return -ENOSYS; +} + +static void acpi_processor_hotplug_notify(acpi_handle handle, + u32 event, void *data) +{ + struct acpi_processor *pr; + struct acpi_device *device = NULL; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + int result; + + acpi_scan_lock_acquire(); + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + case ACPI_NOTIFY_DEVICE_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Processor driver received %s event\n", + (event == ACPI_NOTIFY_BUS_CHECK) ? + "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); + + if (!is_processor_present(handle)) + break; + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + break; + + result = acpi_bus_scan(handle); + if (result) { + pr_err(PREFIX "Unable to add the device\n"); + break; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_err(PREFIX "Missing device object\n"); + break; + } + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "received ACPI_NOTIFY_EJECT_REQUEST\n")); + + if (acpi_bus_get_device(handle, &device)) { + pr_err(PREFIX "Device don't exist, dropping EJECT\n"); + break; + } + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Driver data is NULL, dropping EJECT\n"); + break; + } + + /* + * TBD: implement acpi_processor_device_remove if Xen support + * CPU hotremove in the future. + */ + acpi_processor_device_remove(device); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + + /* non-hotplug event; possibly handled by other handler */ + goto out; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + +out: + acpi_scan_lock_release(); +} + +static acpi_status is_processor_device(acpi_handle handle) +{ + struct acpi_device_info *info; + char *hid; + acpi_status status; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (info->type == ACPI_TYPE_PROCESSOR) { + kfree(info); + return AE_OK; /* found a processor object */ + } + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hid = info->hardware_id.string; + if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) { + kfree(info); + return AE_ERROR; + } + + kfree(info); + return AE_OK; /* found a processor device object */ +} + +static acpi_status +processor_walk_namespace_cb(acpi_handle handle, + u32 lvl, void *context, void **rv) +{ + acpi_status status; + int *action = context; + + status = is_processor_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* not a processor; continue to walk */ + + switch (*action) { + case INSTALL_NOTIFY_HANDLER: + acpi_install_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify, + NULL); + break; + case UNINSTALL_NOTIFY_HANDLER: + acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify); + break; + default: + break; + } + + /* found a processor; skip walking underneath */ + return AE_CTRL_DEPTH; +} + +static +void acpi_processor_install_hotplug_notify(void) +{ + int action = INSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static +void 
acpi_processor_uninstall_hotplug_notify(void) +{ + int action = UNINSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, processor_device_ids); + +static struct acpi_driver xen_acpi_processor_driver = { + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, + .ops = { + .add = xen_acpi_processor_add, + .remove = xen_acpi_processor_remove, + }, +}; + +static int __init xen_acpi_processor_init(void) +{ + int result = 0; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_processor_exit(); + + result = acpi_bus_register_driver(&xen_acpi_processor_driver); + if (result < 0) { + xen_stub_processor_init(); + return result; + } + + acpi_processor_install_hotplug_notify(); + return 0; +} + +static void __exit xen_acpi_processor_exit(void) +{ + if (!xen_initial_domain()) + return; + + acpi_processor_uninstall_hotplug_notify(); + + acpi_bus_unregister_driver(&xen_acpi_processor_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_processor_init(); + return; +} + +module_init(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); +ACPI_MODULE_NAME("xen-acpi-cpuhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug CPU Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c new file mode 100644 index 00000000000..34e40b733f9 --- /dev/null +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -0,0 +1,485 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_memory_hotplug:" + +struct acpi_memory_info { + struct list_head list; + u64 start_addr; /* Memory Range start physical addr */ + u64 length; /* Memory Range length */ + unsigned short caching; /* memory cache attribute */ + unsigned short write_protect; /* memory read/write attribute */ + /* copied from buffer getting from _CRS */ + unsigned int enabled:1; +}; + +struct acpi_memory_device { + struct acpi_device *device; + struct list_head res_list; +}; + +static bool acpi_hotmem_initialized __read_mostly; + +static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info) +{ + int rc; + struct xen_platform_op op; + + op.cmd = XENPF_mem_hotadd; + op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT; + op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT; + op.u.mem_add.pxm = pxm; + + rc = HYPERVISOR_dom0_op(&op); + if (rc) + pr_err(PREFIX "Xen Hotplug Memory Add failed on " + "0x%lx -> 0x%lx, _PXM: %d, error: %d\n", + (unsigned long)info->start_addr, + (unsigned long)(info->start_addr + info->length), + pxm, rc); + + return rc; +} + +static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device) +{ + int pxm, result; + int num_enabled = 0; + struct acpi_memory_info *info; + + if (!mem_device) + return -EINVAL; + + pxm = xen_acpi_get_pxm(mem_device->device->handle); + if (pxm < 0) + return pxm; + + list_for_each_entry(info, &mem_device->res_list, list) { + if (info->enabled) { /* just sanity check...*/ + num_enabled++; + continue; + } + + if (!info->length) + continue; + + result = xen_hotadd_memory(pxm, info); + if (result) + continue; + info->enabled = 1; + num_enabled++; + } + + if (!num_enabled) + return -ENODEV; + + return 0; +} + +static acpi_status +acpi_memory_get_resource(struct acpi_resource *resource, void *context) +{ + struct acpi_memory_device *mem_device = context; + struct acpi_resource_address64 address64; + struct acpi_memory_info *info, *new; + acpi_status status; + + status = acpi_resource_to_address64(resource, &address64); + if (ACPI_FAILURE(status) || + (address64.resource_type != ACPI_MEMORY_RANGE)) + return AE_OK; + + list_for_each_entry(info, &mem_device->res_list, list) { + if ((info->caching == address64.info.mem.caching) && + (info->write_protect == address64.info.mem.write_protect) && + (info->start_addr + info->length == address64.minimum)) { + info->length += address64.address_length; + return AE_OK; + } + } + + new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); + if (!new) + return AE_ERROR; + + INIT_LIST_HEAD(&new->list); + new->caching = address64.info.mem.caching; + new->write_protect = address64.info.mem.write_protect; + new->start_addr = address64.minimum; + new->length = address64.address_length; + list_add_tail(&new->list, &mem_device->res_list); + + return AE_OK; +} + +static int +acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) +{ + acpi_status status; + struct acpi_memory_info *info, *n; + + if (!list_empty(&mem_device->res_list)) + return 0; + + status = acpi_walk_resources(mem_device->device->handle, + METHOD_NAME__CRS, acpi_memory_get_resource, mem_device); + + if (ACPI_FAILURE(status)) { + list_for_each_entry_safe(info, n, &mem_device->res_list, list) + kfree(info); + 
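/* The _CRS walk failed: the ranges collected so far were freed above, so reset the list head before returning the error. */ +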
INIT_LIST_HEAD(&mem_device->res_list); + return -EINVAL; + } + + return 0; +} + +static int acpi_memory_get_device(acpi_handle handle, + struct acpi_memory_device **mem_device) +{ + struct acpi_device *device = NULL; + int result = 0; + + acpi_scan_lock_acquire(); + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + goto end; + + /* + * Now add the notified device. This creates the acpi_device + * and invokes .add function + */ + result = acpi_bus_scan(handle); + if (result) { + pr_warn(PREFIX "ACPI namespace scan failed\n"); + result = -EINVAL; + goto out; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_warn(PREFIX "Missing device object\n"); + result = -EINVAL; + goto out; + } + +end: + *mem_device = acpi_driver_data(device); + if (!(*mem_device)) { + pr_err(PREFIX "driver data not found\n"); + result = -ENODEV; + goto out; + } + +out: + acpi_scan_lock_release(); + return result; +} + +static int acpi_memory_check_device(struct acpi_memory_device *mem_device) +{ + unsigned long long current_status; + + /* Get device present/absent information from the _STA */ + if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle, + "_STA", NULL, ¤t_status))) + return -ENODEV; + /* + * Check for device status. Device should be + * present/enabled/functioning. + */ + if (!((current_status & ACPI_STA_DEVICE_PRESENT) + && (current_status & ACPI_STA_DEVICE_ENABLED) + && (current_status & ACPI_STA_DEVICE_FUNCTIONING))) + return -ENODEV; + + return 0; +} + +static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) +{ + pr_debug(PREFIX "Xen does not support memory hotremove\n"); + + return -ENOSYS; +} + +static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) +{ + struct acpi_memory_device *mem_device; + struct acpi_device *device; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived BUS CHECK notification for device\n")); + /* Fall Through */ + case ACPI_NOTIFY_DEVICE_CHECK: + if (event == ACPI_NOTIFY_DEVICE_CHECK) + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived DEVICE CHECK notification for device\n")); + + if (acpi_memory_get_device(handle, &mem_device)) { + pr_err(PREFIX "Cannot find driver data\n"); + break; + } + + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived EJECT REQUEST notification for device\n")); + + acpi_scan_lock_acquire(); + if (acpi_bus_get_device(handle, &device)) { + acpi_scan_lock_release(); + pr_err(PREFIX "Device doesn't exist\n"); + break; + } + mem_device = acpi_driver_data(device); + if (!mem_device) { + acpi_scan_lock_release(); + pr_err(PREFIX "Driver Data is NULL\n"); + break; + } + + /* + * TBD: implement acpi_memory_disable_device and invoke + * acpi_bus_remove if Xen support hotremove in the future + */ + acpi_memory_disable_device(mem_device); + acpi_scan_lock_release(); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + /* non-hotplug event; possibly handled by other handler */ + return; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + return; +} + +static int xen_acpi_memory_device_add(struct acpi_device *device) +{ + int result; + struct acpi_memory_device *mem_device = NULL; + + + if (!device) + return -EINVAL; + + mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL); 
+ if (!mem_device) + return -ENOMEM; + + INIT_LIST_HEAD(&mem_device->res_list); + mem_device->device = device; + sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); + sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); + device->driver_data = mem_device; + + /* Get the range from the _CRS */ + result = acpi_memory_get_device_resources(mem_device); + if (result) { + kfree(mem_device); + return result; + } + + /* + * For booting existed memory devices, early boot code has recognized + * memory area by EFI/E820. If DSDT shows these memory devices on boot, + * hotplug is not necessary for them. + * For hot-added memory devices during runtime, it need hypercall to + * Xen hypervisor to add memory. + */ + if (!acpi_hotmem_initialized) + return 0; + + if (!acpi_memory_check_device(mem_device)) + result = xen_acpi_memory_enable_device(mem_device); + + return result; +} + +static int xen_acpi_memory_device_remove(struct acpi_device *device) +{ + struct acpi_memory_device *mem_device = NULL; + + if (!device || !acpi_driver_data(device)) + return -EINVAL; + + mem_device = acpi_driver_data(device); + kfree(mem_device); + + return 0; +} + +/* + * Helper function to check for memory device + */ +static acpi_status is_memory_device(acpi_handle handle) +{ + char *hardware_id; + acpi_status status; + struct acpi_device_info *info; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hardware_id = info->hardware_id.string; + if ((hardware_id == NULL) || + (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) + status = AE_ERROR; + + kfree(info); + return status; +} + +static acpi_status +acpi_memory_register_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify, NULL); + /* continue */ + return AE_OK; +} + +static acpi_status +acpi_memory_deregister_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify); + + return AE_OK; /* continue */ +} + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, memory_device_ids); + +static struct acpi_driver xen_acpi_memory_device_driver = { + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, + .ops = { + .add = xen_acpi_memory_device_add, + .remove = xen_acpi_memory_device_remove, + }, +}; + +static int __init xen_acpi_memory_device_init(void) +{ + int result; + acpi_status status; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_memory_device_exit(); + + result = acpi_bus_register_driver(&xen_acpi_memory_device_driver); + if (result < 0) { + xen_stub_memory_device_init(); + return -ENODEV; + } + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_register_notify_handler, + NULL, NULL, NULL); + + if (ACPI_FAILURE(status)) { + pr_warn(PREFIX "walk_namespace failed\n"); + 
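/* Roll back: unregister this driver and re-install the stub so the native memory hotplug driver stays blocked. */ +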
acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + xen_stub_memory_device_init(); + return -ENODEV; + } + + acpi_hotmem_initialized = true; + return 0; +} + +static void __exit xen_acpi_memory_device_exit(void) +{ + acpi_status status; + + if (!xen_initial_domain()) + return; + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_deregister_notify_handler, + NULL, NULL, NULL); + if (ACPI_FAILURE(status)) + pr_warn(PREFIX "walk_namespace failed\n"); + + acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_memory_device_init(); + return; +} + +module_init(xen_acpi_memory_device_init); +module_exit(xen_acpi_memory_device_exit); +ACPI_MODULE_NAME("xen-acpi-memhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug Mem Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c new file mode 100644 index 00000000000..f83b754505f --- /dev/null +++ b/drivers/xen/xen-acpi-pad.c @@ -0,0 +1,170 @@ +/* + * xen-acpi-pad.c - Xen pad interface + * + * Copyright (c) 2012, Intel Corporation. + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/interface/version.h> +#include <xen/xen-ops.h> +#include <asm/xen/hypercall.h> + +#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" +#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" +#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 +static DEFINE_MUTEX(xen_cpu_lock); + +static int xen_acpi_pad_idle_cpus(unsigned int idle_nums) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_SET; + op.u.core_parking.idle_nums = idle_nums; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_acpi_pad_idle_cpus_num(void) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_GET; + + return HYPERVISOR_dom0_op(&op) + ?: op.u.core_parking.idle_nums; +} + +/* + * Query firmware how many CPUs should be idle + * return -1 on failure + */ +static int acpi_pad_pur(acpi_handle handle) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *package; + int num = -1; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer))) + return num; + + if (!buffer.length || !buffer.pointer) + return num; + + package = buffer.pointer; + + if (package->type == ACPI_TYPE_PACKAGE && + package->package.count == 2 && + package->package.elements[0].integer.value == 1) /* rev 1 */ + num = package->package.elements[1].integer.value; + + kfree(buffer.pointer); + return num; +} + +static void acpi_pad_handle_notify(acpi_handle handle) +{ + int idle_nums; + struct acpi_buffer param = { + .length = 4, + .pointer = (void *)&idle_nums, + }; + + + mutex_lock(&xen_cpu_lock); + idle_nums = 
acpi_pad_pur(handle); + if (idle_nums < 0) { + mutex_unlock(&xen_cpu_lock); + return; + } + + idle_nums = xen_acpi_pad_idle_cpus(idle_nums) + ?: xen_acpi_pad_idle_cpus_num(); + if (idle_nums >= 0) + acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, + 0, ¶m); + mutex_unlock(&xen_cpu_lock); +} + +static void acpi_pad_notify(acpi_handle handle, u32 event, + void *data) +{ + switch (event) { + case ACPI_PROCESSOR_AGGREGATOR_NOTIFY: + acpi_pad_handle_notify(handle); + break; + default: + pr_warn("Unsupported event [0x%x]\n", event); + break; + } +} + +static int acpi_pad_add(struct acpi_device *device) +{ + acpi_status status; + + strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS); + + status = acpi_install_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify, device); + if (ACPI_FAILURE(status)) + return -ENODEV; + + return 0; +} + +static int acpi_pad_remove(struct acpi_device *device) +{ + mutex_lock(&xen_cpu_lock); + xen_acpi_pad_idle_cpus(0); + mutex_unlock(&xen_cpu_lock); + + acpi_remove_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify); + return 0; +} + +static const struct acpi_device_id pad_device_ids[] = { + {"ACPI000C", 0}, + {"", 0}, +}; + +static struct acpi_driver acpi_pad_driver = { + .name = "processor_aggregator", + .class = ACPI_PROCESSOR_AGGREGATOR_CLASS, + .ids = pad_device_ids, + .ops = { + .add = acpi_pad_add, + .remove = acpi_pad_remove, + }, +}; + +static int __init xen_acpi_pad_init(void) +{ + /* Only DOM0 is responsible for Xen acpi pad */ + if (!xen_initial_domain()) + return -ENODEV; + + /* Only Xen4.2 or later support Xen acpi pad */ + if (!xen_running_on_version_or_later(4, 2)) + return -ENODEV; + + return acpi_bus_register_driver(&acpi_pad_driver); +} +subsys_initcall(xen_acpi_pad_init); diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 316df65163c..59fc190f1e9 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -17,6 +17,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/cpumask.h> #include <linux/cpufreq.h> #include <linux/freezer.h> @@ -25,16 +27,13 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> +#include <linux/acpi.h> #include <acpi/processor.h> - #include <xen/xen.h> +#include <xen/xen-ops.h> #include <xen/interface/platform.h> #include <asm/xen/hypercall.h> -#define DRV_NAME "xen-acpi-processor: " - static int no_hypercall; MODULE_PARM_DESC(off, "Inhibit the hypercall."); module_param_named(off, no_hypercall, int, 0400); @@ -51,9 +50,9 @@ static DEFINE_MUTEX(acpi_ids_mutex); /* Which ACPI ID we have processed from 'struct acpi_processor'. */ static unsigned long *acpi_ids_done; /* Which ACPI ID exist in the SSDT/DSDT processor definitions. 
*/ -static unsigned long __initdata *acpi_id_present; +static unsigned long *acpi_id_present; /* And if there is an _CST definition (or a PBLK) for the ACPI IDs */ -static unsigned long __initdata *acpi_id_cst_present; +static unsigned long *acpi_id_cst_present; static int push_cxx_to_hypervisor(struct acpi_processor *_pr) { @@ -103,7 +102,7 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) set_xen_guest_handle(dst_cx->dp, NULL); } if (!ok) { - pr_debug(DRV_NAME "No _Cx for ACPI CPU %u\n", _pr->acpi_id); + pr_debug("No _Cx for ACPI CPU %u\n", _pr->acpi_id); kfree(dst_cx_states); return -EINVAL; } @@ -128,11 +127,11 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) pr_debug(" C%d: %s %d uS\n", cx->type, cx->desc, (u32)cx->latency); } - } else if (ret != -EINVAL) + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) /* EINVAL means the ACPI ID is incorrect - meaning the ACPI * table is referencing a non-existing CPU - which can happen * with broken ACPI tables. */ - pr_err(DRV_NAME "(CX): Hypervisor error (%d) for ACPI CPU%u\n", + pr_err("(CX): Hypervisor error (%d) for ACPI CPU%u\n", ret, _pr->acpi_id); kfree(dst_cx_states); @@ -238,7 +237,7 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr) dst_perf->flags |= XEN_PX_PSD; if (dst_perf->flags != (XEN_PX_PSD | XEN_PX_PSS | XEN_PX_PCT | XEN_PX_PPC)) { - pr_warn(DRV_NAME "ACPI CPU%u missing some P-state data (%x), skipping.\n", + pr_warn("ACPI CPU%u missing some P-state data (%x), skipping\n", _pr->acpi_id, dst_perf->flags); ret = -ENODEV; goto err_free; @@ -260,12 +259,12 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr) (u32) perf->states[i].power, (u32) perf->states[i].transition_latency); } - } else if (ret != -EINVAL) + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) /* EINVAL means the ACPI ID is incorrect - meaning the ACPI * table is referencing a non-existing CPU - which can happen * with broken ACPI tables. */ - pr_warn(DRV_NAME "(_PXX): Hypervisor error (%d) for ACPI CPU%u\n", - ret, _pr->acpi_id); + pr_warn("(_PXX): Hypervisor error (%d) for ACPI CPU%u\n", + ret, _pr->acpi_id); err_free: if (!IS_ERR_OR_NULL(dst_states)) kfree(dst_states); @@ -317,7 +316,7 @@ static unsigned int __init get_max_acpi_id(void) max_acpi_id = max(info->acpi_id, max_acpi_id); } max_acpi_id *= 2; /* Slack for CPU hotplug support. */ - pr_debug(DRV_NAME "Max ACPI ID: %u\n", max_acpi_id); + pr_debug("Max ACPI ID: %u\n", max_acpi_id); return max_acpi_id; } /* @@ -329,7 +328,7 @@ static unsigned int __init get_max_acpi_id(void) * for_each_[present|online]_cpu macros which are banded to the virtual * CPU amount. */ -static acpi_status __init +static acpi_status read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) { u32 acpi_id; @@ -364,15 +363,14 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) /* There are more ACPI Processor objects than in x2APIC or MADT. * This can happen with incorrect ACPI SSDT declerations. 
*/ if (acpi_id > nr_acpi_bits) { - pr_debug(DRV_NAME "We only have %u, trying to set %u\n", + pr_debug("We only have %u, trying to set %u\n", nr_acpi_bits, acpi_id); return AE_OK; } /* OK, There is a ACPI Processor object */ __set_bit(acpi_id, acpi_id_present); - pr_debug(DRV_NAME "ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, - (unsigned long)pblk); + pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk); status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); if (ACPI_FAILURE(status)) { @@ -384,12 +382,16 @@ read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) return AE_OK; } -static int __init check_acpi_ids(struct acpi_processor *pr_backup) +static int check_acpi_ids(struct acpi_processor *pr_backup) { if (!pr_backup) return -ENODEV; + if (acpi_id_present && acpi_id_cst_present) + /* OK, done this once .. skip to uploading */ + goto upload; + /* All online CPUs have been processed at this stage. Now verify * whether in fact "online CPUs" == physical CPUs. */ @@ -408,6 +410,7 @@ static int __init check_acpi_ids(struct acpi_processor *pr_backup) read_acpi_id, NULL, NULL, NULL); acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL); +upload: if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) { unsigned int i; for_each_set_bit(i, acpi_id_present, nr_acpi_bits) { @@ -417,10 +420,7 @@ static int __init check_acpi_ids(struct acpi_processor *pr_backup) (void)upload_pm_data(pr_backup); } } - kfree(acpi_id_present); - acpi_id_present = NULL; - kfree(acpi_id_cst_present); - acpi_id_cst_present = NULL; + return 0; } static int __init check_prereq(void) @@ -467,10 +467,48 @@ static void free_acpi_perf_data(void) free_percpu(acpi_perf_data); } -static int __init xen_acpi_processor_init(void) +static int xen_upload_processor_pm_data(void) { struct acpi_processor *pr_backup = NULL; unsigned int i; + int rc = 0; + + pr_info("Uploading Xen processor PM info\n"); + + for_each_possible_cpu(i) { + struct acpi_processor *_pr; + _pr = per_cpu(processors, i /* APIC ID */); + if (!_pr) + continue; + + if (!pr_backup) { + pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (pr_backup) + memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); + } + (void)upload_pm_data(_pr); + } + + rc = check_acpi_ids(pr_backup); + kfree(pr_backup); + + return rc; +} + +static int xen_acpi_processor_resume(struct notifier_block *nb, + unsigned long action, void *data) +{ + bitmap_zero(acpi_ids_done, nr_acpi_bits); + return xen_upload_processor_pm_data(); +} + +struct notifier_block xen_acpi_processor_resume_nb = { + .notifier_call = xen_acpi_processor_resume, +}; + +static int __init xen_acpi_processor_init(void) +{ + unsigned int i; int rc = check_prereq(); if (rc) @@ -483,7 +521,7 @@ static int __init xen_acpi_processor_init(void) acpi_perf_data = alloc_percpu(struct acpi_processor_performance); if (!acpi_perf_data) { - pr_debug(DRV_NAME "Memory allocation error for acpi_perf_data.\n"); + pr_debug("Memory allocation error for acpi_perf_data\n"); kfree(acpi_ids_done); return -ENOMEM; } @@ -500,38 +538,26 @@ static int __init xen_acpi_processor_init(void) (void)acpi_processor_preregister_performance(acpi_perf_data); for_each_possible_cpu(i) { + struct acpi_processor *pr; struct acpi_processor_performance *perf; + pr = per_cpu(processors, i); perf = per_cpu_ptr(acpi_perf_data, i); - rc = acpi_processor_register_performance(perf, i); - if (rc) - goto err_out; - } - rc = acpi_processor_notify_smm(THIS_MODULE); - if (rc) - goto err_unregister; - - for_each_possible_cpu(i) { 
- struct acpi_processor *_pr; - _pr = per_cpu(processors, i /* APIC ID */); - if (!_pr) + if (!pr) continue; - if (!pr_backup) { - pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); - if (pr_backup) - memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); - } - (void)upload_pm_data(_pr); + pr->performance = perf; + rc = acpi_processor_get_performance_info(pr); + if (rc) + goto err_out; } - rc = check_acpi_ids(pr_backup); - - kfree(pr_backup); - pr_backup = NULL; + rc = xen_upload_processor_pm_data(); if (rc) goto err_unregister; + xen_resume_notifier_register(&xen_acpi_processor_resume_nb); + return 0; err_unregister: for_each_possible_cpu(i) { @@ -549,7 +575,10 @@ static void __exit xen_acpi_processor_exit(void) { int i; + xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); kfree(acpi_ids_done); + kfree(acpi_id_present); + kfree(acpi_id_cst_present); for_each_possible_cpu(i) { struct acpi_processor_performance *perf; perf = per_cpu_ptr(acpi_perf_data, i); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 8f37e23f6d1..e555845d61f 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/capability.h> @@ -81,7 +83,7 @@ static int balloon_init_watcher(struct notifier_block *notifier, err = register_xenbus_watch(&target_watch); if (err) - printk(KERN_ERR "Failed to set balloon watcher\n"); + pr_err("Failed to set balloon watcher\n"); return NOTIFY_DONE; } @@ -95,7 +97,7 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; - pr_info("xen-balloon: Initialising balloon driver.\n"); + pr_info("Initialising balloon driver\n"); register_balloon(&balloon_dev); diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index 3daf862d739..c5ee82587e8 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -4,6 +4,8 @@ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/pci.h> #include "pciback.h" @@ -75,10 +77,8 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) pci_name(dev)); err = pci_set_mwi(dev); if (err) { - printk(KERN_WARNING - DRV_NAME ": %s: cannot enable " - "memory-write-invalidate (%d)\n", - pci_name(dev), err); + pr_warn("%s: cannot enable memory-write-invalidate (%d)\n", + pci_name(dev), err); value &= ~PCI_COMMAND_INVALIDATE; } } @@ -91,7 +91,7 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -125,7 +125,7 @@ static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -153,7 +153,7 @@ static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) struct pci_bar_info *bar = data; if (unlikely(!bar)) { - printk(KERN_WARNING DRV_NAME ": driver data not found for %s\n", + pr_warn(DRV_NAME ": driver data not found for %s\n", 
pci_name(dev)); return XEN_PCI_ERR_op_failed; } @@ -375,7 +375,7 @@ int xen_pcibk_config_header_add_fields(struct pci_dev *dev) default: err = -EINVAL; - printk(KERN_ERR DRV_NAME ": %s: Unsupported header type %d!\n", + pr_err("%s: Unsupported header type %d!\n", pci_name(dev), dev->hdr_type); break; } diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 961d664e2d2..d57a173685f 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -4,6 +4,9 @@ * Ryan Wilson <hap9@epoch.ncsc.mil> * Chris Bookholt <hap10@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/rwsem.h> @@ -17,6 +20,7 @@ #include <xen/events.h> #include <asm/xen/pci.h> #include <asm/xen/hypervisor.h> +#include <xen/interface/physdev.h> #include "pciback.h" #include "conf_space.h" #include "conf_space_quirks.h" @@ -85,37 +89,52 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) static void pcistub_device_release(struct kref *kref) { struct pcistub_device *psdev; + struct pci_dev *dev; struct xen_pcibk_dev_data *dev_data; psdev = container_of(kref, struct pcistub_device, kref); - dev_data = pci_get_drvdata(psdev->dev); + dev = psdev->dev; + dev_data = pci_get_drvdata(dev); - dev_dbg(&psdev->dev->dev, "pcistub_device_release\n"); + dev_dbg(&dev->dev, "pcistub_device_release\n"); - xen_unregister_device_domain_owner(psdev->dev); + xen_unregister_device_domain_owner(dev); /* Call the reset function which does not take lock as this * is called from "unbind" which takes a device_lock mutex. */ - __pci_reset_function_locked(psdev->dev); - if (pci_load_and_free_saved_state(psdev->dev, - &dev_data->pci_saved_state)) { - dev_dbg(&psdev->dev->dev, "Could not reload PCI state\n"); - } else - pci_restore_state(psdev->dev); + __pci_reset_function_locked(dev); + if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) + dev_dbg(&dev->dev, "Could not reload PCI state\n"); + else + pci_restore_state(dev); + + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, + &ppdev); + + if (err) + dev_warn(&dev->dev, "MSI-X release failed (%d)\n", + err); + } /* Disable the device */ - xen_pcibk_reset_device(psdev->dev); + xen_pcibk_reset_device(dev); kfree(dev_data); - pci_set_drvdata(psdev->dev, NULL); + pci_set_drvdata(dev, NULL); /* Clean-up the device */ - xen_pcibk_config_free_dyn_fields(psdev->dev); - xen_pcibk_config_free_dev(psdev->dev); + xen_pcibk_config_free_dyn_fields(dev); + xen_pcibk_config_free_dev(dev); - psdev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; - pci_dev_put(psdev->dev); + dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + pci_dev_put(dev); kfree(psdev); } @@ -142,7 +161,8 @@ static struct pcistub_device *pcistub_device_find(int domain, int bus, if (psdev->dev != NULL && domain == pci_domain_nr(psdev->dev->bus) && bus == psdev->dev->bus->number - && PCI_DEVFN(slot, func) == psdev->dev->devfn) { + && slot == PCI_SLOT(psdev->dev->devfn) + && func == PCI_FUNC(psdev->dev->devfn)) { pcistub_device_get(psdev); goto out; } @@ -191,7 +211,8 @@ struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, if (psdev->dev != NULL && domain == pci_domain_nr(psdev->dev->bus) && bus == psdev->dev->bus->number - && PCI_DEVFN(slot, func) == psdev->dev->devfn) { + && slot == PCI_SLOT(psdev->dev->devfn) 
+ && func == PCI_FUNC(psdev->dev->devfn)) { found_dev = pcistub_device_get_pci_dev(pdev, psdev); break; } @@ -221,6 +242,15 @@ struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, return found_dev; } +/* + * Called when: + * - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device + * - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove + * - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove + * - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + * As such we have to be careful. + */ void pcistub_put_pci_dev(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -251,16 +281,16 @@ void pcistub_put_pci_dev(struct pci_dev *dev) * and want to inhibit the user from fiddling with 'reset' */ pci_reset_function(dev); - pci_restore_state(psdev->dev); + pci_restore_state(dev); /* This disables the device. */ - xen_pcibk_reset_device(found_psdev->dev); + xen_pcibk_reset_device(dev); /* And cleanup up our emulated fields. */ - xen_pcibk_config_free_dyn_fields(found_psdev->dev); - xen_pcibk_config_reset_dev(found_psdev->dev); + xen_pcibk_config_reset_dev(dev); + xen_pcibk_config_free_dyn_fields(dev); - xen_unregister_device_domain_owner(found_psdev->dev); + xen_unregister_device_domain_owner(dev); spin_lock_irqsave(&found_psdev->lock, flags); found_psdev->pdev = NULL; @@ -270,8 +300,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev) up_write(&pcistub_sem); } -static int __devinit pcistub_match_one(struct pci_dev *dev, - struct pcistub_device_id *pdev_id) +static int pcistub_match_one(struct pci_dev *dev, + struct pcistub_device_id *pdev_id) { /* Match the specified device by domain, bus, slot, func and also if * any of the device's parent bridges match. @@ -290,7 +320,7 @@ static int __devinit pcistub_match_one(struct pci_dev *dev, return 0; } -static int __devinit pcistub_match(struct pci_dev *dev) +static int pcistub_match(struct pci_dev *dev) { struct pcistub_device_id *pdev_id; unsigned long flags; @@ -308,7 +338,7 @@ static int __devinit pcistub_match(struct pci_dev *dev) return found; } -static int __devinit pcistub_init_device(struct pci_dev *dev) +static int pcistub_init_device(struct pci_dev *dev) { struct xen_pcibk_dev_data *dev_data; int err = 0; @@ -353,6 +383,19 @@ static int __devinit pcistub_init_device(struct pci_dev *dev) if (err) goto config_release; + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + + err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev); + if (err) + dev_err(&dev->dev, "MSI-X preparation failed (%d)\n", + err); + } + /* We need the device active to save the state. 
*/ dev_dbg(&dev->dev, "save state of device\n"); pci_save_state(dev); @@ -360,7 +403,7 @@ static int __devinit pcistub_init_device(struct pci_dev *dev) if (!dev_data->pci_saved_state) dev_err(&dev->dev, "Could not store PCI conf saved state!\n"); else { - dev_dbg(&dev->dev, "reseting (FLR, D3, etc) the device\n"); + dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n"); __pci_reset_function_locked(dev); pci_restore_state(dev); } @@ -394,8 +437,6 @@ static int __init pcistub_init_devices_late(void) unsigned long flags; int err = 0; - pr_debug(DRV_NAME ": pcistub_init_devices_late\n"); - spin_lock_irqsave(&pcistub_devices_lock, flags); while (!list_empty(&seized_devices)) { @@ -426,7 +467,7 @@ static int __init pcistub_init_devices_late(void) return 0; } -static int __devinit pcistub_seize(struct pci_dev *dev) +static int pcistub_seize(struct pci_dev *dev) { struct pcistub_device *psdev; unsigned long flags; @@ -461,8 +502,9 @@ static int __devinit pcistub_seize(struct pci_dev *dev) return err; } -static int __devinit pcistub_probe(struct pci_dev *dev, - const struct pci_device_id *id) +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ +static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) { int err = 0; @@ -489,6 +531,8 @@ out: return err; } +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ static void pcistub_remove(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -514,16 +558,14 @@ static void pcistub_remove(struct pci_dev *dev) found_psdev->pdev); if (found_psdev->pdev) { - printk(KERN_WARNING DRV_NAME ": ****** removing device " - "%s while still in-use! ******\n", + pr_warn("****** removing device %s while still in-use! ******\n", pci_name(found_psdev->dev)); - printk(KERN_WARNING DRV_NAME ": ****** driver domain may" - " still access this device's i/o resources!\n"); - printk(KERN_WARNING DRV_NAME ": ****** shutdown driver " - "domain before binding device\n"); - printk(KERN_WARNING DRV_NAME ": ****** to other drivers " - "or domains\n"); + pr_warn("****** driver domain may still access this device's i/o resources!\n"); + pr_warn("****** shutdown driver domain before binding device\n"); + pr_warn("****** to other drivers or domains\n"); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. 
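(pci_reset_function performs that reset and the device's saved config state is restored.)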
*/ xen_pcibk_release_pci_dev(found_psdev->pdev, found_psdev->dev); } @@ -897,42 +939,35 @@ static struct pci_driver xen_pcibk_pci_driver = { static inline int str_to_slot(const char *buf, int *domain, int *bus, int *slot, int *func) { - int err; - char wc = '*'; + int parsed = 0; - err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func); - switch (err) { + switch (sscanf(buf, " %x:%x:%x.%x %n", domain, bus, slot, func, + &parsed)) { case 3: *func = -1; - err = sscanf(buf, " %x:%x:%x.%c", domain, bus, slot, &wc); + sscanf(buf, " %x:%x:%x.* %n", domain, bus, slot, &parsed); break; case 2: *slot = *func = -1; - err = sscanf(buf, " %x:%x:*.%c", domain, bus, &wc); - if (err >= 2) - ++err; + sscanf(buf, " %x:%x:*.* %n", domain, bus, &parsed); break; } - if (err == 4 && wc == '*') + if (parsed && !buf[parsed]) return 0; - else if (err < 0) - return -EINVAL; /* try again without domain */ *domain = 0; - wc = '*'; - err = sscanf(buf, " %x:%x.%x", bus, slot, func); - switch (err) { + switch (sscanf(buf, " %x:%x.%x %n", bus, slot, func, &parsed)) { case 2: *func = -1; - err = sscanf(buf, " %x:%x.%c", bus, slot, &wc); + sscanf(buf, " %x:%x.* %n", bus, slot, &parsed); break; case 1: *slot = *func = -1; - err = sscanf(buf, " %x:*.%c", bus, &wc) + 1; + sscanf(buf, " %x:*.* %n", bus, &parsed); break; } - if (err == 3 && wc == '*') + if (parsed && !buf[parsed]) return 0; return -EINVAL; @@ -941,13 +976,20 @@ static inline int str_to_slot(const char *buf, int *domain, int *bus, static inline int str_to_quirk(const char *buf, int *domain, int *bus, int *slot, int *func, int *reg, int *size, int *mask) { - int err; + int parsed = 0; - err = - sscanf(buf, " %04x:%02x:%02x.%d-%08x:%1x:%08x", domain, bus, slot, - func, reg, size, mask); - if (err == 7) + sscanf(buf, " %x:%x:%x.%x-%x:%x:%x %n", domain, bus, slot, func, + reg, size, mask, &parsed); + if (parsed && !buf[parsed]) return 0; + + /* try again without domain */ + *domain = 0; + sscanf(buf, " %x:%x.%x-%x:%x:%x %n", bus, slot, func, reg, size, + mask, &parsed); + if (parsed && !buf[parsed]) + return 0; + return -EINVAL; } @@ -955,7 +997,7 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) { struct pcistub_device_id *pci_dev_id; unsigned long flags; - int rc = 0; + int rc = 0, devfn = PCI_DEVFN(slot, func); if (slot < 0) { for (slot = 0; !rc && slot < 32; ++slot) @@ -969,15 +1011,26 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) return rc; } + if (( +#if !defined(MODULE) /* pci_domains_supported is not being exported */ \ + || !defined(CONFIG_PCI_DOMAINS) + !pci_domains_supported ? 
domain : +#endif + domain < 0 || domain > 0xffff) + || bus < 0 || bus > 0xff + || PCI_SLOT(devfn) != slot + || PCI_FUNC(devfn) != func) + return -EINVAL; + pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); if (!pci_dev_id) return -ENOMEM; pci_dev_id->domain = domain; pci_dev_id->bus = bus; - pci_dev_id->devfn = PCI_DEVFN(slot, func); + pci_dev_id->devfn = devfn; - pr_debug(DRV_NAME ": wants to seize %04x:%02x:%02x.%d\n", + pr_debug("wants to seize %04x:%02x:%02x.%d\n", domain, bus, slot, func); spin_lock_irqsave(&device_ids_lock, flags); @@ -1007,8 +1060,8 @@ static int pcistub_device_id_remove(int domain, int bus, int slot, int func) err = 0; - pr_debug(DRV_NAME ": removed %04x:%02x:%02x.%d from " - "seize list\n", domain, bus, slot, func); + pr_debug("removed %04x:%02x:%02x.%d from seize list\n", + domain, bus, slot, func); } } spin_unlock_irqrestore(&device_ids_lock, flags); @@ -1016,14 +1069,18 @@ static int pcistub_device_id_remove(int domain, int bus, int slot, int func) return err; } -static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, - int size, int mask) +static int pcistub_reg_add(int domain, int bus, int slot, int func, + unsigned int reg, unsigned int size, + unsigned int mask) { int err = 0; struct pcistub_device *psdev; struct pci_dev *dev; struct config_field *field; + if (reg > 0xfff || (size < 4 && (mask >> (size * 8)))) + return -EINVAL; + psdev = pcistub_device_find(domain, bus, slot, func); if (!psdev) { err = -ENODEV; @@ -1151,19 +1208,23 @@ static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; int domain, bus, slot, func; - int err = -ENOENT; + int err; err = str_to_slot(buf, &domain, &bus, &slot, &func); if (err) return err; psdev = pcistub_device_find(domain, bus, slot, func); - if (!psdev) + if (!psdev) { + err = -ENOENT; goto out; + } dev_data = pci_get_drvdata(psdev->dev); - if (!dev_data) + if (!dev_data) { + err = -ENOENT; goto out; + } dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n", dev_data->irq_name, dev_data->isr_on, @@ -1254,13 +1315,11 @@ static ssize_t permissive_add(struct device_driver *drv, const char *buf, int err; struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; + err = str_to_slot(buf, &domain, &bus, &slot, &func); if (err) goto out; - if (slot < 0 || func < 0) { - err = -EINVAL; - goto out; - } + psdev = pcistub_device_find(domain, bus, slot, func); if (!psdev) { err = -ENODEV; @@ -1339,8 +1398,6 @@ static int __init pcistub_init(void) if (pci_devs_to_hide && *pci_devs_to_hide) { do { - char wc = '*'; - parsed = 0; err = sscanf(pci_devs_to_hide + pos, @@ -1349,51 +1406,48 @@ static int __init pcistub_init(void) switch (err) { case 3: func = -1; - err = sscanf(pci_devs_to_hide + pos, - " (%x:%x:%x.%c) %n", - &domain, &bus, &slot, &wc, - &parsed); + sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.*) %n", + &domain, &bus, &slot, &parsed); break; case 2: slot = func = -1; - err = sscanf(pci_devs_to_hide + pos, - " (%x:%x:*.%c) %n", - &domain, &bus, &wc, &parsed) + 1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x:*.*) %n", + &domain, &bus, &parsed); break; } - if (err != 4 || wc != '*') { + if (!parsed) { domain = 0; - wc = '*'; err = sscanf(pci_devs_to_hide + pos, " (%x:%x.%x) %n", &bus, &slot, &func, &parsed); switch (err) { case 2: func = -1; - err = sscanf(pci_devs_to_hide + pos, - " (%x:%x.%c) %n", - &bus, &slot, &wc, - &parsed); + sscanf(pci_devs_to_hide + pos, + " (%x:%x.*) %n", + &bus, &slot, &parsed); break; 
case 1: slot = func = -1; - err = sscanf(pci_devs_to_hide + pos, - " (%x:*.%c) %n", - &bus, &wc, &parsed) + 1; + sscanf(pci_devs_to_hide + pos, + " (%x:*.*) %n", + &bus, &parsed); break; } - if (err != 3 || wc != '*') - goto parse_error; } + if (parsed <= 0) + goto parse_error; + err = pcistub_device_id_add(domain, bus, slot, func); if (err) goto out; - /* if parsed<=0, we've reached the end of the string */ pos += parsed; - } while (parsed > 0 && pci_devs_to_hide[pos]); + } while (pci_devs_to_hide[pos]); } /* If we're the first PCI Device Driver to register, we're the @@ -1432,7 +1486,7 @@ out: return err; parse_error: - printk(KERN_ERR DRV_NAME ": Error parsing pci_devs_to_hide at \"%s\"\n", + pr_err("Error parsing pci_devs_to_hide at \"%s\"\n", pci_devs_to_hide + pos); return -EINVAL; } diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index a7def010eba..f72af87640e 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -124,7 +124,7 @@ static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *dev) { - if (xen_pcibk_backend && xen_pcibk_backend->free) + if (xen_pcibk_backend && xen_pcibk_backend->release) return xen_pcibk_backend->release(pdev, dev); } diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 97f5d264c31..c4a0666de6f 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -3,6 +3,9 @@ * * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/wait.h> #include <linux/bitops.h> @@ -113,7 +116,8 @@ void xen_pcibk_reset_device(struct pci_dev *dev) if (dev->msi_enabled) pci_disable_msi(dev); #endif - pci_disable_device(dev); + if (pci_is_enabled(dev)) + pci_disable_device(dev); pci_write_config_word(dev, PCI_COMMAND, 0); @@ -135,7 +139,6 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, struct pci_dev *dev, struct xen_pci_op *op) { struct xen_pcibk_dev_data *dev_data; - int otherend = pdev->xdev->otherend_id; int status; if (unlikely(verbose_request)) @@ -144,8 +147,9 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, status = pci_enable_msi(dev); if (status) { - printk(KERN_ERR "error enable msi for guest %x status %x\n", - otherend, status); + pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", + pci_name(dev), pdev->xdev->otherend_id, + status); op->value = 0; return XEN_PCI_ERR_op_failed; } @@ -209,12 +213,11 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, entries[i].vector = op->msix_entries[i].vector; } - result = pci_enable_msix(dev, entries, op->value); - + result = pci_enable_msix_exact(dev, entries, op->value); if (result == 0) { for (i = 0; i < op->value; i++) { op->msix_entries[i].entry = entries[i].entry; - if (entries[i].vector) + if (entries[i].vector) { op->msix_entries[i].vector = xen_pirq_from_irq(entries[i].vector); if (unlikely(verbose_request)) @@ -222,11 +225,12 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, "MSI-X[%d]: %d\n", pci_name(dev), i, op->msix_entries[i].vector); + } } - } else { - printk(KERN_WARNING DRV_NAME ": %s: failed to enable MSI-X: err %d!\n", - pci_name(dev), result); - } + } else + pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", + pci_name(dev), pdev->xdev->otherend_id, + result); kfree(entries); op->value = result; @@ 
-344,9 +348,9 @@ void xen_pcibk_do_op(struct work_struct *data) notify_remote_via_irq(pdev->evtchn_irq); /* Mark that we're done. */ - smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); - smp_mb__after_clear_bit(); /* /before/ final check for work */ + smp_mb__after_atomic(); /* /before/ final check for work */ /* Check to see if the driver domain tried to start another request in * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. @@ -371,7 +375,7 @@ static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) dev_data->handled++; if ((dev_data->handled % 1000) == 0) { if (xen_test_irq_shared(irq)) { - printk(KERN_INFO "%s IRQ line is not shared " + pr_info("%s IRQ line is not shared " "with other domains. Turning ISR off\n", dev_data->irq_name); dev_data->ack_intr = 0; diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index 46d140baebd..51afff96c51 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -5,6 +5,8 @@ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/list.h> #include <linux/slab.h> #include <linux/pci.h> @@ -89,15 +91,20 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, mutex_lock(&vpci_dev->lock); - /* Keep multi-function devices together on the virtual PCI bus */ - for (slot = 0; slot < PCI_SLOT_MAX; slot++) { - if (!list_empty(&vpci_dev->dev_list[slot])) { + /* + * Keep multi-function devices together on the virtual PCI bus, except + * virtual functions. + */ + if (!dev->is_virtfn) { + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) + continue; + t = list_entry(list_first(&vpci_dev->dev_list[slot]), struct pci_dev_entry, list); if (match_slot(dev, t->dev)) { - pr_info(DRV_NAME ": vpci: %s: " - "assign to virtual slot %d func %d\n", + pr_info("vpci: %s: assign to virtual slot %d func %d\n", pci_name(dev), slot, PCI_FUNC(dev->devfn)); list_add_tail(&dev_entry->list, @@ -111,12 +118,11 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, /* Assign to a new slot on the virtual PCI bus */ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { if (list_empty(&vpci_dev->dev_list[slot])) { - printk(KERN_INFO DRV_NAME - ": vpci: %s: assign to virtual slot %d\n", - pci_name(dev), slot); + pr_info("vpci: %s: assign to virtual slot %d\n", + pci_name(dev), slot); list_add_tail(&dev_entry->list, &vpci_dev->dev_list[slot]); - func = PCI_FUNC(dev->devfn); + func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn); goto unlock; } } @@ -131,6 +137,8 @@ unlock: /* Publish this device. */ if (!err) err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + else + kfree(dev_entry); out: return err; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 64b11f99eac..4a7e6e0a5f4 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -3,6 +3,9 @@ * * Author: Ryan Wilson <hap9@epoch.ncsc.mil> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/list.h> @@ -90,6 +93,8 @@ static void free_pdev(struct xen_pcibk_device *pdev) xen_pcibk_disconnect(pdev); + /* N.B. This calls pcistub_put_pci_dev which does the FLR on all + * of the PCIe devices. 
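(see pcistub_put_pci_dev: each device is reset and its saved PCI state restored before release.)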
*/ xen_pcibk_release_devices(pdev); dev_set_drvdata(&pdev->xdev->dev, NULL); @@ -283,6 +288,8 @@ static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); xen_unregister_device_domain_owner(dev); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ xen_pcibk_release_pci_dev(pdev, dev); out: @@ -723,14 +730,13 @@ int __init xen_pcibk_xenbus_register(void) { xen_pcibk_wq = create_workqueue("xen_pciback_workqueue"); if (!xen_pcibk_wq) { - printk(KERN_ERR "%s: create" - "xen_pciback_workqueue failed\n", __func__); + pr_err("%s: create xen_pciback_workqueue failed\n", __func__); return -EFAULT; } xen_pcibk_backend = &xen_pcibk_vpci_backend; if (passthrough) xen_pcibk_backend = &xen_pcibk_passthrough_backend; - pr_info(DRV_NAME ": backend is %s\n", xen_pcibk_backend->name); + pr_info("backend is %s\n", xen_pcibk_backend->name); return xenbus_register_backend(&xen_pcibk_driver); } diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 7d041cb6da2..3b2bffde534 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -53,20 +53,19 @@ * System configuration note: Selfballooning should not be enabled on * systems without a sufficiently large swap device configured; for best * results, it is recommended that total swap be increased by the size - * of the guest memory. Also, while technically not required to be - * configured, it is highly recommended that frontswap also be configured - * and enabled when selfballooning is running. So, selfballooning - * is disabled by default if frontswap is not configured and can only - * be enabled with the "selfballooning" kernel boot option; similarly - * selfballooning is enabled by default if frontswap is configured and - * can be disabled with the "noselfballooning" kernel boot option. Finally, - * when frontswap is configured, frontswap-selfshrinking can be disabled - * with the "noselfshrink" kernel boot option. + * of the guest memory. Note, that selfballooning should be disabled by default + * if frontswap is not configured. Similarly selfballooning should be enabled + * by default if frontswap is configured and can be disabled with the + * "tmem.selfballooning=0" kernel boot option. Finally, when frontswap is + * configured, frontswap-selfshrinking can be disabled with the + * "tmem.selfshrink=0" kernel boot option. * * Selfballooning is disallowed in domain0 and force-disabled. * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/bootmem.h> #include <linux/swap.h> @@ -120,9 +119,6 @@ static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); /* Enable/disable with sysfs. */ static bool frontswap_selfshrinking __read_mostly; -/* Enable/disable with kernel boot option. */ -static bool use_frontswap_selfshrink __initdata = true; - /* * The default values for the following parameters were deemed reasonable * by experimentation, may be workload-dependent, and can all be @@ -174,40 +170,13 @@ static void frontswap_selfshrink(void) tgt_frontswap_pages = cur_frontswap_pages - (cur_frontswap_pages / frontswap_hysteresis); frontswap_shrink(tgt_frontswap_pages); + frontswap_inertia_counter = frontswap_inertia; } -static int __init xen_nofrontswap_selfshrink_setup(char *s) -{ - use_frontswap_selfshrink = false; - return 1; -} - -__setup("noselfshrink", xen_nofrontswap_selfshrink_setup); - -/* Disable with kernel boot option. 
*/ -static bool use_selfballooning __initdata = true; - -static int __init xen_noselfballooning_setup(char *s) -{ - use_selfballooning = false; - return 1; -} - -__setup("noselfballooning", xen_noselfballooning_setup); -#else /* !CONFIG_FRONTSWAP */ -/* Enable with kernel boot option. */ -static bool use_selfballooning __initdata = false; - -static int __init xen_selfballooning_setup(char *s) -{ - use_selfballooning = true; - return 1; -} - -__setup("selfballooning", xen_selfballooning_setup); #endif /* CONFIG_FRONTSWAP */ #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) +#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT)) /* * Use current balloon size, the goal (vm_committed_as), and hysteresis @@ -222,7 +191,7 @@ static void selfballoon_process(struct work_struct *work) if (xen_selfballooning_enabled) { cur_pages = totalram_pages; tgt_pages = cur_pages; /* default is no change */ - goal_pages = percpu_counter_read_positive(&vm_committed_as) + + goal_pages = vm_memory_committed() + totalreserve_pages + MB2PAGES(selfballoon_reserved_mb); #ifdef CONFIG_FRONTSWAP @@ -298,8 +267,10 @@ static ssize_t store_selfballooning(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) return -EINVAL; xen_selfballooning_enabled = !!tmp; @@ -325,8 +296,10 @@ static ssize_t store_selfballoon_interval(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_interval = val; return count; @@ -347,8 +320,10 @@ static ssize_t store_selfballoon_downhys(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_downhysteresis = val; return count; @@ -370,8 +345,10 @@ static ssize_t store_selfballoon_uphys(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_uphysteresis = val; return count; @@ -393,8 +370,10 @@ static ssize_t store_selfballoon_min_usable_mb(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_min_usable_mb = val; return count; @@ -417,8 +396,10 @@ static ssize_t store_selfballoon_reserved_mb(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_reserved_mb = val; return count; @@ -443,8 +424,10 @@ static ssize_t store_frontswap_selfshrinking(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) return -EINVAL; frontswap_selfshrinking = !!tmp; if (!was_enabled && !xen_selfballooning_enabled && @@ -470,8 +453,10 @@ static ssize_t store_frontswap_inertia(struct device *dev, if 
(!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; frontswap_inertia = val; frontswap_inertia_counter = val; @@ -493,8 +478,10 @@ static ssize_t store_frontswap_hysteresis(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; frontswap_hysteresis = val; return count; @@ -537,41 +524,56 @@ int register_xen_selfballooning(struct device *dev) } EXPORT_SYMBOL(register_xen_selfballooning); -static int __init xen_selfballoon_init(void) +int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) { bool enable = false; + unsigned long reserve_pages; if (!xen_domain()) return -ENODEV; if (xen_initial_domain()) { - pr_info("xen/balloon: Xen selfballooning driver " - "disabled for domain0.\n"); + pr_info("Xen selfballooning driver disabled for domain0\n"); return -ENODEV; } xen_selfballooning_enabled = tmem_enabled && use_selfballooning; if (xen_selfballooning_enabled) { - pr_info("xen/balloon: Initializing Xen " - "selfballooning driver.\n"); + pr_info("Initializing Xen selfballooning driver\n"); enable = true; } #ifdef CONFIG_FRONTSWAP frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; if (frontswap_selfshrinking) { - pr_info("xen/balloon: Initializing frontswap " - "selfshrinking driver.\n"); + pr_info("Initializing frontswap selfshrinking driver\n"); enable = true; } #endif if (!enable) return -ENODEV; + /* + * Give selfballoon_reserved_mb a default value(10% of total ram pages) + * to make selfballoon not so aggressive. + * + * There are mainly two reasons: + * 1) The original goal_page didn't consider some pages used by kernel + * space, like slab pages and memory used by device drivers. + * + * 2) The balloon driver may not give back memory to guest OS fast + * enough when the workload suddenly aquries a lot of physical memory. + * + * In both cases, the guest OS will suffer from memory pressure and + * OOM killer may be triggered. + * By reserving extra 10% of total ram pages, we can keep the system + * much more reliably and response faster in some cases. + */ + if (!selfballoon_reserved_mb) { + reserve_pages = totalram_pages / 10; + selfballoon_reserved_mb = PAGES2MB(reserve_pages); + } schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); return 0; } - -subsys_initcall(xen_selfballoon_init); - -MODULE_LICENSE("GPL"); +EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c new file mode 100644 index 00000000000..bbef194c5b0 --- /dev/null +++ b/drivers/xen/xen-stub.c @@ -0,0 +1,100 @@ +/* + * xen-stub.c - stub drivers to reserve space for Xen + * + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * Copyright (C) 2012 Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> + +#ifdef CONFIG_ACPI + +/*-------------------------------------------- + stub driver for Xen memory hotplug +--------------------------------------------*/ + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_memory_device_driver = { + /* same name as native memory driver to block native loaded */ + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, +}; + +int xen_stub_memory_device_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_init); +subsys_initcall(xen_stub_memory_device_init); + +void xen_stub_memory_device_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit); + + +/*-------------------------------------------- + stub driver for Xen cpu hotplug +--------------------------------------------*/ + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_processor_driver = { + /* same name as native processor driver to block native loaded */ + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, +}; + +int xen_stub_processor_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_init); +subsys_initcall(xen_stub_processor_init); + +void xen_stub_processor_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_exit); + +#endif diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index bcf3ba4a6ec..439c9dca9ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -30,6 +30,7 @@ * IN THE SOFTWARE. */ +#include <linux/mm.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> @@ -44,6 +45,7 @@ #include <xen/grant_table.h> #include <xen/xenbus.h> #include <xen/xen.h> +#include <xen/features.h> #include "xenbus_probe.h" @@ -399,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); /** - * Bind to an existing interdomain event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore. 
- */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ - struct evtchn_bind_interdomain bind_interdomain; - int err; - - bind_interdomain.remote_dom = dev->otherend_id; - bind_interdomain.remote_port = remote_port; - - err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, - &bind_interdomain); - if (err) - xenbus_dev_fatal(dev, err, - "binding to event channel %d from domain %d", - remote_port, dev->otherend_id); - else - *port = bind_interdomain.local_port; - - return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/** * Free an existing event channel. Returns 0 on success or -errno on error. */ int xenbus_free_evtchn(struct xenbus_device *dev, int port) @@ -533,7 +508,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr); if (err) - goto out_err; + goto out_err_free_ballooned_pages; spin_lock(&xenbus_valloc_lock); list_add(&node->next, &xenbus_valloc_pages); @@ -542,8 +517,9 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, *vaddr = addr; return 0; - out_err: + out_err_free_ballooned_pages: free_xenballooned_pages(1, &node->page); + out_err: kfree(node); return err; } @@ -741,7 +717,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { - if (xen_pv_domain()) + if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else ring_ops = &ring_ops_hvm; diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index c5aa55c5d37..fdb0f339d0a 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/sched.h> @@ -205,13 +207,12 @@ int xb_init_comms(void) struct xenstore_domain_interface *intf = xen_store_interface; if (intf->req_prod != intf->req_cons) - printk(KERN_ERR "XENBUS request ring is not quiescent " - "(%08x:%08x)!\n", intf->req_cons, intf->req_prod); + pr_err("request ring is not quiescent (%08x:%08x)!\n", + intf->req_cons, intf->req_prod); if (intf->rsp_prod != intf->rsp_cons) { - printk(KERN_WARNING "XENBUS response ring is not quiescent " - "(%08x:%08x): fixing up\n", - intf->rsp_cons, intf->rsp_prod); + pr_warn("response ring is not quiescent (%08x:%08x): fixing up\n", + intf->rsp_cons, intf->rsp_prod); /* breaks kdump */ if (!reset_devices) intf->rsp_cons = intf->rsp_prod; @@ -225,7 +226,7 @@ int xb_init_comms(void) err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, 0, "xenbus", &xb_waitq); if (err < 0) { - printk(KERN_ERR "XENBUS request irq failed %i\n", err); + pr_err("request irq failed %i\n", err); return err; } diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h index c8abd3b8a6c..e74f9c1fbd8 100644 --- a/drivers/xen/xenbus/xenbus_comms.h +++ b/drivers/xen/xenbus/xenbus_comms.h @@ -45,6 +45,7 @@ int xb_wait_for_data_to_read(void); int xs_input_avail(void); extern struct xenstore_domain_interface *xen_store_interface; extern int xen_store_evtchn; +extern enum xenstore_init xen_store_domain_type; extern const struct file_operations xen_xenbus_fops; diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index d7300080076..b17707ee07d 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + 
#include <linux/slab.h> #include <linux/types.h> #include <linux/mm.h> @@ -70,22 +72,21 @@ static long xenbus_alloc(domid_t domid) return err; } -static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, unsigned long data) +static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, + unsigned long data) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { - case IOCTL_XENBUS_BACKEND_EVTCHN: - if (xen_store_evtchn > 0) - return xen_store_evtchn; - return -ENODEV; - - case IOCTL_XENBUS_BACKEND_SETUP: - return xenbus_alloc(data); - - default: - return -ENOTTY; + case IOCTL_XENBUS_BACKEND_EVTCHN: + if (xen_store_evtchn > 0) + return xen_store_evtchn; + return -ENODEV; + case IOCTL_XENBUS_BACKEND_SETUP: + return xenbus_alloc(data); + default: + return -ENOTTY; } } @@ -128,7 +129,7 @@ static int __init xenbus_backend_init(void) err = misc_register(&xenbus_backend_dev); if (err) - printk(KERN_ERR "Could not register xenbus backend device\n"); + pr_err("Could not register xenbus backend device\n"); return err; } diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 89f76252a16..85534ea6355 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -35,6 +35,8 @@ * Turned xenfs into a loadable module. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/uio.h> @@ -458,7 +460,7 @@ static ssize_t xenbus_file_write(struct file *filp, goto out; /* Can't write a xenbus message larger we can buffer */ - if ((len + u->len) > sizeof(u->u.buffer)) { + if (len > sizeof(u->u.buffer) - u->len) { /* On error, dump existing buffer */ u->len = 0; rc = -EINVAL; @@ -616,7 +618,7 @@ static int __init xenbus_init(void) err = misc_register(&xenbus_dev); if (err) - printk(KERN_ERR "Could not register xenbus frontend device\n"); + pr_err("Could not register xenbus frontend device\n"); return err; } diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 038b71dbf03..3c0a74b3e9b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -30,6 +30,8 @@ * IN THE SOFTWARE. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #define DPRINTK(fmt, args...) 
\ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ __func__, __LINE__, ##args) @@ -69,6 +71,9 @@ EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; EXPORT_SYMBOL_GPL(xen_store_interface); +enum xenstore_init xen_store_domain_type; +EXPORT_SYMBOL_GPL(xen_store_domain_type); + static unsigned long xen_store_mfn; static BLOCKING_NOTIFIER_HEAD(xenstore_chain); @@ -277,15 +282,15 @@ void xenbus_dev_shutdown(struct device *_dev) get_device(&dev->dev); if (dev->state != XenbusStateConnected) { - printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__, - dev->nodename, xenbus_strstate(dev->state)); + pr_info("%s: %s: %s != Connected, skipping\n", + __func__, dev->nodename, xenbus_strstate(dev->state)); goto out; } xenbus_switch_state(dev, XenbusStateClosing); timeout = wait_for_completion_timeout(&dev->down, timeout); if (!timeout) - printk(KERN_INFO "%s: %s timeout closing device\n", - __func__, dev->nodename); + pr_info("%s: %s timeout closing device\n", + __func__, dev->nodename); out: put_device(&dev->dev); } @@ -379,12 +384,14 @@ static ssize_t nodename_show(struct device *dev, { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } +static DEVICE_ATTR_RO(nodename); static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(devtype); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -392,14 +399,24 @@ static ssize_t modalias_show(struct device *dev, return sprintf(buf, "%s:%s\n", dev->bus->name, to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(modalias); + +static struct attribute *xenbus_dev_attrs[] = { + &dev_attr_nodename.attr, + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + NULL, +}; -struct device_attribute xenbus_dev_attrs[] = { - __ATTR_RO(nodename), - __ATTR_RO(devtype), - __ATTR_RO(modalias), - __ATTR_NULL +static const struct attribute_group xenbus_dev_group = { + .attrs = xenbus_dev_attrs, }; -EXPORT_SYMBOL_GPL(xenbus_dev_attrs); + +const struct attribute_group *xenbus_dev_groups[] = { + &xenbus_dev_group, + NULL, +}; +EXPORT_SYMBOL_GPL(xenbus_dev_groups); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -444,7 +461,7 @@ int xenbus_probe_node(struct xen_bus_type *bus, if (err) goto fail; - dev_set_name(&xendev->dev, devname); + dev_set_name(&xendev->dev, "%s", devname); /* Register with generic device framework. 
*/ err = device_register(&xendev->dev); @@ -576,8 +593,7 @@ int xenbus_dev_suspend(struct device *dev) if (drv->suspend) err = drv->suspend(xdev); if (err) - printk(KERN_WARNING - "xenbus: suspend %s failed: %i\n", dev_name(dev), err); + pr_warn("suspend %s failed: %i\n", dev_name(dev), err); return 0; } EXPORT_SYMBOL_GPL(xenbus_dev_suspend); @@ -596,9 +612,8 @@ int xenbus_dev_resume(struct device *dev) drv = to_xenbus_driver(dev->driver); err = talk_to_otherend(xdev); if (err) { - printk(KERN_WARNING - "xenbus: resume (talk_to_otherend) %s failed: %i\n", - dev_name(dev), err); + pr_warn("resume (talk_to_otherend) %s failed: %i\n", + dev_name(dev), err); return err; } @@ -607,18 +622,15 @@ int xenbus_dev_resume(struct device *dev) if (drv->resume) { err = drv->resume(xdev); if (err) { - printk(KERN_WARNING - "xenbus: resume %s failed: %i\n", - dev_name(dev), err); + pr_warn("resume %s failed: %i\n", dev_name(dev), err); return err; } } err = watch_otherend(xdev); if (err) { - printk(KERN_WARNING - "xenbus_probe: resume (watch_otherend) %s failed: " - "%d.\n", dev_name(dev), err); + pr_warn("resume (watch_otherend) %s failed: %d.\n", + dev_name(dev), err); return err; } @@ -719,17 +731,11 @@ static int __init xenstored_local_init(void) return err; } -enum xenstore_init { - UNKNOWN, - PV, - HVM, - LOCAL, -}; static int __init xenbus_init(void) { int err = 0; - enum xenstore_init usage = UNKNOWN; uint64_t v = 0; + xen_store_domain_type = XS_UNKNOWN; if (!xen_domain()) return -ENODEV; @@ -737,29 +743,29 @@ static int __init xenbus_init(void) xenbus_ring_ops_init(); if (xen_pv_domain()) - usage = PV; + xen_store_domain_type = XS_PV; if (xen_hvm_domain()) - usage = HVM; + xen_store_domain_type = XS_HVM; if (xen_hvm_domain() && xen_initial_domain()) - usage = LOCAL; + xen_store_domain_type = XS_LOCAL; if (xen_pv_domain() && !xen_start_info->store_evtchn) - usage = LOCAL; + xen_store_domain_type = XS_LOCAL; if (xen_pv_domain() && xen_start_info->store_evtchn) xenstored_ready = 1; - switch (usage) { - case LOCAL: + switch (xen_store_domain_type) { + case XS_LOCAL: err = xenstored_local_init(); if (err) goto out_error; xen_store_interface = mfn_to_virt(xen_store_mfn); break; - case PV: + case XS_PV: xen_store_evtchn = xen_start_info->store_evtchn; xen_store_mfn = xen_start_info->store_mfn; xen_store_interface = mfn_to_virt(xen_store_mfn); break; - case HVM: + case XS_HVM: err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); if (err) goto out_error; @@ -769,7 +775,7 @@ static int __init xenbus_init(void) goto out_error; xen_store_mfn = (unsigned long)v; xen_store_interface = - ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); break; default: pr_warn("Xenstore state unknown\n"); @@ -779,8 +785,7 @@ static int __init xenbus_init(void) /* Initialize the interface to xenstore. 
*/ err = xs_init(); if (err) { - printk(KERN_WARNING - "XENBUS: Error initializing xenstore comms: %i\n", err); + pr_warn("Error initializing xenstore comms: %i\n", err); goto out_error; } diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index bb4f92ed873..1085ec294a1 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -47,7 +47,14 @@ struct xen_bus_type { struct bus_type bus; }; -extern struct device_attribute xenbus_dev_attrs[]; +enum xenstore_init { + XS_UNKNOWN, + XS_PV, + XS_HVM, + XS_LOCAL, +}; + +extern const struct attribute_group *xenbus_dev_groups[]; extern int xenbus_match(struct device *_dev, struct device_driver *_drv); extern int xenbus_dev_probe(struct device *_dev); diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 257be37d909..5125dce11a6 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -31,9 +31,11 @@ * IN THE SOFTWARE. */ -#define DPRINTK(fmt, args...) \ - pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) \ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) #include <linux/kernel.h> #include <linux/err.h> @@ -198,7 +200,7 @@ static struct xen_bus_type xenbus_backend = { .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, }, }; diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 3159a37d966..cb385c10d2b 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -1,6 +1,8 @@ -#define DPRINTK(fmt, args...) \ - pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ - __func__, __LINE__, ##args) +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) 
\ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) #include <linux/kernel.h> #include <linux/err.h> @@ -29,18 +31,20 @@ #include "xenbus_probe.h" +static struct workqueue_struct *xenbus_frontend_wq; + /* device/<type>/<id> => <type>-<id> */ static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) { nodename = strchr(nodename, '/'); if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { - printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); + pr_warn("bad frontend %s\n", nodename); return -EINVAL; } strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); if (!strchr(bus_id, '/')) { - printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); + pr_warn("bus_id %s no slash\n", bus_id); return -EINVAL; } *strchr(bus_id, '/') = '-'; @@ -89,9 +93,49 @@ static void backend_changed(struct xenbus_watch *watch, xenbus_otherend_changed(watch, vec, len, 1); } +static void xenbus_frontend_delayed_resume(struct work_struct *w) +{ + struct xenbus_device *xdev = container_of(w, struct xenbus_device, work); + + xenbus_dev_resume(&xdev->dev); +} + +static int xenbus_frontend_dev_resume(struct device *dev) +{ + /* + * If xenstored is running in this domain, we cannot access the backend + * state at the moment, so we need to defer xenbus_dev_resume + */ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + + if (!xenbus_frontend_wq) { + pr_err("%s: no workqueue to process delayed resume\n", + xdev->nodename); + return -EFAULT; + } + + queue_work(xenbus_frontend_wq, &xdev->work); + + return 0; + } + + return xenbus_dev_resume(dev); +} + +static int xenbus_frontend_dev_probe(struct device *dev) +{ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume); + } + + return xenbus_dev_probe(dev); +} + static const struct dev_pm_ops xenbus_pm_ops = { .suspend = xenbus_dev_suspend, - .resume = xenbus_dev_resume, + .resume = xenbus_frontend_dev_resume, .freeze = xenbus_dev_suspend, .thaw = xenbus_dev_cancel, .restore = xenbus_dev_resume, @@ -107,10 +151,10 @@ static struct xen_bus_type xenbus_frontend = { .name = "xen", .match = xenbus_match, .uevent = xenbus_uevent_frontend, - .probe = xenbus_dev_probe, + .probe = xenbus_frontend_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, .pm = &xenbus_pm_ops, }, @@ -201,15 +245,13 @@ static int print_device_status(struct device *dev, void *data) if (!dev->driver) { /* Information only: is this too noisy? 
*/ - printk(KERN_INFO "XENBUS: Device with no driver: %s\n", - xendev->nodename); + pr_info("Device with no driver: %s\n", xendev->nodename); } else if (xendev->state < XenbusStateConnected) { enum xenbus_state rstate = XenbusStateUnknown; if (xendev->otherend) rstate = xenbus_read_driver_state(xendev->otherend); - printk(KERN_WARNING "XENBUS: Timeout connecting " - "to device: %s (local state %d, remote state %d)\n", - xendev->nodename, xendev->state, rstate); + pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n", + xendev->nodename, xendev->state, rstate); } return 0; @@ -223,12 +265,13 @@ static bool wait_loop(unsigned long start, unsigned int max_delay, { if (time_after(jiffies, start + (*seconds_waited+5)*HZ)) { if (!*seconds_waited) - printk(KERN_WARNING "XENBUS: Waiting for " - "devices to initialise: "); + pr_warn("Waiting for devices to initialise: "); *seconds_waited += 5; - printk("%us...", max_delay - *seconds_waited); - if (*seconds_waited == max_delay) + pr_cont("%us...", max_delay - *seconds_waited); + if (*seconds_waited == max_delay) { + pr_cont("\n"); return true; + } } schedule_timeout_interruptible(HZ/10); @@ -309,7 +352,7 @@ static void xenbus_reset_wait_for_backend(char *be, int expected) timeout = wait_event_interruptible_timeout(backend_state_wq, backend_state == expected, 5 * HZ); if (timeout <= 0) - printk(KERN_INFO "XENBUS: backend %s timed out.\n", be); + pr_info("backend %s timed out\n", be); } /* @@ -332,7 +375,7 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) be_watch.callback = xenbus_reset_backend_state_changed; backend_state = XenbusStateUnknown; - printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", be); + pr_info("triggering reconnect on %s\n", be); register_xenbus_watch(&be_watch); /* fall through to forward backend to state XenbusStateInitialising */ @@ -351,7 +394,7 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) } unregister_xenbus_watch(&be_watch); - printk(KERN_INFO "XENBUS: reconnect done on %s\n", be); + pr_info("reconnect done on %s\n", be); kfree(be_watch.node); } @@ -440,6 +483,12 @@ static int __init xenbus_probe_frontend_init(void) register_xenstore_notifier(&xenstore_notifier); + if (xen_store_domain_type == XS_LOCAL) { + xenbus_frontend_wq = create_workqueue("xenbus_frontend"); + if (!xenbus_frontend_wq) + pr_warn("create xenbus frontend workqueue failed, S3 resume is likely to fail\n"); + } + return 0; } subsys_initcall(xenbus_probe_frontend_init); @@ -447,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init); #ifndef MODULE static int __init boot_wait_for_devices(void) { - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_devices()) return -ENODEV; ready_to_wait_for_devices = 1; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index f5dda83ad7a..ba804f3d827 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -31,6 +31,8 @@ * IN THE SOFTWARE. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/unistd.h> #include <linux/errno.h> #include <linux/types.h> @@ -48,7 +50,7 @@ #include <xen/xenbus.h> #include <xen/xen.h> #include "xenbus_comms.h" -#include <asm/xen/hypervisor.h> +#include "xenbus_probe.h" struct xs_stored_msg { struct list_head list; @@ -130,15 +132,37 @@ static int get_error(const char *errorstring) for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) { if (i == ARRAY_SIZE(xsd_errors) - 1) { - printk(KERN_WARNING - "XENBUS xen store gave: unknown error %s", - errorstring); + pr_warn("xen store gave: unknown error %s\n", + errorstring); return EINVAL; } } return xsd_errors[i].errnum; } +static bool xenbus_ok(void) +{ + switch (xen_store_domain_type) { + case XS_LOCAL: + switch (system_state) { + case SYSTEM_POWER_OFF: + case SYSTEM_RESTART: + case SYSTEM_HALT: + return false; + default: + break; + } + return true; + case XS_PV: + case XS_HVM: + /* FIXME: Could check that the remote domain is alive, + * but it is normally initial domain. */ + return true; + default: + break; + } + return false; +} static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) { struct xs_stored_msg *msg; @@ -148,9 +172,20 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) while (list_empty(&xs_state.reply_list)) { spin_unlock(&xs_state.reply_lock); - /* XXX FIXME: Avoid synchronous wait for response here. */ - wait_event(xs_state.reply_waitq, - !list_empty(&xs_state.reply_list)); + if (xenbus_ok()) + /* XXX FIXME: Avoid synchronous wait for response here. */ + wait_event_timeout(xs_state.reply_waitq, + !list_empty(&xs_state.reply_list), + msecs_to_jiffies(500)); + else { + /* + * If we are in the process of being shut-down there is + * no point of trying to contact XenBus - it is either + * killed (xenstored application) or the other domain + * has been killed or is unreachable. + */ + return ERR_PTR(-EIO); + } spin_lock(&xs_state.reply_lock); } @@ -215,6 +250,9 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); + if (IS_ERR(ret)) + return ret; + if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) @@ -273,10 +311,8 @@ static void *xs_talkv(struct xenbus_transaction t, } if (msg.type != type) { - if (printk_ratelimit()) - printk(KERN_WARNING - "XENBUS unexpected type [%d], expected [%d]\n", - msg.type, type); + pr_warn_ratelimited("unexpected type [%d], expected [%d]\n", + msg.type, type); kfree(ret); return ERR_PTR(-EINVAL); } @@ -627,6 +663,7 @@ static struct xenbus_watch *find_watch(const char *token) */ static bool xen_strict_xenbus_quirk(void) { +#ifdef CONFIG_X86 uint32_t eax, ebx, ecx, edx, base; base = xen_cpuid_base(); @@ -634,6 +671,7 @@ static bool xen_strict_xenbus_quirk(void) if ((eax >> 16) < 4) return true; +#endif return false; } @@ -654,7 +692,7 @@ static void xs_reset_watches(void) err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); if (err && err != -EEXIST) - printk(KERN_WARNING "xs_reset_watches failed: %d\n", err); + pr_warn("xs_reset_watches failed: %d\n", err); } /* Register callback to watch this node. 
*/ @@ -704,9 +742,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) err = xs_unwatch(watch->node, token); if (err) - printk(KERN_WARNING - "XENBUS Failed to release watch %s: %i\n", - watch->node, err); + pr_warn("Failed to release watch %s: %i\n", watch->node, err); up_read(&xs_state.watch_mutex); @@ -900,8 +936,7 @@ static int xenbus_thread(void *unused) for (;;) { err = process_msg(); if (err) - printk(KERN_WARNING "XENBUS error %d while reading " - "message\n", err); + pr_warn("error %d while reading message\n", err); if (kthread_should_stop()) break; } diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index b91f8ff50d0..00000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard <hollisb@us.ibm.com> - */ - -#include <linux/mm.h> -#include <linux/slab.h> -#include <asm/page.h> -#include <xen/xencomm.h> -#include <xen/interface/xen.h> -#include <asm/xen/xencomm.h> /* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, - void *buffer, unsigned long bytes) -{ - unsigned long recorded = 0; - int i = 0; - - while ((recorded < bytes) && (i < desc->nr_addrs)) { - unsigned long vaddr = (unsigned long)buffer + recorded; - unsigned long paddr; - int offset; - int chunksz; - - offset = vaddr % PAGE_SIZE; /* handle partial pages */ - chunksz = min(PAGE_SIZE - offset, bytes - recorded); - - paddr = xencomm_vtop(vaddr); - if (paddr == ~0UL) { - printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", - __func__, vaddr); - return -EINVAL; - } - - desc->address[i++] = paddr; - recorded += chunksz; - } - - if (recorded < bytes) { - printk(KERN_DEBUG - "%s: could only translate %ld of %ld bytes\n", - __func__, recorded, bytes); - return -ENOSPC; - } - - /* mark remaining addresses invalid (just for safety) */ - while (i < desc->nr_addrs) - desc->address[i++] = XENCOMM_INVALID; - - desc->magic = XENCOMM_MAGIC; - - return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, - void *buffer, unsigned long bytes) -{ - struct xencomm_desc *desc; - unsigned long buffer_ulong = (unsigned long)buffer; - unsigned long start = buffer_ulong & PAGE_MASK; - unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; - unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; - unsigned long size = sizeof(*desc) + - sizeof(desc->address[0]) * nr_addrs; - - /* - * slab allocator returns at least sizeof(void*) aligned pointer. - * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might - * cross page boundary. 
- */ - if (sizeof(*desc) > sizeof(void *)) { - unsigned long order = get_order(size); - desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, - order); - if (desc == NULL) - return NULL; - - desc->nr_addrs = - ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / - sizeof(*desc->address); - } else { - desc = kmalloc(size, gfp_mask); - if (desc == NULL) - return NULL; - - desc->nr_addrs = nr_addrs; - } - return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ - if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { - struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; - if (sizeof(*desc__) > sizeof(void *)) { - unsigned long size = sizeof(*desc__) + - sizeof(desc__->address[0]) * desc__->nr_addrs; - unsigned long order = get_order(size); - free_pages((unsigned long)__va(desc), order); - } else - kfree(__va(desc)); - } -} - -static int xencomm_create(void *buffer, unsigned long bytes, - struct xencomm_desc **ret, gfp_t gfp_mask) -{ - struct xencomm_desc *desc; - int rc; - - pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - - if (bytes == 0) { - /* don't create a descriptor; Xen recognizes NULL. */ - BUG_ON(buffer != NULL); - *ret = NULL; - return 0; - } - - BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - - desc = xencomm_alloc(gfp_mask, buffer, bytes); - if (!desc) { - printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); - return -ENOMEM; - } - - rc = xencomm_init(desc, buffer, bytes); - if (rc) { - printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); - xencomm_free((struct xencomm_handle *)__pa(desc)); - return rc; - } - - *ret = desc; - return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ - unsigned long paddr; - - BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - - paddr = (unsigned long)xencomm_pa(ptr); - BUG_ON(paddr & XENCOMM_INLINE_FLAG); - return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, - unsigned long bytes, struct xencomm_mini *xc_desc, - struct xencomm_desc **ret) -{ - int rc = 0; - struct xencomm_desc *desc; - BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - - desc = (void *)xc_desc; - - desc->nr_addrs = XENCOMM_MINI_ADDRS; - - rc = xencomm_init(desc, buffer, bytes); - if (!rc) - *ret = desc; - - return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ - int rc; - struct xencomm_desc *desc; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - - if (rc || desc == NULL) - return NULL; - - return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, - struct xencomm_mini *xc_desc) -{ - int rc; - struct xencomm_desc *desc = NULL; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create_mini(ptr, bytes, xc_desc, - &desc); - - if (rc) - return NULL; - - return xencomm_pa(desc); -} diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 459b9ac45cf..06092e0fe8c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -7,6 +7,8 @@ * Turned xenfs into a loadable module. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> @@ -24,47 +26,6 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); -static struct inode *xenfs_make_inode(struct super_block *sb, int mode) -{ - struct inode *ret = new_inode(sb); - - if (ret) { - ret->i_mode = mode; - ret->i_uid = GLOBAL_ROOT_UID; - ret->i_gid = GLOBAL_ROOT_GID; - ret->i_blocks = 0; - ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; - } - return ret; -} - -static struct dentry *xenfs_create_file(struct super_block *sb, - struct dentry *parent, - const char *name, - const struct file_operations *fops, - void *data, - int mode) -{ - struct dentry *dentry; - struct inode *inode; - - dentry = d_alloc_name(parent, name); - if (!dentry) - return NULL; - - inode = xenfs_make_inode(sb, S_IFREG | mode); - if (!inode) { - dput(dentry); - return NULL; - } - - inode->i_fop = fops; - inode->i_private = data; - - d_add(dentry, inode); - return dentry; -} - static ssize_t capabilities_read(struct file *file, char __user *buf, size_t size, loff_t *off) { @@ -84,26 +45,23 @@ static const struct file_operations capabilities_file_ops = { static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr xenfs_files[] = { - [1] = {}, - { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, {""}, }; - int rc; - - rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); - if (rc < 0) - return rc; - if (xen_initial_domain()) { - xenfs_create_file(sb, sb->s_root, "xsd_kva", - &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); - xenfs_create_file(sb, sb->s_root, "xsd_port", - &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); - } + static struct tree_descr xenfs_init_files[] = { + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, + { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, + { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, + {""}, + }; - return rc; + return simple_fill_super(sb, XENFS_SUPER_MAGIC, + xen_initial_domain() ? xenfs_init_files : xenfs_files); } static struct dentry *xenfs_mount(struct file_system_type *fs_type, @@ -119,13 +77,14 @@ static struct file_system_type xenfs_type = { .mount = xenfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("xenfs"); static int __init xenfs_init(void) { if (xen_domain()) return register_filesystem(&xenfs_type); - printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); + pr_info("not registering filesystem on non-xen platform\n"); return 0; } |
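The strict_strtoul() to kstrtoul() conversions in the xen-selfballoon.c hunks above all follow one idiom: parse errors are now propagated to the caller as-is, and only genuinely invalid values return -EINVAL. A minimal sketch of that handler shape, assuming a purely hypothetical "example_threshold" attribute (not part of this patch), looks like this:

/*
 * Illustrative sketch only: mirrors the kstrtoul() store-handler pattern
 * used by store_selfballoon_interval() and friends in the patch above.
 * "example_threshold" and the function names are hypothetical.
 */
static unsigned long example_threshold = 1;

static ssize_t store_example_threshold(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err)		/* propagate the parse error itself */
		return err;
	if (val == 0)		/* reject out-of-range values with -EINVAL */
		return -EINVAL;

	example_threshold = val;
	return count;
}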
