diff options
Diffstat (limited to 'drivers/xen')
34 files changed, 1830 insertions, 1091 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 23eae5cb69c..38fb36e1c59 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -3,7 +3,6 @@ menu "Xen driver support" config XEN_BALLOON bool "Xen memory balloon driver" - depends on !ARM default y help The balloon driver allows the Xen domain to request more memory from @@ -140,7 +139,6 @@ config XEN_GRANT_DEV_ALLOC config SWIOTLB_XEN def_bool y - depends on PCI && X86 select SWIOTLB config XEN_TMEM @@ -223,7 +221,7 @@ config XEN_ACPI_PROCESSOR To do that the driver parses the Power Management data and uploads said information to the Xen hypervisor. Then the Xen hypervisor can - select the proper Cx and Pxx states. It also registers itslef as the + select the proper Cx and Pxx states. It also registers itself as the SMM so that other drivers (such as ACPI cpufreq scaling driver) will not load. diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 14fe79d8634..45e00afa7f2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -2,7 +2,8 @@ ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o endif obj-$(CONFIG_X86) += fallback.o -obj-y += grant-table.o features.o events.o balloon.o manage.o +obj-y += grant-table.o features.o balloon.o manage.o +obj-y += events/ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) @@ -15,7 +16,6 @@ xen-pad-$(CONFIG_X86) += xen-acpi-pad.o dom0-$(CONFIG_X86) += pcpu.o obj-$(CONFIG_XEN_DOM0) += $(dom0-y) obj-$(CONFIG_BLOCK) += biomerge.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index b232908a619..5c660c77f03 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -157,13 +157,6 @@ static struct page *balloon_retrieve(bool prefer_highmem) return page; } -static struct page *balloon_first_page(void) -{ - if (list_empty(&ballooned_pages)) - return NULL; - return list_entry(ballooned_pages.next, struct page, lru); -} - static struct page *balloon_next_page(struct page *page) { struct list_head *next = page->lru.next; @@ -328,7 +321,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - page = balloon_first_page(); + page = list_first_entry_or_null(&ballooned_pages, struct page, lru); for (i = 0; i < nr_pages; i++) { if (!page) { nr_pages = i; @@ -350,17 +343,19 @@ static enum bp_state increase_reservation(unsigned long nr_pages) pfn = page_to_pfn(page); - set_phys_to_machine(pfn, frame_list[i]); - #ifdef CONFIG_XEN_HAVE_PVMMU - /* Link back into the page tables if not highmem. */ - if (xen_pv_domain() && !PageHighMem(page)) { - int ret; - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(frame_list[i], PAGE_KERNEL), - 0); - BUG_ON(ret); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_phys_to_machine(pfn, frame_list[i]); + + /* Link back into the page tables if not highmem. */ + if (!PageHighMem(page)) { + int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(frame_list[i], PAGE_KERNEL), + 0); + BUG_ON(ret); + } } #endif @@ -378,7 +373,6 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) enum bp_state state = BP_DONE; unsigned long pfn, i; struct page *page; - struct page *scratch_page; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, @@ -405,39 +399,51 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) state = BP_EAGAIN; break; } + scrub_page(page); - pfn = page_to_pfn(page); - frame_list[i] = pfn_to_mfn(pfn); + frame_list[i] = page_to_pfn(page); + } - scrub_page(page); + /* + * Ensure that ballooned highmem pages don't have kmaps. + * + * Do this before changing the p2m as kmap_flush_unused() + * reads PTEs to obtain pages (and hence needs the original + * p2m entry). + */ + kmap_flush_unused(); + /* Update direct mapping, invalidate P2M, and add to balloon. */ + for (i = 0; i < nr_pages; i++) { + pfn = frame_list[i]; + frame_list[i] = pfn_to_mfn(pfn); + page = pfn_to_page(pfn); + +#ifdef CONFIG_XEN_HAVE_PVMMU /* * Ballooned out frames are effectively replaced with * a scratch frame. Ensure direct mappings and the * p2m are consistent. */ - scratch_page = get_balloon_scratch_page(); -#ifdef CONFIG_XEN_HAVE_PVMMU - if (xen_pv_domain() && !PageHighMem(page)) { - ret = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(page_to_pfn(scratch_page), - PAGE_KERNEL_RO), 0); - BUG_ON(ret); - } -#endif if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long p; - p = page_to_pfn(scratch_page); - __set_phys_to_machine(pfn, pfn_to_mfn(p)); + if (!PageHighMem(page)) { + struct page *scratch_page = get_balloon_scratch_page(); + + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); + BUG_ON(ret); + + put_balloon_scratch_page(); + } + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } - put_balloon_scratch_page(); +#endif - balloon_append(pfn_to_page(pfn)); + balloon_append(page); } - /* Ensure that ballooned highmem pages don't have kmaps. */ - kmap_flush_unused(); flush_tlb_all(); set_xen_guest_handle(reservation.extent_start, frame_list); @@ -596,19 +602,29 @@ static void __init balloon_add_region(unsigned long start_pfn, } } -static int __cpuinit balloon_cpu_notify(struct notifier_block *self, +static int alloc_balloon_scratch_page(int cpu) +{ + if (per_cpu(balloon_scratch_page, cpu) != NULL) + return 0; + + per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); + if (per_cpu(balloon_scratch_page, cpu) == NULL) { + pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); + return -ENOMEM; + } + + return 0; +} + + +static int balloon_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: - if (per_cpu(balloon_scratch_page, cpu) != NULL) - break; - per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); - if (per_cpu(balloon_scratch_page, cpu) == NULL) { - pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); + if (alloc_balloon_scratch_page(cpu)) return NOTIFY_BAD; - } break; default: break; @@ -616,7 +632,7 @@ static int __cpuinit balloon_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block balloon_cpu_notifier __cpuinitdata = { +static struct notifier_block balloon_cpu_notifier = { .notifier_call = balloon_cpu_notify, }; @@ -627,21 +643,25 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; - for_each_online_cpu(cpu) - { - per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); - if (per_cpu(balloon_scratch_page, cpu) == NULL) { - pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); - return -ENOMEM; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + register_cpu_notifier(&balloon_cpu_notifier); + + get_online_cpus(); + for_each_online_cpu(cpu) { + if (alloc_balloon_scratch_page(cpu)) { + put_online_cpus(); + unregister_cpu_notifier(&balloon_cpu_notifier); + return -ENOMEM; + } } + put_online_cpus(); } - register_cpu_notifier(&balloon_cpu_notifier); pr_info("Initialising balloon driver\n"); balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) - : max_pfn; + : get_num_physpages(); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c index f3ccc80a455..8145a59fd9f 100644 --- a/drivers/xen/dbgp.c +++ b/drivers/xen/dbgp.c @@ -19,7 +19,7 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op) dbgp.op = op; #ifdef CONFIG_PCI - if (ctrlr->bus == &pci_bus_type) { + if (dev_is_pci(ctrlr)) { const struct pci_dev *pdev = to_pci_dev(ctrlr); dbgp.u.pci.seg = pci_domain_nr(pdev->bus); diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 00000000000..62be55cd981 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 00000000000..5db43fc100a --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,365 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). + */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], + cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ + return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); + set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. + */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* Timer interrupt has highest priority. */ + irq = irq_from_virq(cpu, VIRQ_TIMER); + if (irq != -1) { + unsigned int evtchn = evtchn_from_irq(irq); + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) + generic_handle_irq(irq); + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. + */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. + */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + int port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + irq = get_evtchn_to_irq(port); + + if (irq != -1) + generic_handle_irq(irq); + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, + get_evtchn_to_irq(i), + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? "" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + +static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .test_and_set_mask = evtchn_2l_test_and_set_mask, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events.c b/drivers/xen/events/events_base.c index 4035e833ea2..c919d3d5c84 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events/events_base.c @@ -59,6 +59,10 @@ #include <xen/interface/vcpu.h> #include <asm/hw_irq.h> +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; + /* * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. @@ -73,71 +77,15 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -/* Interrupt types. */ -enum xen_irq_type { - IRQT_UNBOUND = 0, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - * PIRQ - physical IRQ, GSI, flags, and owner domain - * VIRQ - virq number - * IPI - IPI vector - * EVTCHN - - */ -struct irq_info { - struct list_head list; - int refcnt; - enum xen_irq_type type; /* type */ - unsigned irq; - unsigned short evtchn; /* event channel */ - unsigned short cpu; /* cpu bound */ - - union { - unsigned short virq; - enum ipi_vector ipi; - struct { - unsigned short pirq; - unsigned short gsi; - unsigned char flags; - uint16_t domid; - } pirq; - } u; -}; -#define PIRQ_NEEDS_EOI (1 << 0) -#define PIRQ_SHAREABLE (1 << 1) - -static int *evtchn_to_irq; +int **evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; #endif static bool (*pirq_needs_eoi)(unsigned irq); -/* - * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be - * careful to only use bitops which allow for this (e.g - * test_bit/find_first_bit and friends but not __ffs) and to pass - * BITS_PER_EVTCHN_WORD as the bitmask length. - */ -#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) -/* - * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t - * array. Primarily to avoid long lines (hence the terse name). - */ -#define BM(x) (unsigned long *)(x) -/* Find the first set bit in a evtchn mask */ -#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) - -static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], - cpu_evtchn_mask); +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -148,19 +96,75 @@ static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data); +static void clear_evtchn_to_irq_row(unsigned row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(row); + } +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ + unsigned row; + unsigned col; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); + if (evtchn_to_irq[row] == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(row); + } + + evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; + return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ + if (evtchn >= xen_evtchn_max_channels()) + return -1; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return -1; + return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} + /* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) +struct irq_info *info_for_irq(unsigned irq) { return irq_get_handler_data(irq); } /* Constructors for packed IRQ information. */ -static void xen_irq_info_common_init(struct irq_info *info, +static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, enum xen_irq_type type, - unsigned short evtchn, + unsigned evtchn, unsigned short cpu) { + int ret; BUG_ON(info->type != IRQT_UNBOUND && info->type != type); @@ -169,68 +173,78 @@ static void xen_irq_info_common_init(struct irq_info *info, info->evtchn = evtchn; info->cpu = cpu; - evtchn_to_irq[evtchn] = irq; + ret = set_evtchn_to_irq(evtchn, irq); + if (ret < 0) + return ret; irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + + return xen_evtchn_port_setup(info); } -static void xen_irq_info_evtchn_init(unsigned irq, - unsigned short evtchn) +static int xen_irq_info_evtchn_setup(unsigned irq, + unsigned evtchn) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); + return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); } -static void xen_irq_info_ipi_init(unsigned cpu, +static int xen_irq_info_ipi_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, + unsigned evtchn, enum ipi_vector ipi) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); - info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } -static void xen_irq_info_virq_init(unsigned cpu, +static int xen_irq_info_virq_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, - unsigned short virq) + unsigned evtchn, + unsigned virq) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); - info->u.virq = virq; per_cpu(virq_to_irq, cpu)[virq] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); } -static void xen_irq_info_pirq_init(unsigned irq, - unsigned short evtchn, - unsigned short pirq, - unsigned short gsi, +static int xen_irq_info_pirq_setup(unsigned irq, + unsigned evtchn, + unsigned pirq, + unsigned gsi, uint16_t domid, unsigned char flags) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.domid = domid; info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + info->evtchn = 0; } /* * Accessors for packed IRQ information. */ -static unsigned int evtchn_from_irq(unsigned irq) +unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) return 0; @@ -240,10 +254,15 @@ static unsigned int evtchn_from_irq(unsigned irq) unsigned irq_from_evtchn(unsigned int evtchn) { - return evtchn_to_irq[evtchn]; + return get_evtchn_to_irq(evtchn); } EXPORT_SYMBOL_GPL(irq_from_evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ + return per_cpu(virq_to_irq, cpu)[virq]; +} + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -279,14 +298,14 @@ static enum xen_irq_type type_from_irq(unsigned irq) return info_for_irq(irq)->type; } -static unsigned cpu_from_irq(unsigned irq) +unsigned cpu_from_irq(unsigned irq) { return info_for_irq(irq)->cpu; } -static unsigned int cpu_from_evtchn(unsigned int evtchn) +unsigned int cpu_from_evtchn(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); unsigned ret = 0; if (irq != -1) @@ -310,67 +329,28 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static inline xen_ulong_t active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return sh->evtchn_pending[idx] & - per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]; -} - static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { - int irq = evtchn_to_irq[chn]; + int irq = get_evtchn_to_irq(chn); + struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); #ifdef CONFIG_SMP - cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); -#endif - - clear_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu_from_irq(irq)))); - set_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu))); - - info_for_irq(irq)->cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ - int i; -#ifdef CONFIG_SMP - struct irq_info *info; - - /* By default all event channels notify CPU#0. */ - list_for_each_entry(info, &xen_irq_list_head, list) { - struct irq_desc *desc = irq_to_desc(info->irq); - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); - } + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu)); #endif + xen_evtchn_port_bind_to_cpu(info, cpu); - for_each_possible_cpu(i) - memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? ~0 : 0, NR_EVENT_CHANNELS/8); -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, BM(&s->evtchn_pending[0])); + info->cpu = cpu; } -static inline void set_evtchn(int port) +static void xen_evtchn_mask_all(void) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_pending[0])); -} + unsigned int evtchn; -static inline int test_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, BM(&s->evtchn_pending[0])); + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); } - /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -388,71 +368,12 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_mask[0])); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - int do_hypercall = 0, evtchn_pending = 0; - - BUG_ON(!irqs_disabled()); - - if (unlikely((cpu != cpu_from_evtchn(port)))) - do_hypercall = 1; - else { - /* - * Need to clear the mask before checking pending to - * avoid a race with an event becoming pending. - * - * EVTCHNOP_unmask will only trigger an upcall if the - * mask bit was set, so if a hypercall is needed - * remask the event. - */ - sync_clear_bit(port, BM(&s->evtchn_mask[0])); - evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); - - if (unlikely(evtchn_pending && xen_hvm_domain())) { - sync_set_bit(port, BM(&s->evtchn_mask[0])); - do_hypercall = 1; - } - } - - /* Slow path (hypercall) if this is a non-local port or if this is - * an hvm domain and an event is pending (hvm domains don't have - * their own implementation of irq_enable). */ - if (do_hypercall) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (evtchn_pending && - !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, - BM(&vcpu_info->evtchn_pending_sel))) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - static void xen_irq_init(unsigned irq) { struct irq_info *info; #ifdef CONFIG_SMP - struct irq_desc *desc = irq_to_desc(irq); - /* By default all event channels notify CPU#0. */ - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0)); #endif info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -467,29 +388,22 @@ static void xen_irq_init(unsigned irq) list_add_tail(&info->list, &xen_irq_list_head); } -static int __must_check xen_allocate_irq_dynamic(void) +static int __must_check xen_allocate_irqs_dynamic(int nvec) { - int first = 0; - int irq; + int i, irq = irq_alloc_descs(-1, 0, nvec, -1); -#ifdef CONFIG_X86_IO_APIC - /* - * For an HVM guest or domain 0 which see "real" (emulated or - * actual respectively) GSIs we allocate dynamic IRQs - * e.g. those corresponding to event channels or MSIs - * etc. from the range above those "real" GSIs to avoid - * collisions. - */ - if (xen_initial_domain() || xen_hvm_domain()) - first = get_nr_irqs_gsi(); -#endif + if (irq >= 0) { + for (i = 0; i < nvec; i++) + xen_irq_init(irq + i); + } - irq = irq_alloc_desc_from(first, -1); + return irq; +} - if (irq >= 0) - xen_irq_init(irq); +static inline int __must_check xen_allocate_irq_dynamic(void) +{ - return irq; + return xen_allocate_irqs_dynamic(1); } static int __must_check xen_allocate_irq_gsi(unsigned gsi) @@ -538,6 +452,15 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } +static void xen_evtchn_close(unsigned int port) +{ + struct evtchn_close close; + + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); +} + static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -554,13 +477,6 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static bool probing_irq(int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return desc && desc->action == NULL; -} - static void eoi_pirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -602,15 +518,20 @@ static unsigned int __startup_pirq(unsigned int irq) BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - if (!probing_irq(irq)) - pr_info("Failed to obtain physical IRQ %d\n", irq); + pr_warn("Failed to obtain physical IRQ %d\n", irq); return 0; } evtchn = bind_pirq.port; pirq_query_unmask(irq); - evtchn_to_irq[evtchn] = irq; + rc = set_evtchn_to_irq(evtchn, irq); + if (rc != 0) { + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", + irq, rc); + xen_evtchn_close(evtchn); + return 0; + } bind_evtchn_to_cpu(evtchn, 0); info->evtchn = evtchn; @@ -628,10 +549,9 @@ static unsigned int startup_pirq(struct irq_data *data) static void shutdown_pirq(struct irq_data *data) { - struct evtchn_close close; unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); + unsigned evtchn = evtchn_from_irq(irq); BUG_ON(info->type != IRQT_PIRQ); @@ -639,14 +559,8 @@ static void shutdown_pirq(struct irq_data *data) return; mask_evtchn(evtchn); - - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - info->evtchn = 0; + xen_evtchn_close(evtchn); + xen_irq_info_cleanup(info); } static void enable_pirq(struct irq_data *data) @@ -675,6 +589,41 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); +static void __unbind_from_irq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + struct irq_info *info = irq_get_handler_data(irq); + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + if (VALID_EVTCHN(evtchn)) { + unsigned int cpu = cpu_from_irq(irq); + + xen_evtchn_close(evtchn); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + break; + default: + break; + } + + xen_irq_info_cleanup(info); + } + + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + + xen_free_irq(irq); +} + /* * Do not make any assumptions regarding the relationship between the * IRQ number returned here and the Xen pirq argument. @@ -690,6 +639,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, { int irq = -1; struct physdev_irq irq_op; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -717,8 +667,13 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, DOMID_SELF, + ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } pirq_query_unmask(irq); /* We try to use the handler with the appropriate semantic for the @@ -765,20 +720,25 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, const char *name, domid_t domid) + int pirq, int nvec, const char *name, domid_t domid) { - int irq, ret; + int i, irq, ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irq_dynamic(); + irq = xen_allocate_irqs_dynamic(nvec); if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, - name); + for (i = 0; i < nvec; i++) { + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + + ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + i == 0 ? 0 : PIRQ_MSI_GROUP); + if (ret < 0) + goto error_irq; + } - xen_irq_info_pirq_init(irq, 0, pirq, 0, domid, 0); ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -786,26 +746,27 @@ out: mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: + for (; i >= 0; i--) + __unbind_from_irq(irq + i); mutex_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); return ret; } #endif int xen_destroy_irq(int irq) { - struct irq_desc *desc; struct physdev_unmap_pirq unmap_irq; struct irq_info *info = info_for_irq(irq); int rc = -ENOENT; mutex_lock(&irq_mapping_update_lock); - desc = irq_to_desc(irq); - if (!desc) - goto out; - - if (xen_initial_domain()) { + /* + * If trying to remove a vector in a MSI group different + * than the first one skip the PIRQ unmap unless this vector + * is the first one in the group. + */ + if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { unmap_irq.pirq = info->u.pirq.pirq; unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); @@ -857,13 +818,18 @@ int xen_pirq_from_irq(unsigned irq) return pirq_from_irq(irq); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + int bind_evtchn_to_irq(unsigned int evtchn) { int irq; + int ret; + + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) { irq = xen_allocate_irq_dynamic(); @@ -873,7 +839,14 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, handle_edge_irq, "event"); - xen_irq_info_evtchn_init(irq, evtchn); + ret = xen_irq_info_evtchn_setup(irq, evtchn); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + /* New interdomain events are bound to VCPU 0. */ + bind_evtchn_to_cpu(evtchn, 0); } else { struct irq_info *info = info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_EVTCHN); @@ -890,6 +863,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; int evtchn, irq; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -909,8 +883,12 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - + ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { struct irq_info *info = info_for_irq(irq); @@ -943,7 +921,7 @@ static int find_virq(unsigned int virq, unsigned int cpu) int port, rc = -ENOENT; memset(&status, 0, sizeof(status)); - for (port = 0; port <= NR_EVENT_CHANNELS; port++) { + for (port = 0; port < xen_evtchn_max_channels(); port++) { status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -959,6 +937,19 @@ static int find_virq(unsigned int virq, unsigned int cpu) return rc; } +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. + */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; @@ -989,7 +980,12 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) evtchn = ret; } - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { @@ -1005,50 +1001,8 @@ out: static void unbind_from_irq(unsigned int irq) { - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_handler_data(irq); - - if (WARN_ON(!info)) - return; - mutex_lock(&irq_mapping_update_lock); - - if (info->refcnt > 0) { - info->refcnt--; - if (info->refcnt != 0) - goto done; - } - - if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; - break; - case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - } - - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - - xen_free_irq(irq); - - done: + __unbind_from_irq(irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1148,9 +1102,26 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id) } EXPORT_SYMBOL_GPL(unbind_from_irqhandler); +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. + */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + int evtchn_make_refcounted(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); struct irq_info *info; if (irq == -1) @@ -1175,12 +1146,12 @@ int evtchn_get(unsigned int evtchn) struct irq_info *info; int err = -ENOENT; - if (evtchn >= NR_EVENT_CHANNELS) + if (evtchn >= xen_evtchn_max_channels()) return -EINVAL; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) goto done; @@ -1204,7 +1175,7 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); if (WARN_ON(irq == -1)) return; unbind_from_irq(irq); @@ -1228,222 +1199,21 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) notify_remote_via_irq(irq); } -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - struct vcpu_info *v; - - spin_lock_irqsave(&debug_lock, flags); - - printk("\nvcpu %d\n ", cpu); - - for_each_online_cpu(i) { - int pending; - v = per_cpu(xen_vcpu, i); - pending = (get_irq_regs() && i == cpu) - ? xen_irqs_disabled(get_irq_regs()) - : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, - pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); - } - v = per_cpu(xen_vcpu, cpu); - - printk("\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), - cpu_evtchn[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - xen_ulong_t pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] - & cpu_evtchn[i]; - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - pending, i % 8 == 0 ? "\n " : " "); - } - - printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, BM(sh->evtchn_pending))) { - int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", - cpu_from_evtchn(i), i, - evtchn_to_irq[i], - sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) - ? "" : " l2-clear", - !sync_test_bit(i, BM(sh->evtchn_mask)) - ? "" : " globally-masked", - sync_test_bit(i, BM(cpu_evtchn)) - ? "" : " locally-masked"); - } - } - - spin_unlock_irqrestore(&debug_lock, flags); - - return IRQ_HANDLED; -} - static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ static void __xen_evtchn_do_upcall(void) { - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; - int i, irq; - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = get_cpu(); unsigned count; do { - xen_ulong_t pending_words; - xen_ulong_t pending_bits; - struct irq_desc *desc; - vcpu_info->evtchn_upcall_pending = 0; if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out; - /* - * Master flag must be cleared /before/ clearing - * selector flag. xchg_xen_ulong must contain an - * appropriate barrier. - */ - if ((irq = per_cpu(virq_to_irq, cpu)[VIRQ_TIMER]) != -1) { - int evtchn = evtchn_from_irq(irq); - word_idx = evtchn / BITS_PER_LONG; - pending_bits = evtchn % BITS_PER_LONG; - if (active_evtchns(cpu, s, word_idx) & (1ULL << pending_bits)) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - } - - pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; - - for (i = 0; pending_words != 0; i++) { - xen_ulong_t words; - - words = MASK_LSBS(pending_words, word_idx); - - /* - * If we masked out all events, wrap to beginning. - */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = EVTCHN_FIRST_BIT(words); - - pending_bits = active_evtchns(cpu, s, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - /* - * We scan the starting word in two parts. - * - * 1st time: start in the middle, scanning the - * upper bits. - * - * 2nd time: scan the whole word (not just the - * parts skipped in the first pass) -- if an - * event in the previously scanned bits is - * pending again it would just be scanned on - * the next loop anyway. - */ - if (word_idx == start_word_idx) { - if (i == 0) - bit_idx = start_bit_idx; - } - - do { - xen_ulong_t bits; - int port; - - bits = MASK_LSBS(pending_bits, bit_idx); - - /* If we masked out all events, move on. */ - if (bits == 0) - break; - - bit_idx = EVTCHN_FIRST_BIT(bits); - - /* Process port. */ - port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = evtchn_to_irq[port]; - - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - - bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; - - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? word_idx : - (word_idx+1) % BITS_PER_EVTCHN_WORD); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); - - /* Scan start_l1i twice; all others once. */ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); - - word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; - } + xen_evtchn_handle_events(cpu); BUG_ON(!irqs_disabled()); @@ -1463,6 +1233,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) irq_enter(); #ifdef CONFIG_X86 exit_idle(); + inc_irq_stat(irq_hv_callback_count); #endif __xen_evtchn_do_upcall(); @@ -1492,12 +1263,12 @@ void rebind_evtchn_irq(int evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(evtchn_to_irq[evtchn] != -1); + BUG_ON(get_evtchn_to_irq(evtchn) != -1); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - xen_irq_info_evtchn_init(irq, evtchn); + (void)xen_irq_info_evtchn_setup(irq, evtchn); mutex_unlock(&irq_mapping_update_lock); @@ -1511,7 +1282,6 @@ void rebind_evtchn_irq(int evtchn, int irq) /* Rebind an evtchn so that it gets delivered to a specific cpu */ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { - struct shared_info *s = HYPERVISOR_shared_info; struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); int masked; @@ -1534,7 +1304,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) * Mask the event while changing the VCPU binding to prevent * it being delivered on an unexpected VCPU. */ - masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); + masked = test_and_set_mask(evtchn); /* * If this fails, it usually just indicates that we're dealing with a @@ -1553,27 +1323,11 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, bool force) { - unsigned tcpu = cpumask_first(dest); + unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); return rebind_irq_to_cpu(data->irq, tcpu); } -int resend_irq_on_evtchn(unsigned int irq) -{ - int masked, evtchn = evtchn_from_irq(irq); - struct shared_info *s = HYPERVISOR_shared_info; - - if (!VALID_EVTCHN(evtchn)) - return 1; - - masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); - sync_set_bit(evtchn, BM(s->evtchn_pending)); - if (!masked) - unmask_evtchn(evtchn); - - return 1; -} - static void enable_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -1608,21 +1362,18 @@ static void mask_ack_dynirq(struct irq_data *data) static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(data->irq); - struct shared_info *sh = HYPERVISOR_shared_info; - int ret = 0; + unsigned int evtchn = evtchn_from_irq(data->irq); + int masked; - if (VALID_EVTCHN(evtchn)) { - int masked; + if (!VALID_EVTCHN(evtchn)) + return 0; - masked = sync_test_and_set_bit(evtchn, BM(sh->evtchn_mask)); - sync_set_bit(evtchn, BM(sh->evtchn_pending)); - if (!masked) - unmask_evtchn(evtchn); - ret = 1; - } + masked = test_and_set_mask(evtchn); + set_evtchn(evtchn); + if (!masked) + unmask_evtchn(evtchn); - return ret; + return 1; } static void restore_pirqs(void) @@ -1683,7 +1434,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1707,7 +1458,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); + (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1784,21 +1535,18 @@ EXPORT_SYMBOL_GPL(xen_test_irq_shared); void xen_irq_resume(void) { - unsigned int cpu, evtchn; + unsigned int cpu; struct irq_info *info; - init_evtchn_cpu_bindings(); - /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); + xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ list_for_each_entry(info, &xen_irq_list_head, list) info->evtchn = 0; /* zap event-channel binding */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - evtchn_to_irq[evtchn] = -1; + clear_evtchn_to_irq_all(); for_each_possible_cpu(cpu) { restore_cpu_virqs(cpu); @@ -1889,27 +1637,40 @@ void xen_callback_vector(void) void xen_callback_vector(void) {} #endif +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + void __init xen_init_IRQ(void) { - int i; + int ret = -EINVAL; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), - GFP_KERNEL); - BUG_ON(!evtchn_to_irq); - for (i = 0; i < NR_EVENT_CHANNELS; i++) - evtchn_to_irq[i] = -1; + if (fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) + xen_evtchn_2l_init(); - init_evtchn_cpu_bindings(); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; #ifdef CONFIG_X86 - if (xen_hvm_domain()) { + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_callback_vector(); + + if (xen_hvm_domain()) { native_init_IRQ(); /* pci_xen_hvm_init must be called after native_init_IRQ so that * __acpi_register_gsi can point at the right function */ @@ -1918,13 +1679,10 @@ void __init xen_init_IRQ(void) int rc; struct physdev_pirq_eoi_gmfn eoi_gmfn; - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - pci_xen_initial_domain(); - pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ if (rc != 0) { free_page((unsigned long) pirq_eoi_map); pirq_eoi_map = NULL; diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 00000000000..84b4bfb8434 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,443 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned on non-x86 + * platforms. + */ +#if !defined(CONFIG_X86) && BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ + (((unsigned long)w & 0x4UL) ? (EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(unsigned port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ + unsigned port = info->evtchn; + unsigned new_array_pages; + int ret; + + new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) { + ret = -ENOMEM; + goto error; + } + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. */ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_mfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED, spinning if BUSY is set. + */ +static void clear_masked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + w = sync_cmpxchg(word, old, new); + } while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + clear_masked(word); + if (evtchn_fifo_is_pending(port)) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w; + new = (w & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while ((w = sync_cmpxchg(word, old, new)) != old); + + return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ + int irq; + + irq = get_evtchn_to_irq(port); + if (irq != -1) + generic_handle_irq(irq); +} + +static void consume_one_event(unsigned cpu, + struct evtchn_fifo_control_block *control_block, + unsigned priority, unsigned long *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + unsigned port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. + */ + if (head == 0) + clear_bit(priority, ready); + + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) + handle_irq_for_port(port); + + q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ + struct evtchn_fifo_control_block *control_block; + unsigned long ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + struct evtchn_init_control init_control; + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + init_control.control_gfn = virt_to_mfn(control_block); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, + &init_control); + if (ret < 0) + BUG(); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. + */ + event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, +}; + +static int evtchn_fifo_init_control_block(unsigned cpu) +{ + struct page *control_block = NULL; + struct evtchn_init_control init_control; + int ret = -ENOMEM; + + control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (control_block == NULL) + goto error; + + init_control.control_gfn = virt_to_mfn(page_address(control_block)); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = page_address(control_block); + + return 0; + + error: + __free_page(control_block); + return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_control_block, cpu)) + ret = evtchn_fifo_init_control_block(cpu); + break; + default: + break; + } + return ret < 0 ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { + .notifier_call = evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = get_cpu(); + int ret; + + ret = evtchn_fifo_init_control_block(cpu); + if (ret < 0) + goto out; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: + put_cpu(); + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 00000000000..50c2050a1e3 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,151 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later. See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. + * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + int refcnt; + enum xen_irq_type type; /* type */ + unsigned irq; + unsigned int evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) + +struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + + int (*setup)(struct irq_info *info); + void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + + void (*clear_pending)(unsigned port); + void (*set_pending)(unsigned port); + bool (*is_pending)(unsigned port); + bool (*test_and_set_mask)(unsigned port); + void (*mask)(unsigned port); + void (*unmask)(unsigned port); + + void (*handle_events)(unsigned cpu); + void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. + */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(info); + return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, + unsigned cpu) +{ + evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ + evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ + return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ + return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ + return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 8b3a69a06c3..00f40f051d9 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -305,7 +305,7 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) if (rc < 0) goto err; - rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, u->name, evtchn); if (rc < 0) goto err; @@ -417,7 +417,7 @@ static long evtchn_ioctl(struct file *file, break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_evtchn_nr_channels()) break; rc = -ENOTCONN; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index e41c79c986e..073b4a19a8b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -846,7 +846,7 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); err = misc_register(&gntdev_miscdev); if (err != 0) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index c4d2298893b..eeba7544f0c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -49,6 +49,7 @@ #include <xen/grant_table.h> #include <xen/interface/memory.h> #include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h> #include <asm/xen/hypercall.h> #include <asm/xen/interface.h> @@ -61,12 +62,10 @@ static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; static union { struct grant_entry_v1 *v1; @@ -826,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void) unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. */ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); if (xen_max > boot_max_nr_grant_frames) return boot_max_nr_grant_frames; @@ -833,6 +837,51 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", + &addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + /* Handling of paged out grant targets (GNTST_eagain) */ #define MAX_DELAY 256 static inline void @@ -884,9 +933,6 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct page **pages, unsigned int count) { int i, ret; - bool lazy = false; - pte_t *pte; - unsigned long mfn; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) @@ -898,36 +944,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, &map_ops[i].status, __func__); - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - /* Do not add to override if the map failed. */ - if (map_ops[i].status) - continue; - - if (map_ops[i].flags & GNTMAP_contains_pte) { - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - } else { - mfn = PFN_DOWN(map_ops[i].dev_bus_addr); - } - ret = m2p_add_override(mfn, pages[i], kmap_ops ? - &kmap_ops[i] : NULL); - if (ret) - return ret; - } - - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_map_refs); @@ -935,32 +952,13 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { - int i, ret; - bool lazy = false; + int ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return ret; - - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i], kmap_ops ? - &kmap_ops[i] : NULL); - if (ret) - return ret; - } - - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); @@ -1043,10 +1041,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. @@ -1055,7 +1054,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { pr_warn("grant table add_to_physmap failed, err=%d\n", @@ -1118,10 +1117,8 @@ static void gnttab_request_version(void) int rc; struct gnttab_set_version gsv; - if (xen_hvm_domain()) - gsv.version = 1; - else - gsv.version = 2; + gsv.version = 1; + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); if (rc == 0 && gsv.version == 2) { grant_table_version = 2; @@ -1152,21 +1149,15 @@ static int gnttab_setup(void) if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_pv_domain()) - return gnttab_map(0, nr_grant_frames - 1); - - if (gnttab_shared.addr == NULL) { - gnttab_shared.addr = xen_remap(xen_hvm_resume_frames, - PAGE_SIZE * max_nr_gframes); + if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - pr_warn("Failed to ioremap gnttab share frames!\n"); + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); return -ENOMEM; } } - - gnttab_map(0, nr_grant_frames - 1); - - return 0; + return gnttab_map(0, nr_grant_frames - 1); } int gnttab_resume(void) @@ -1177,7 +1168,8 @@ int gnttab_resume(void) int gnttab_suspend(void) { - gnttab_interface->unmap_frames(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + gnttab_interface->unmap_frames(); return 0; } @@ -1203,19 +1195,20 @@ static int gnttab_expand(unsigned int req_entries) int gnttab_init(void) { int i; + unsigned long max_nr_grant_frames; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; int ret; gnttab_request_version(); + max_nr_grant_frames = gnttab_max_grant_frames(); nr_grant_frames = 1; - boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. */ BUG_ON(grefs_per_grant_frame == 0); - max_nr_glist_frames = (boot_max_nr_grant_frames * + max_nr_glist_frames = (max_nr_grant_frames * grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), @@ -1232,6 +1225,11 @@ int gnttab_init(void) } } + ret = arch_gnttab_init(max_nr_grant_frames, + nr_status_frames(max_nr_grant_frames)); + if (ret < 0) + goto ini_nomem; + if (gnttab_setup() < 0) { ret = -ENODEV; goto ini_nomem; @@ -1268,5 +1266,6 @@ static int __gnttab_init(void) return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 624e8dc2453..5f1e1f3cd18 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -41,32 +41,23 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; struct suspend_info { int cancelled; - unsigned long arg; /* extra hypercall argument */ - void (*pre)(void); - void (*post)(int cancelled); }; -#ifdef CONFIG_HIBERNATE_CALLBACKS -static void xen_hvm_post_suspend(int cancelled) -{ - xen_arch_hvm_post_suspend(cancelled); - gnttab_resume(); -} +static RAW_NOTIFIER_HEAD(xen_resume_notifier); -static void xen_pre_suspend(void) +void xen_resume_notifier_register(struct notifier_block *nb) { - xen_mm_pin_all(); - gnttab_suspend(); - xen_arch_pre_suspend(); + raw_notifier_chain_register(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); -static void xen_post_suspend(int cancelled) +void xen_resume_notifier_unregister(struct notifier_block *nb) { - xen_arch_post_suspend(cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + raw_notifier_chain_unregister(&xen_resume_notifier, nb); } +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); +#ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) { struct suspend_info *si = data; @@ -80,22 +71,23 @@ static int xen_suspend(void *data) return err; } - if (si->pre) - si->pre(); + gnttab_suspend(); + xen_arch_pre_suspend(); /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - si->cancelled = HYPERVISOR_suspend(si->arg); + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_mfn(xen_start_info) + : 0); - if (si->post) - si->post(si->cancelled); + xen_arch_post_suspend(si->cancelled); + gnttab_resume(); if (!si->cancelled) { xen_irq_resume(); - xen_console_resume(); xen_timer_resume(); } @@ -140,18 +132,14 @@ static void do_suspend(void) si.cancelled = 1; - if (xen_hvm_domain()) { - si.arg = 0UL; - si.pre = NULL; - si.post = &xen_hvm_post_suspend; - } else { - si.arg = virt_to_mfn(xen_start_info); - si.pre = &xen_pre_suspend; - si.post = &xen_post_suspend; - } - err = stop_machine(xen_suspend, &si, cpumask_of(0)); + /* Resume console as early as possible. */ + if (!si.cancelled) + xen_console_resume(); + + raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { @@ -182,10 +170,32 @@ struct shutdown_handler { void (*cb)(void); }; +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ + switch (code) { + case SYS_DOWN: + case SYS_HALT: + case SYS_POWER_OFF: + shutting_down = SHUTDOWN_POWEROFF; + default: + break; + } + return NOTIFY_DONE; +} static void do_poweroff(void) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); + switch (system_state) { + case SYSTEM_BOOTING: + orderly_poweroff(true); + break; + case SYSTEM_RUNNING: + orderly_poweroff(false); + break; + default: + /* Don't do it when we are halting/rebooting. */ + pr_info("Ignoring Xen toolstack shutdown.\n"); + break; + } } static void do_reboot(void) @@ -291,6 +301,10 @@ static struct xenbus_watch shutdown_watch = { .callback = shutdown_handler }; +static struct notifier_block xen_reboot_nb = { + .notifier_call = poweroff_nb, +}; + static int setup_shutdown_watcher(void) { int err; @@ -301,6 +315,7 @@ static int setup_shutdown_watcher(void) return err; } + #ifdef CONFIG_MAGIC_SYSRQ err = register_xenbus_watch(&sysrq_watch); if (err) { @@ -329,6 +344,7 @@ int xen_setup_shutdown_event(void) if (!xen_domain()) return -ENODEV; register_xenstore_notifier(&xenstore_notifier); + register_reboot_notifier(&xen_reboot_nb); return 0; } diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 18fff88254e..dd9c249ea31 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -26,6 +26,9 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> +#endif static bool __read_mostly pci_seg_supported = true; @@ -58,12 +61,12 @@ static int xen_add_device(struct device *dev) add.flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI - handle = DEVICE_ACPI_HANDLE(&pci_dev->dev); + handle = ACPI_HANDLE(&pci_dev->dev); if (!handle && pci_dev->bus->bridge) - handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge); + handle = ACPI_HANDLE(pci_dev->bus->bridge); #ifdef CONFIG_PCI_IOV if (!handle && pci_dev->is_virtfn) - handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge); + handle = ACPI_HANDLE(physfn->bus->bridge); #endif if (handle) { acpi_status status; @@ -192,3 +195,49 @@ static int __init register_xen_pci_notifier(void) } arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (!xen_initial_domain()) + return 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return 0; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + /* Check whether they are in the right area. */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + struct physdev_pci_mmcfg_reserved r; + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + switch (rc) { + case 0: + case -ENOSYS: + continue; + + default: + pr_warn("Failed to report MMCONFIG reservation" + " state for %s to hypervisor" + " (%d)\n", + cfg->name, rc); + } + } + return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. + */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index 79e1dff7ed4..0aac403d53f 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -40,6 +40,7 @@ #include <linux/capability.h> #include <xen/xen.h> +#include <xen/acpi.h> #include <xen/xenbus.h> #include <xen/events.h> #include <xen/interface/platform.h> diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 99db9e1eb8b..3454973dc3b 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len) { unsigned long addr; @@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) static int xen_allocate_irq(struct pci_dev *pdev) { return request_irq(pdev->irq, do_hvm_evtchn_intr, - IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + IRQF_NOBALANCING | IRQF_TRIGGER_RISING, "xen-platform-pci", pdev); } @@ -108,6 +108,7 @@ static int platform_pci_init(struct pci_dev *pdev, long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; if (!xen_domain()) return -ENODEV; @@ -154,13 +155,17 @@ static int platform_pci_init(struct pci_dev *pdev, } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); - ret = gnttab_init(); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_setup_auto_xlat_frames(grant_frames); if (ret) goto out; + ret = gnttab_init(); + if (ret) + goto grant_out; xenbus_probe(NULL); return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); out: pci_release_region(pdev, 0); mem_out: diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 8e74590fa1b..569a13b9e85 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -533,12 +533,17 @@ static void privcmd_close(struct vm_area_struct *vma) { struct page **pages = vma->vm_private_data; int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rc; if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) return; - xen_unmap_domain_mfn_range(vma, numpgs, pages); - free_xenballooned_pages(numpgs, pages); + rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); + if (rc == 0) + free_xenballooned_pages(numpgs, pages); + else + pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", + numpgs, rc); kfree(pages); } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 1b2277c311d..ebd8f218a78 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -42,12 +42,31 @@ #include <xen/page.h> #include <xen/xen-ops.h> #include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h> /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, + gfp_t gfp) +{ + unsigned long dma_mask = 0; + + dma_mask = dev->coherent_dma_mask; + if (!dma_mask) + dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + + return dma_mask; +} +#endif + static char *xen_io_tlb_start, *xen_io_tlb_end; static unsigned long xen_io_tlb_nslabs; /* @@ -56,17 +75,35 @@ static unsigned long xen_io_tlb_nslabs; static u64 start_dma_addr; -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +/* + * Both of these functions should avoid PFN_PHYS because phys_addr_t + * can be 32bit when dma_addr_t is 64bit leading to a loss in + * information if the shift is done before casting to 64bit. + */ +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - return phys_to_machine(XPADDR(paddr)).maddr; + unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr)); + dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT; + + dma |= paddr & ~PAGE_MASK; + + return dma; } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) { - return machine_to_phys(XMADDR(baddr)).paddr; + unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr)); + dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; + phys_addr_t paddr = dma; + + BUG_ON(paddr != dma); /* truncation has occurred, should never happen */ + + paddr |= baddr & ~PAGE_MASK; + + return paddr; } -static dma_addr_t xen_virt_to_bus(void *address) +static inline dma_addr_t xen_virt_to_bus(void *address) { return xen_phys_to_bus(virt_to_phys(address)); } @@ -89,7 +126,7 @@ static int check_pages_physically_contiguous(unsigned long pfn, return 1; } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long pfn = PFN_DOWN(p); unsigned int offset = p & ~PAGE_MASK; @@ -126,6 +163,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { int i, rc; int dma_bits; + dma_addr_t dma_handle; + phys_addr_t p = virt_to_phys(buf); dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; @@ -135,9 +174,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) do { rc = xen_create_contiguous_region( - (unsigned long)buf + (i << IO_TLB_SHIFT), + p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), - dma_bits); + dma_bits, &dma_handle); } while (rc && dma_bits++ < max_dma_bits); if (rc) return rc; @@ -263,7 +302,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, void *ret; int order = get_order(size); u64 dma_mask = DMA_BIT_MASK(32); - unsigned long vstart; phys_addr_t phys; dma_addr_t dev_addr; @@ -278,8 +316,12 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) return ret; - vstart = __get_free_pages(flags, order); - ret = (void *)vstart; + /* On ARM this function returns an ioremap'ped virtual address for + * which virt_to_phys doesn't return the corresponding physical + * address. In fact on ARM virt_to_phys only works for kernel direct + * mapped RAM memory. Also see comment below. + */ + ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); if (!ret) return ret; @@ -287,18 +329,21 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (hwdev && hwdev->coherent_dma_mask) dma_mask = dma_alloc_coherent_mask(hwdev, flags); - phys = virt_to_phys(ret); + /* At this point dma_handle is the physical address, next we are + * going to set it to the machine address. + * Do not use virt_to_phys(ret) because on ARM it doesn't correspond + * to *dma_handle. */ + phys = *dma_handle; dev_addr = xen_phys_to_bus(phys); if (((dev_addr + size - 1 <= dma_mask)) && !range_straddles_page_boundary(phys, size)) *dma_handle = dev_addr; else { - if (xen_create_contiguous_region(vstart, order, - fls64(dma_mask)) != 0) { - free_pages(vstart, order); + if (xen_create_contiguous_region(phys, order, + fls64(dma_mask), dma_handle) != 0) { + xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); return NULL; } - *dma_handle = virt_to_machine(ret).maddr; } memset(ret, 0, size); return ret; @@ -319,13 +364,15 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (hwdev && hwdev->coherent_dma_mask) dma_mask = hwdev->coherent_dma_mask; - phys = virt_to_phys(vaddr); + /* do not use virt_to_phys because on ARM it doesn't return you the + * physical address */ + phys = xen_bus_to_phys(dev_addr); if (((dev_addr + size - 1 > dma_mask)) || range_straddles_page_boundary(phys, size)) - xen_destroy_contiguous_region((unsigned long)vaddr, order); + xen_destroy_contiguous_region(phys, order); - free_pages((unsigned long)vaddr, order); + xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -352,16 +399,25 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * buffering it. */ if (dma_capable(dev, dev_addr, size) && - !range_straddles_page_boundary(phys, size) && !swiotlb_force) + !range_straddles_page_boundary(phys, size) && !swiotlb_force) { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(dev, page, offset, size, dir, attrs); return dev_addr; + } /* * Oh well, have to allocate and map a bounce buffer. */ + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; + xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, size, dir, attrs); dev_addr = xen_phys_to_bus(map); /* @@ -384,12 +440,15 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); * whatever the device wrote there. */ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { phys_addr_t paddr = xen_bus_to_phys(dev_addr); BUG_ON(dir == DMA_NONE); + xen_dma_unmap_page(hwdev, paddr, size, dir, attrs); + /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); @@ -412,7 +471,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - xen_unmap_single(hwdev, dev_addr, size, dir); + xen_unmap_single(hwdev, dev_addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); @@ -435,11 +494,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); + if (target == SYNC_FOR_CPU) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); + /* NOTE: We use dev_addr here, not paddr! */ - if (is_xen_swiotlb_buffer(dev_addr)) { + if (is_xen_swiotlb_buffer(dev_addr)) swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - return; - } + + if (target == SYNC_FOR_DEVICE) + xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); if (dir != DMA_FROM_DEVICE) return; @@ -502,16 +565,31 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, sg->length, dir); if (map == SWIOTLB_MAP_ERROR) { + dev_warn(hwdev, "swiotlb buffer is full\n"); /* Don't panic here, we expect map_sg users to do proper error handling. */ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); sg_dma_len(sgl) = 0; - return DMA_ERROR_CODE; + return 0; } + xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), + map & ~PAGE_MASK, + sg->length, + dir, + attrs); sg->dma_address = xen_phys_to_bus(map); - } else + } else { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), + paddr & ~PAGE_MASK, + sg->length, + dir, + attrs); sg->dma_address = dev_addr; + } sg_dma_len(sg) = sg->length; } return nelems; @@ -533,7 +611,7 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir); + xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); @@ -593,3 +671,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; } EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ + if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c index 8dae6c13063..3e62ee4b3b6 100644 --- a/drivers/xen/xen-acpi-cpuhotplug.c +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -24,10 +24,7 @@ #include <linux/cpu.h> #include <linux/acpi.h> #include <linux/uaccess.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> #include <acpi/processor.h> - #include <xen/acpi.h> #include <xen/interface/platform.h> #include <asm/xen/hypercall.h> @@ -269,7 +266,8 @@ static void acpi_processor_hotplug_notify(acpi_handle handle, if (!is_processor_present(handle)) break; - if (!acpi_bus_get_device(handle, &device)) + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) break; result = acpi_bus_scan(handle); @@ -277,8 +275,9 @@ static void acpi_processor_hotplug_notify(acpi_handle handle, pr_err(PREFIX "Unable to add the device\n"); break; } - result = acpi_bus_get_device(handle, &device); - if (result) { + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { pr_err(PREFIX "Missing device object\n"); break; } @@ -314,7 +313,7 @@ static void acpi_processor_hotplug_notify(acpi_handle handle, goto out; } - (void) acpi_evaluate_hotplug_ost(handle, event, ost_code, NULL); + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); out: acpi_scan_lock_release(); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c index 9083f1e474f..34e40b733f9 100644 --- a/drivers/xen/xen-acpi-memhotplug.c +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -22,7 +22,6 @@ #include <linux/init.h> #include <linux/types.h> #include <linux/acpi.h> -#include <acpi/acpi_drivers.h> #include <xen/acpi.h> #include <xen/interface/platform.h> #include <asm/xen/hypercall.h> @@ -169,7 +168,7 @@ static int acpi_memory_get_device(acpi_handle handle, acpi_scan_lock_acquire(); acpi_bus_get_device(handle, &device); - if (device) + if (acpi_device_enumerated(device)) goto end; /* @@ -182,8 +181,9 @@ static int acpi_memory_get_device(acpi_handle handle, result = -EINVAL; goto out; } - result = acpi_bus_get_device(handle, &device); - if (result) { + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { pr_warn(PREFIX "Missing device object\n"); result = -EINVAL; goto out; @@ -285,7 +285,7 @@ static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) return; } - (void) acpi_evaluate_hotplug_ost(handle, event, ost_code, NULL); + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); return; } diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c index 59708fdd068..f83b754505f 100644 --- a/drivers/xen/xen-acpi-pad.c +++ b/drivers/xen/xen-acpi-pad.c @@ -18,11 +18,10 @@ #include <linux/kernel.h> #include <linux/types.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> -#include <asm/xen/hypercall.h> +#include <linux/acpi.h> #include <xen/interface/version.h> #include <xen/xen-ops.h> +#include <asm/xen/hypercall.h> #define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" #define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" @@ -78,27 +77,14 @@ static int acpi_pad_pur(acpi_handle handle) return num; } -/* Notify firmware how many CPUs are idle */ -static void acpi_pad_ost(acpi_handle handle, int stat, - uint32_t idle_nums) -{ - union acpi_object params[3] = { - {.type = ACPI_TYPE_INTEGER,}, - {.type = ACPI_TYPE_INTEGER,}, - {.type = ACPI_TYPE_BUFFER,}, - }; - struct acpi_object_list arg_list = {3, params}; - - params[0].integer.value = ACPI_PROCESSOR_AGGREGATOR_NOTIFY; - params[1].integer.value = stat; - params[2].buffer.length = 4; - params[2].buffer.pointer = (void *)&idle_nums; - acpi_evaluate_object(handle, "_OST", &arg_list, NULL); -} - static void acpi_pad_handle_notify(acpi_handle handle) { int idle_nums; + struct acpi_buffer param = { + .length = 4, + .pointer = (void *)&idle_nums, + }; + mutex_lock(&xen_cpu_lock); idle_nums = acpi_pad_pur(handle); @@ -110,7 +96,8 @@ static void acpi_pad_handle_notify(acpi_handle handle) idle_nums = xen_acpi_pad_idle_cpus(idle_nums) ?: xen_acpi_pad_idle_cpus_num(); if (idle_nums >= 0) - acpi_pad_ost(handle, 0, idle_nums); + acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, + 0, ¶m); mutex_unlock(&xen_cpu_lock); } diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 13bc6c31c06..59fc190f1e9 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -27,12 +27,10 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> -#include <linux/syscore_ops.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> +#include <linux/acpi.h> #include <acpi/processor.h> - #include <xen/xen.h> +#include <xen/xen-ops.h> #include <xen/interface/platform.h> #include <asm/xen/hypercall.h> @@ -129,7 +127,7 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) pr_debug(" C%d: %s %d uS\n", cx->type, cx->desc, (u32)cx->latency); } - } else if (ret != -EINVAL) + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) /* EINVAL means the ACPI ID is incorrect - meaning the ACPI * table is referencing a non-existing CPU - which can happen * with broken ACPI tables. */ @@ -261,7 +259,7 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr) (u32) perf->states[i].power, (u32) perf->states[i].transition_latency); } - } else if (ret != -EINVAL) + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) /* EINVAL means the ACPI ID is incorrect - meaning the ACPI * table is referencing a non-existing CPU - which can happen * with broken ACPI tables. */ @@ -497,14 +495,15 @@ static int xen_upload_processor_pm_data(void) return rc; } -static void xen_acpi_processor_resume(void) +static int xen_acpi_processor_resume(struct notifier_block *nb, + unsigned long action, void *data) { bitmap_zero(acpi_ids_done, nr_acpi_bits); - xen_upload_processor_pm_data(); + return xen_upload_processor_pm_data(); } -static struct syscore_ops xap_syscore_ops = { - .resume = xen_acpi_processor_resume, +struct notifier_block xen_acpi_processor_resume_nb = { + .notifier_call = xen_acpi_processor_resume, }; static int __init xen_acpi_processor_init(void) @@ -557,7 +556,7 @@ static int __init xen_acpi_processor_init(void) if (rc) goto err_unregister; - register_syscore_ops(&xap_syscore_ops); + xen_resume_notifier_register(&xen_acpi_processor_resume_nb); return 0; err_unregister: @@ -576,7 +575,7 @@ static void __exit xen_acpi_processor_exit(void) { int i; - unregister_syscore_ops(&xap_syscore_ops); + xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); kfree(acpi_ids_done); kfree(acpi_id_present); kfree(acpi_id_cst_present); diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 62fcd485f0a..d57a173685f 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -242,6 +242,15 @@ struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, return found_dev; } +/* + * Called when: + * - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device + * - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove + * - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove + * - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + * As such we have to be careful. + */ void pcistub_put_pci_dev(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -272,16 +281,16 @@ void pcistub_put_pci_dev(struct pci_dev *dev) * and want to inhibit the user from fiddling with 'reset' */ pci_reset_function(dev); - pci_restore_state(psdev->dev); + pci_restore_state(dev); /* This disables the device. */ - xen_pcibk_reset_device(found_psdev->dev); + xen_pcibk_reset_device(dev); /* And cleanup up our emulated fields. */ - xen_pcibk_config_free_dyn_fields(found_psdev->dev); - xen_pcibk_config_reset_dev(found_psdev->dev); + xen_pcibk_config_reset_dev(dev); + xen_pcibk_config_free_dyn_fields(dev); - xen_unregister_device_domain_owner(found_psdev->dev); + xen_unregister_device_domain_owner(dev); spin_lock_irqsave(&found_psdev->lock, flags); found_psdev->pdev = NULL; @@ -493,6 +502,8 @@ static int pcistub_seize(struct pci_dev *dev) return err; } +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) { int err = 0; @@ -520,6 +531,8 @@ out: return err; } +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ static void pcistub_remove(struct pci_dev *dev) { struct pcistub_device *psdev, *found_psdev = NULL; @@ -551,6 +564,8 @@ static void pcistub_remove(struct pci_dev *dev) pr_warn("****** shutdown driver domain before binding device\n"); pr_warn("****** to other drivers or domains\n"); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ xen_pcibk_release_pci_dev(found_psdev->pdev, found_psdev->dev); } diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 64eb0cd8b8a..c4a0666de6f 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -213,12 +213,11 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, entries[i].vector = op->msix_entries[i].vector; } - result = pci_enable_msix(dev, entries, op->value); - + result = pci_enable_msix_exact(dev, entries, op->value); if (result == 0) { for (i = 0; i < op->value; i++) { op->msix_entries[i].entry = entries[i].entry; - if (entries[i].vector) + if (entries[i].vector) { op->msix_entries[i].vector = xen_pirq_from_irq(entries[i].vector); if (unlikely(verbose_request)) @@ -226,6 +225,7 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, "MSI-X[%d]: %d\n", pci_name(dev), i, op->msix_entries[i].vector); + } } } else pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", @@ -348,9 +348,9 @@ void xen_pcibk_do_op(struct work_struct *data) notify_remote_via_irq(pdev->evtchn_irq); /* Mark that we're done. */ - smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); - smp_mb__after_clear_bit(); /* /before/ final check for work */ + smp_mb__after_atomic(); /* /before/ final check for work */ /* Check to see if the driver domain tried to start another request in * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index 3165ce361b0..51afff96c51 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -137,6 +137,8 @@ unlock: /* Publish this device. */ if (!err) err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + else + kfree(dev_entry); out: return err; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index a9ed867afab..4a7e6e0a5f4 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -93,6 +93,8 @@ static void free_pdev(struct xen_pcibk_device *pdev) xen_pcibk_disconnect(pdev); + /* N.B. This calls pcistub_put_pci_dev which does the FLR on all + * of the PCIe devices. */ xen_pcibk_release_devices(pdev); dev_set_drvdata(&pdev->xdev->dev, NULL); @@ -286,6 +288,8 @@ static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); xen_unregister_device_domain_owner(dev); + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ xen_pcibk_release_pci_dev(pdev, dev); out: diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 21e18c18c7a..3b2bffde534 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -170,11 +170,13 @@ static void frontswap_selfshrink(void) tgt_frontswap_pages = cur_frontswap_pages - (cur_frontswap_pages / frontswap_hysteresis); frontswap_shrink(tgt_frontswap_pages); + frontswap_inertia_counter = frontswap_inertia; } #endif /* CONFIG_FRONTSWAP */ #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) +#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT)) /* * Use current balloon size, the goal (vm_committed_as), and hysteresis @@ -525,6 +527,7 @@ EXPORT_SYMBOL(register_xen_selfballooning); int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) { bool enable = false; + unsigned long reserve_pages; if (!xen_domain()) return -ENODEV; @@ -549,6 +552,26 @@ int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) if (!enable) return -ENODEV; + /* + * Give selfballoon_reserved_mb a default value(10% of total ram pages) + * to make selfballoon not so aggressive. + * + * There are mainly two reasons: + * 1) The original goal_page didn't consider some pages used by kernel + * space, like slab pages and memory used by device drivers. + * + * 2) The balloon driver may not give back memory to guest OS fast + * enough when the workload suddenly aquries a lot of physical memory. + * + * In both cases, the guest OS will suffer from memory pressure and + * OOM killer may be triggered. + * By reserving extra 10% of total ram pages, we can keep the system + * much more reliably and response faster in some cases. + */ + if (!selfballoon_reserved_mb) { + reserve_pages = totalram_pages / 10; + selfballoon_reserved_mb = PAGES2MB(reserve_pages); + } schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); return 0; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index ec097d6f964..439c9dca9ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -45,6 +45,7 @@ #include <xen/grant_table.h> #include <xen/xenbus.h> #include <xen/xen.h> +#include <xen/features.h> #include "xenbus_probe.h" @@ -400,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); /** - * Bind to an existing interdomain event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore. - */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ - struct evtchn_bind_interdomain bind_interdomain; - int err; - - bind_interdomain.remote_dom = dev->otherend_id; - bind_interdomain.remote_port = remote_port; - - err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, - &bind_interdomain); - if (err) - xenbus_dev_fatal(dev, err, - "binding to event channel %d from domain %d", - remote_port, dev->otherend_id); - else - *port = bind_interdomain.local_port; - - return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/** * Free an existing event channel. Returns 0 on success or -errno on error. */ int xenbus_free_evtchn(struct xenbus_device *dev, int port) @@ -743,7 +717,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { - if (xen_pv_domain()) + if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else ring_ops = &ring_ops_hvm; diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 38e92b770e9..3c0a74b3e9b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -384,12 +384,14 @@ static ssize_t nodename_show(struct device *dev, { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } +static DEVICE_ATTR_RO(nodename); static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(devtype); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -397,14 +399,24 @@ static ssize_t modalias_show(struct device *dev, return sprintf(buf, "%s:%s\n", dev->bus->name, to_xenbus_device(dev)->devicetype); } +static DEVICE_ATTR_RO(modalias); -struct device_attribute xenbus_dev_attrs[] = { - __ATTR_RO(nodename), - __ATTR_RO(devtype), - __ATTR_RO(modalias), - __ATTR_NULL +static struct attribute *xenbus_dev_attrs[] = { + &dev_attr_nodename.attr, + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + NULL, }; -EXPORT_SYMBOL_GPL(xenbus_dev_attrs); + +static const struct attribute_group xenbus_dev_group = { + .attrs = xenbus_dev_attrs, +}; + +const struct attribute_group *xenbus_dev_groups[] = { + &xenbus_dev_group, + NULL, +}; +EXPORT_SYMBOL_GPL(xenbus_dev_groups); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 146f857a36f..1085ec294a1 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -54,7 +54,7 @@ enum xenstore_init { XS_LOCAL, }; -extern struct device_attribute xenbus_dev_attrs[]; +extern const struct attribute_group *xenbus_dev_groups[]; extern int xenbus_match(struct device *_dev, struct device_driver *_drv); extern int xenbus_dev_probe(struct device *_dev); diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 998bbbab816..5125dce11a6 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -200,7 +200,7 @@ static struct xen_bus_type xenbus_backend = { .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, }, }; diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 34b20bfa4e8..cb385c10d2b 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -154,7 +154,7 @@ static struct xen_bus_type xenbus_frontend = { .probe = xenbus_frontend_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, - .dev_attrs = xenbus_dev_attrs, + .dev_groups = xenbus_dev_groups, .pm = &xenbus_pm_ops, }, @@ -496,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init); #ifndef MODULE static int __init boot_wait_for_devices(void) { - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_devices()) return -ENODEV; ready_to_wait_for_devices = 1; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index b6d5fff43d1..ba804f3d827 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -50,6 +50,7 @@ #include <xen/xenbus.h> #include <xen/xen.h> #include "xenbus_comms.h" +#include "xenbus_probe.h" struct xs_stored_msg { struct list_head list; @@ -139,6 +140,29 @@ static int get_error(const char *errorstring) return xsd_errors[i].errnum; } +static bool xenbus_ok(void) +{ + switch (xen_store_domain_type) { + case XS_LOCAL: + switch (system_state) { + case SYSTEM_POWER_OFF: + case SYSTEM_RESTART: + case SYSTEM_HALT: + return false; + default: + break; + } + return true; + case XS_PV: + case XS_HVM: + /* FIXME: Could check that the remote domain is alive, + * but it is normally initial domain. */ + return true; + default: + break; + } + return false; +} static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) { struct xs_stored_msg *msg; @@ -148,9 +172,20 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) while (list_empty(&xs_state.reply_list)) { spin_unlock(&xs_state.reply_lock); - /* XXX FIXME: Avoid synchronous wait for response here. */ - wait_event(xs_state.reply_waitq, - !list_empty(&xs_state.reply_list)); + if (xenbus_ok()) + /* XXX FIXME: Avoid synchronous wait for response here. */ + wait_event_timeout(xs_state.reply_waitq, + !list_empty(&xs_state.reply_list), + msecs_to_jiffies(500)); + else { + /* + * If we are in the process of being shut-down there is + * no point of trying to contact XenBus - it is either + * killed (xenstored application) or the other domain + * has been killed or is unreachable. + */ + return ERR_PTR(-EIO); + } spin_lock(&xs_state.reply_lock); } @@ -215,6 +250,9 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); + if (IS_ERR(ret)) + return ret; + if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index 4793fc59454..00000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard <hollisb@us.ibm.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/mm.h> -#include <linux/slab.h> -#include <asm/page.h> -#include <xen/xencomm.h> -#include <xen/interface/xen.h> -#include <asm/xen/xencomm.h> /* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, - void *buffer, unsigned long bytes) -{ - unsigned long recorded = 0; - int i = 0; - - while ((recorded < bytes) && (i < desc->nr_addrs)) { - unsigned long vaddr = (unsigned long)buffer + recorded; - unsigned long paddr; - int offset; - int chunksz; - - offset = vaddr % PAGE_SIZE; /* handle partial pages */ - chunksz = min(PAGE_SIZE - offset, bytes - recorded); - - paddr = xencomm_vtop(vaddr); - if (paddr == ~0UL) { - printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", - __func__, vaddr); - return -EINVAL; - } - - desc->address[i++] = paddr; - recorded += chunksz; - } - - if (recorded < bytes) { - printk(KERN_DEBUG - "%s: could only translate %ld of %ld bytes\n", - __func__, recorded, bytes); - return -ENOSPC; - } - - /* mark remaining addresses invalid (just for safety) */ - while (i < desc->nr_addrs) - desc->address[i++] = XENCOMM_INVALID; - - desc->magic = XENCOMM_MAGIC; - - return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, - void *buffer, unsigned long bytes) -{ - struct xencomm_desc *desc; - unsigned long buffer_ulong = (unsigned long)buffer; - unsigned long start = buffer_ulong & PAGE_MASK; - unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; - unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; - unsigned long size = sizeof(*desc) + - sizeof(desc->address[0]) * nr_addrs; - - /* - * slab allocator returns at least sizeof(void*) aligned pointer. - * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might - * cross page boundary. - */ - if (sizeof(*desc) > sizeof(void *)) { - unsigned long order = get_order(size); - desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, - order); - if (desc == NULL) - return NULL; - - desc->nr_addrs = - ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / - sizeof(*desc->address); - } else { - desc = kmalloc(size, gfp_mask); - if (desc == NULL) - return NULL; - - desc->nr_addrs = nr_addrs; - } - return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ - if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { - struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; - if (sizeof(*desc__) > sizeof(void *)) { - unsigned long size = sizeof(*desc__) + - sizeof(desc__->address[0]) * desc__->nr_addrs; - unsigned long order = get_order(size); - free_pages((unsigned long)__va(desc), order); - } else - kfree(__va(desc)); - } -} - -static int xencomm_create(void *buffer, unsigned long bytes, - struct xencomm_desc **ret, gfp_t gfp_mask) -{ - struct xencomm_desc *desc; - int rc; - - pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - - if (bytes == 0) { - /* don't create a descriptor; Xen recognizes NULL. */ - BUG_ON(buffer != NULL); - *ret = NULL; - return 0; - } - - BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - - desc = xencomm_alloc(gfp_mask, buffer, bytes); - if (!desc) { - printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); - return -ENOMEM; - } - - rc = xencomm_init(desc, buffer, bytes); - if (rc) { - printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); - xencomm_free((struct xencomm_handle *)__pa(desc)); - return rc; - } - - *ret = desc; - return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ - unsigned long paddr; - - BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - - paddr = (unsigned long)xencomm_pa(ptr); - BUG_ON(paddr & XENCOMM_INLINE_FLAG); - return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, - unsigned long bytes, struct xencomm_mini *xc_desc, - struct xencomm_desc **ret) -{ - int rc = 0; - struct xencomm_desc *desc; - BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - - desc = (void *)xc_desc; - - desc->nr_addrs = XENCOMM_MINI_ADDRS; - - rc = xencomm_init(desc, buffer, bytes); - if (!rc) - *ret = desc; - - return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ - int rc; - struct xencomm_desc *desc; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - - if (rc || desc == NULL) - return NULL; - - return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, - struct xencomm_mini *xc_desc) -{ - int rc; - struct xencomm_desc *desc = NULL; - - if (xencomm_is_phys_contiguous((unsigned long)ptr)) - return xencomm_create_inline(ptr); - - rc = xencomm_create_mini(ptr, bytes, xc_desc, - &desc); - - if (rc) - return NULL; - - return xencomm_pa(desc); -} |
