Diffstat (limited to 'drivers/xen')
34 files changed, 1833 insertions, 1095 deletions
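The bulk of the churn in this series comes from splitting drivers/xen/events.c into an ABI-neutral core (events/events_base.c) plus two back-ends, events_2l.c and events_fifo.c, selected at boot through a table of function pointers; xen_init_IRQ() tries the FIFO back-end first and falls back to the 2-level one. Below is a minimal, stand-alone user-space sketch of that indirection. The names struct evtchn_ops, evtchn_ops_2l and xen_evtchn_2l_init come from the patch; the hook bodies, the channel count and main() are illustrative stubs, not the kernel implementation.

    #include <stdio.h>

    /*
     * Sketch only: the real struct (events_internal.h) carries more hooks
     * (setup, nr_channels, bind_to_cpu, clear_pending, set_pending,
     * is_pending, test_and_set_mask, mask, ...); three are enough to show
     * the dispatch.
     */
    struct evtchn_ops {
            unsigned (*max_channels)(void);
            void (*unmask)(unsigned port);
            void (*handle_events)(unsigned cpu);
    };

    static const struct evtchn_ops *evtchn_ops;

    /* Stand-ins for the 2-level back-end in events_2l.c. */
    static unsigned evtchn_2l_max_channels(void) { return 4096; /* e.g. 64-bit 2l limit */ }
    static void evtchn_2l_unmask(unsigned port) { printf("unmask port %u (2l)\n", port); }
    static void evtchn_2l_handle_events(unsigned cpu) { printf("scan 2l bitmap, cpu %u\n", cpu); }

    static const struct evtchn_ops evtchn_ops_2l = {
            .max_channels  = evtchn_2l_max_channels,
            .unmask        = evtchn_2l_unmask,
            .handle_events = evtchn_2l_handle_events,
    };

    static void xen_evtchn_2l_init(void) { evtchn_ops = &evtchn_ops_2l; }

    /* events_base.c then calls through the pointer, e.g.: */
    static void unmask_evtchn(unsigned port) { evtchn_ops->unmask(port); }

    int main(void)
    {
            xen_evtchn_2l_init();   /* fallback path when the FIFO ABI is unavailable */
            printf("max channels: %u\n", evtchn_ops->max_channels());
            unmask_evtchn(3);
            evtchn_ops->handle_events(0);
            return 0;
    }

The payoff of this indirection is that the upcall path, IRQ binding and resume code in events_base.c stay unchanged when the FIFO ABI is used; only the per-port bit manipulation differs per back-end.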
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 23eae5cb69c..38fb36e1c59 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -3,7 +3,6 @@ menu "Xen driver support"  config XEN_BALLOON  	bool "Xen memory balloon driver" -	depends on !ARM  	default y  	help  	  The balloon driver allows the Xen domain to request more memory from @@ -140,7 +139,6 @@ config XEN_GRANT_DEV_ALLOC  config SWIOTLB_XEN  	def_bool y -	depends on PCI && X86  	select SWIOTLB  config XEN_TMEM @@ -223,7 +221,7 @@ config XEN_ACPI_PROCESSOR  	  To do that the driver parses the Power Management data and uploads  	  said information to the Xen hypervisor. Then the Xen hypervisor can -	  select the proper Cx and Pxx states. It also registers itslef as the +	  select the proper Cx and Pxx states. It also registers itself as the  	  SMM so that other drivers (such as ACPI cpufreq scaling driver) will  	  not load. diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 14fe79d8634..45e00afa7f2 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -2,7 +2,8 @@ ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),)  obj-$(CONFIG_HOTPLUG_CPU)		+= cpu_hotplug.o  endif  obj-$(CONFIG_X86)			+= fallback.o -obj-y	+= grant-table.o features.o events.o balloon.o manage.o +obj-y	+= grant-table.o features.o balloon.o manage.o +obj-y	+= events/  obj-y	+= xenbus/  nostackp := $(call cc-option, -fno-stack-protector) @@ -15,7 +16,6 @@ xen-pad-$(CONFIG_X86) += xen-acpi-pad.o  dom0-$(CONFIG_X86) += pcpu.o  obj-$(CONFIG_XEN_DOM0)			+= $(dom0-y)  obj-$(CONFIG_BLOCK)			+= biomerge.o -obj-$(CONFIG_XEN_XENCOMM)		+= xencomm.o  obj-$(CONFIG_XEN_BALLOON)		+= xen-balloon.o  obj-$(CONFIG_XEN_SELFBALLOONING)	+= xen-selfballoon.o  obj-$(CONFIG_XEN_DEV_EVTCHN)		+= xen-evtchn.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index a50c6e3a7cc..5c660c77f03 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -157,13 +157,6 @@ static struct page *balloon_retrieve(bool prefer_highmem)  	return page;  } -static struct page *balloon_first_page(void) -{ -	if (list_empty(&ballooned_pages)) -		return NULL; -	return list_entry(ballooned_pages.next, struct page, lru); -} -  static struct page *balloon_next_page(struct page *page)  {  	struct list_head *next = page->lru.next; @@ -328,7 +321,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages)  	if (nr_pages > ARRAY_SIZE(frame_list))  		nr_pages = ARRAY_SIZE(frame_list); -	page = balloon_first_page(); +	page = list_first_entry_or_null(&ballooned_pages, struct page, lru);  	for (i = 0; i < nr_pages; i++) {  		if (!page) {  			nr_pages = i; @@ -350,17 +343,19 @@ static enum bp_state increase_reservation(unsigned long nr_pages)  		pfn = page_to_pfn(page); -		set_phys_to_machine(pfn, frame_list[i]); -  #ifdef CONFIG_XEN_HAVE_PVMMU -		/* Link back into the page tables if not highmem. */ -		if (xen_pv_domain() && !PageHighMem(page)) { -			int ret; -			ret = HYPERVISOR_update_va_mapping( -				(unsigned long)__va(pfn << PAGE_SHIFT), -				mfn_pte(frame_list[i], PAGE_KERNEL), -				0); -			BUG_ON(ret); +		if (!xen_feature(XENFEAT_auto_translated_physmap)) { +			set_phys_to_machine(pfn, frame_list[i]); + +			/* Link back into the page tables if not highmem. 
*/ +			if (!PageHighMem(page)) { +				int ret; +				ret = HYPERVISOR_update_va_mapping( +						(unsigned long)__va(pfn << PAGE_SHIFT), +						mfn_pte(frame_list[i], PAGE_KERNEL), +						0); +				BUG_ON(ret); +			}  		}  #endif @@ -378,7 +373,6 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)  	enum bp_state state = BP_DONE;  	unsigned long  pfn, i;  	struct page   *page; -	struct page   *scratch_page;  	int ret;  	struct xen_memory_reservation reservation = {  		.address_bits = 0, @@ -398,8 +392,6 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)  	if (nr_pages > ARRAY_SIZE(frame_list))  		nr_pages = ARRAY_SIZE(frame_list); -	scratch_page = get_balloon_scratch_page(); -  	for (i = 0; i < nr_pages; i++) {  		page = alloc_page(gfp);  		if (page == NULL) { @@ -407,39 +399,52 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)  			state = BP_EAGAIN;  			break;  		} - -		pfn = page_to_pfn(page); -		frame_list[i] = pfn_to_mfn(pfn); -  		scrub_page(page); -#ifdef CONFIG_XEN_HAVE_PVMMU -		if (xen_pv_domain() && !PageHighMem(page)) { -			ret = HYPERVISOR_update_va_mapping( -				(unsigned long)__va(pfn << PAGE_SHIFT), -				pfn_pte(page_to_pfn(scratch_page), -					PAGE_KERNEL_RO), 0); -			BUG_ON(ret); -		} -#endif +		frame_list[i] = page_to_pfn(page);  	} -	/* Ensure that ballooned highmem pages don't have kmaps. */ +	/* +	 * Ensure that ballooned highmem pages don't have kmaps. +	 * +	 * Do this before changing the p2m as kmap_flush_unused() +	 * reads PTEs to obtain pages (and hence needs the original +	 * p2m entry). +	 */  	kmap_flush_unused(); -	flush_tlb_all(); -	/* No more mappings: invalidate P2M and add to balloon. */ +	/* Update direct mapping, invalidate P2M, and add to balloon. */  	for (i = 0; i < nr_pages; i++) { -		pfn = mfn_to_pfn(frame_list[i]); +		pfn = frame_list[i]; +		frame_list[i] = pfn_to_mfn(pfn); +		page = pfn_to_page(pfn); + +#ifdef CONFIG_XEN_HAVE_PVMMU +		/* +		 * Ballooned out frames are effectively replaced with +		 * a scratch frame.  Ensure direct mappings and the +		 * p2m are consistent. 
+		 */  		if (!xen_feature(XENFEAT_auto_translated_physmap)) { -			unsigned long p; -			p = page_to_pfn(scratch_page); -			__set_phys_to_machine(pfn, pfn_to_mfn(p)); +			if (!PageHighMem(page)) { +				struct page *scratch_page = get_balloon_scratch_page(); + +				ret = HYPERVISOR_update_va_mapping( +						(unsigned long)__va(pfn << PAGE_SHIFT), +						pfn_pte(page_to_pfn(scratch_page), +							PAGE_KERNEL_RO), 0); +				BUG_ON(ret); + +				put_balloon_scratch_page(); +			} +			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);  		} -		balloon_append(pfn_to_page(pfn)); +#endif + +		balloon_append(page);  	} -	put_balloon_scratch_page(); +	flush_tlb_all();  	set_xen_guest_handle(reservation.extent_start, frame_list);  	reservation.nr_extents   = nr_pages; @@ -597,19 +602,29 @@ static void __init balloon_add_region(unsigned long start_pfn,  	}  } -static int __cpuinit balloon_cpu_notify(struct notifier_block *self, +static int alloc_balloon_scratch_page(int cpu) +{ +	if (per_cpu(balloon_scratch_page, cpu) != NULL) +		return 0; + +	per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); +	if (per_cpu(balloon_scratch_page, cpu) == NULL) { +		pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); +		return -ENOMEM; +	} + +	return 0; +} + + +static int balloon_cpu_notify(struct notifier_block *self,  				    unsigned long action, void *hcpu)  {  	int cpu = (long)hcpu;  	switch (action) {  	case CPU_UP_PREPARE: -		if (per_cpu(balloon_scratch_page, cpu) != NULL) -			break; -		per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); -		if (per_cpu(balloon_scratch_page, cpu) == NULL) { -			pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); +		if (alloc_balloon_scratch_page(cpu))  			return NOTIFY_BAD; -		}  		break;  	default:  		break; @@ -617,7 +632,7 @@ static int __cpuinit balloon_cpu_notify(struct notifier_block *self,  	return NOTIFY_OK;  } -static struct notifier_block balloon_cpu_notifier __cpuinitdata = { +static struct notifier_block balloon_cpu_notifier = {  	.notifier_call	= balloon_cpu_notify,  }; @@ -628,21 +643,25 @@ static int __init balloon_init(void)  	if (!xen_domain())  		return -ENODEV; -	for_each_online_cpu(cpu) -	{ -		per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); -		if (per_cpu(balloon_scratch_page, cpu) == NULL) { -			pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); -			return -ENOMEM; +	if (!xen_feature(XENFEAT_auto_translated_physmap)) { +		register_cpu_notifier(&balloon_cpu_notifier); + +		get_online_cpus(); +		for_each_online_cpu(cpu) { +			if (alloc_balloon_scratch_page(cpu)) { +				put_online_cpus(); +				unregister_cpu_notifier(&balloon_cpu_notifier); +				return -ENOMEM; +			}  		} +		put_online_cpus();  	} -	register_cpu_notifier(&balloon_cpu_notifier);  	pr_info("Initialising balloon driver\n");  	balloon_stats.current_pages = xen_pv_domain()  		? 
min(xen_start_info->nr_pages - xen_released_pages, max_pfn) -		: max_pfn; +		: get_num_physpages();  	balloon_stats.target_pages  = balloon_stats.current_pages;  	balloon_stats.balloon_low   = 0;  	balloon_stats.balloon_high  = 0; diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c index f3ccc80a455..8145a59fd9f 100644 --- a/drivers/xen/dbgp.c +++ b/drivers/xen/dbgp.c @@ -19,7 +19,7 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op)  	dbgp.op = op;  #ifdef CONFIG_PCI -	if (ctrlr->bus == &pci_bus_type) { +	if (dev_is_pci(ctrlr)) {  		const struct pci_dev *pdev = to_pci_dev(ctrlr);  		dbgp.u.pci.seg = pci_domain_nr(pdev->bus); diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 00000000000..62be55cd981 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 00000000000..5db43fc100a --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,365 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). 
+ */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], +		      cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ +	return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ +	clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); +	set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ +	struct shared_info *s = HYPERVISOR_shared_info; +	unsigned int cpu = get_cpu(); +	int do_hypercall = 0, evtchn_pending = 0; + +	BUG_ON(!irqs_disabled()); + +	if (unlikely((cpu != cpu_from_evtchn(port)))) +		do_hypercall = 1; +	else { +		/* +		 * Need to clear the mask before checking pending to +		 * avoid a race with an event becoming pending. +		 * +		 * EVTCHNOP_unmask will only trigger an upcall if the +		 * mask bit was set, so if a hypercall is needed +		 * remask the event. +		 */ +		sync_clear_bit(port, BM(&s->evtchn_mask[0])); +		evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + +		if (unlikely(evtchn_pending && xen_hvm_domain())) { +			sync_set_bit(port, BM(&s->evtchn_mask[0])); +			do_hypercall = 1; +		} +	} + +	/* Slow path (hypercall) if this is a non-local port or if this is +	 * an hvm domain and an event is pending (hvm domains don't have +	 * their own implementation of irq_enable). */ +	if (do_hypercall) { +		struct evtchn_unmask unmask = { .port = port }; +		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); +	} else { +		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + +		/* +		 * The following is basically the equivalent of +		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose +		 * the interrupt edge' if the channel is masked. +		 */ +		if (evtchn_pending && +		    !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, +					   BM(&vcpu_info->evtchn_pending_sel))) +			vcpu_info->evtchn_upcall_pending = 1; +	} + +	put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, +					 struct shared_info *sh, +					 unsigned int idx) +{ +	return sh->evtchn_pending[idx] & +		per_cpu(cpu_evtchn_mask, cpu)[idx] & +		~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks.  For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. 
+ * + * Xen uses a two-level bitmap to speed searching.  The first level is + * a bitset of words which contain pending event bits.  The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu) +{ +	int irq; +	xen_ulong_t pending_words; +	xen_ulong_t pending_bits; +	int start_word_idx, start_bit_idx; +	int word_idx, bit_idx; +	int i; +	struct shared_info *s = HYPERVISOR_shared_info; +	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + +	/* Timer interrupt has highest priority. */ +	irq = irq_from_virq(cpu, VIRQ_TIMER); +	if (irq != -1) { +		unsigned int evtchn = evtchn_from_irq(irq); +		word_idx = evtchn / BITS_PER_LONG; +		bit_idx = evtchn % BITS_PER_LONG; +		if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) +			generic_handle_irq(irq); +	} + +	/* +	 * Master flag must be cleared /before/ clearing +	 * selector flag. xchg_xen_ulong must contain an +	 * appropriate barrier. +	 */ +	pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + +	start_word_idx = __this_cpu_read(current_word_idx); +	start_bit_idx = __this_cpu_read(current_bit_idx); + +	word_idx = start_word_idx; + +	for (i = 0; pending_words != 0; i++) { +		xen_ulong_t words; + +		words = MASK_LSBS(pending_words, word_idx); + +		/* +		 * If we masked out all events, wrap to beginning. +		 */ +		if (words == 0) { +			word_idx = 0; +			bit_idx = 0; +			continue; +		} +		word_idx = EVTCHN_FIRST_BIT(words); + +		pending_bits = active_evtchns(cpu, s, word_idx); +		bit_idx = 0; /* usually scan entire word from start */ +		/* +		 * We scan the starting word in two parts. +		 * +		 * 1st time: start in the middle, scanning the +		 * upper bits. +		 * +		 * 2nd time: scan the whole word (not just the +		 * parts skipped in the first pass) -- if an +		 * event in the previously scanned bits is +		 * pending again it would just be scanned on +		 * the next loop anyway. +		 */ +		if (word_idx == start_word_idx) { +			if (i == 0) +				bit_idx = start_bit_idx; +		} + +		do { +			xen_ulong_t bits; +			int port; + +			bits = MASK_LSBS(pending_bits, bit_idx); + +			/* If we masked out all events, move on. */ +			if (bits == 0) +				break; + +			bit_idx = EVTCHN_FIRST_BIT(bits); + +			/* Process port. */ +			port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; +			irq = get_evtchn_to_irq(port); + +			if (irq != -1) +				generic_handle_irq(irq); + +			bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + +			/* Next caller starts at last processed + 1 */ +			__this_cpu_write(current_word_idx, +					 bit_idx ? word_idx : +					 (word_idx+1) % BITS_PER_EVTCHN_WORD); +			__this_cpu_write(current_bit_idx, bit_idx); +		} while (bit_idx != 0); + +		/* Scan start_l1i twice; all others once. */ +		if ((word_idx != start_word_idx) || (i != 0)) +			pending_words &= ~(1UL << word_idx); + +		word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; +	} +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ +	struct shared_info *sh = HYPERVISOR_shared_info; +	int cpu = smp_processor_id(); +	xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); +	int i; +	unsigned long flags; +	static DEFINE_SPINLOCK(debug_lock); +	struct vcpu_info *v; + +	spin_lock_irqsave(&debug_lock, flags); + +	printk("\nvcpu %d\n  ", cpu); + +	for_each_online_cpu(i) { +		int pending; +		v = per_cpu(xen_vcpu, i); +		pending = (get_irq_regs() && i == cpu) +			? 
xen_irqs_disabled(get_irq_regs()) +			: v->evtchn_upcall_mask; +		printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n  ", i, +		       pending, v->evtchn_upcall_pending, +		       (int)(sizeof(v->evtchn_pending_sel)*2), +		       v->evtchn_pending_sel); +	} +	v = per_cpu(xen_vcpu, cpu); + +	printk("\npending:\n   "); +	for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) +		printk("%0*"PRI_xen_ulong"%s", +		       (int)sizeof(sh->evtchn_pending[0])*2, +		       sh->evtchn_pending[i], +		       i % 8 == 0 ? "\n   " : " "); +	printk("\nglobal mask:\n   "); +	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) +		printk("%0*"PRI_xen_ulong"%s", +		       (int)(sizeof(sh->evtchn_mask[0])*2), +		       sh->evtchn_mask[i], +		       i % 8 == 0 ? "\n   " : " "); + +	printk("\nglobally unmasked:\n   "); +	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) +		printk("%0*"PRI_xen_ulong"%s", +		       (int)(sizeof(sh->evtchn_mask[0])*2), +		       sh->evtchn_pending[i] & ~sh->evtchn_mask[i], +		       i % 8 == 0 ? "\n   " : " "); + +	printk("\nlocal cpu%d mask:\n   ", cpu); +	for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) +		printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), +		       cpu_evtchn[i], +		       i % 8 == 0 ? "\n   " : " "); + +	printk("\nlocally unmasked:\n   "); +	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { +		xen_ulong_t pending = sh->evtchn_pending[i] +			& ~sh->evtchn_mask[i] +			& cpu_evtchn[i]; +		printk("%0*"PRI_xen_ulong"%s", +		       (int)(sizeof(sh->evtchn_mask[0])*2), +		       pending, i % 8 == 0 ? "\n   " : " "); +	} + +	printk("\npending list:\n"); +	for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { +		if (sync_test_bit(i, BM(sh->evtchn_pending))) { +			int word_idx = i / BITS_PER_EVTCHN_WORD; +			printk("  %d: event %d -> irq %d%s%s%s\n", +			       cpu_from_evtchn(i), i, +			       get_evtchn_to_irq(i), +			       sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) +			       ? "" : " l2-clear", +			       !sync_test_bit(i, BM(sh->evtchn_mask)) +			       ? "" : " globally-masked", +			       sync_test_bit(i, BM(cpu_evtchn)) +			       ? "" : " locally-masked"); +		} +	} + +	spin_unlock_irqrestore(&debug_lock, flags); + +	return IRQ_HANDLED; +} + +static const struct evtchn_ops evtchn_ops_2l = { +	.max_channels      = evtchn_2l_max_channels, +	.nr_channels       = evtchn_2l_max_channels, +	.bind_to_cpu       = evtchn_2l_bind_to_cpu, +	.clear_pending     = evtchn_2l_clear_pending, +	.set_pending       = evtchn_2l_set_pending, +	.is_pending        = evtchn_2l_is_pending, +	.test_and_set_mask = evtchn_2l_test_and_set_mask, +	.mask              = evtchn_2l_mask, +	.unmask            = evtchn_2l_unmask, +	.handle_events     = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ +	pr_info("Using 2-level ABI\n"); +	evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events.c b/drivers/xen/events/events_base.c index 4035e833ea2..c919d3d5c84 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events/events_base.c @@ -59,6 +59,10 @@  #include <xen/interface/vcpu.h>  #include <asm/hw_irq.h> +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; +  /*   * This lock protects updates to the following mapping and reference-count   * arrays. The lock does not need to be acquired to read the mapping tables. @@ -73,71 +77,15 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... 
NR_VIRQS-1] = -1};  /* IRQ <-> IPI mapping */  static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -/* Interrupt types. */ -enum xen_irq_type { -	IRQT_UNBOUND = 0, -	IRQT_PIRQ, -	IRQT_VIRQ, -	IRQT_IPI, -	IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - *    PIRQ - physical IRQ, GSI, flags, and owner domain - *    VIRQ - virq number - *    IPI - IPI vector - *    EVTCHN - - */ -struct irq_info { -	struct list_head list; -	int refcnt; -	enum xen_irq_type type;	/* type */ -	unsigned irq; -	unsigned short evtchn;	/* event channel */ -	unsigned short cpu;	/* cpu bound */ - -	union { -		unsigned short virq; -		enum ipi_vector ipi; -		struct { -			unsigned short pirq; -			unsigned short gsi; -			unsigned char flags; -			uint16_t domid; -		} pirq; -	} u; -}; -#define PIRQ_NEEDS_EOI	(1 << 0) -#define PIRQ_SHAREABLE	(1 << 1) - -static int *evtchn_to_irq; +int **evtchn_to_irq;  #ifdef CONFIG_X86  static unsigned long *pirq_eoi_map;  #endif  static bool (*pirq_needs_eoi)(unsigned irq); -/* - * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be - * careful to only use bitops which allow for this (e.g - * test_bit/find_first_bit and friends but not __ffs) and to pass - * BITS_PER_EVTCHN_WORD as the bitmask length. - */ -#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) -/* - * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t - * array. Primarily to avoid long lines (hence the terse name). - */ -#define BM(x) (unsigned long *)(x) -/* Find the first set bit in a evtchn mask */ -#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) - -static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], -		      cpu_evtchn_mask); +#define EVTCHN_ROW(e)  (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e)  (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq))  /* Xen will never allocate port zero for any purpose. 
*/  #define VALID_EVTCHN(chn)	((chn) != 0) @@ -148,19 +96,75 @@ static struct irq_chip xen_pirq_chip;  static void enable_dynirq(struct irq_data *data);  static void disable_dynirq(struct irq_data *data); +static void clear_evtchn_to_irq_row(unsigned row) +{ +	unsigned col; + +	for (col = 0; col < EVTCHN_PER_ROW; col++) +		evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ +	unsigned row; + +	for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { +		if (evtchn_to_irq[row] == NULL) +			continue; +		clear_evtchn_to_irq_row(row); +	} +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ +	unsigned row; +	unsigned col; + +	if (evtchn >= xen_evtchn_max_channels()) +		return -EINVAL; + +	row = EVTCHN_ROW(evtchn); +	col = EVTCHN_COL(evtchn); + +	if (evtchn_to_irq[row] == NULL) { +		/* Unallocated irq entries return -1 anyway */ +		if (irq == -1) +			return 0; + +		evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); +		if (evtchn_to_irq[row] == NULL) +			return -ENOMEM; + +		clear_evtchn_to_irq_row(row); +	} + +	evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; +	return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ +	if (evtchn >= xen_evtchn_max_channels()) +		return -1; +	if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) +		return -1; +	return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} +  /* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) +struct irq_info *info_for_irq(unsigned irq)  {  	return irq_get_handler_data(irq);  }  /* Constructors for packed IRQ information. */ -static void xen_irq_info_common_init(struct irq_info *info, +static int xen_irq_info_common_setup(struct irq_info *info,  				     unsigned irq,  				     enum xen_irq_type type, -				     unsigned short evtchn, +				     unsigned evtchn,  				     unsigned short cpu)  { +	int ret;  	BUG_ON(info->type != IRQT_UNBOUND && info->type != type); @@ -169,68 +173,78 @@ static void xen_irq_info_common_init(struct irq_info *info,  	info->evtchn = evtchn;  	info->cpu = cpu; -	evtchn_to_irq[evtchn] = irq; +	ret = set_evtchn_to_irq(evtchn, irq); +	if (ret < 0) +		return ret;  	irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + +	return xen_evtchn_port_setup(info);  } -static void xen_irq_info_evtchn_init(unsigned irq, -				     unsigned short evtchn) +static int xen_irq_info_evtchn_setup(unsigned irq, +				     unsigned evtchn)  {  	struct irq_info *info = info_for_irq(irq); -	xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); +	return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0);  } -static void xen_irq_info_ipi_init(unsigned cpu, +static int xen_irq_info_ipi_setup(unsigned cpu,  				  unsigned irq, -				  unsigned short evtchn, +				  unsigned evtchn,  				  enum ipi_vector ipi)  {  	struct irq_info *info = info_for_irq(irq); -	xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); -  	info->u.ipi = ipi;  	per_cpu(ipi_to_irq, cpu)[ipi] = irq; + +	return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0);  } -static void xen_irq_info_virq_init(unsigned cpu, +static int xen_irq_info_virq_setup(unsigned cpu,  				   unsigned irq, -				   unsigned short evtchn, -				   unsigned short virq) +				   unsigned evtchn, +				   unsigned virq)  {  	struct irq_info *info = info_for_irq(irq); -	xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); -  	info->u.virq = virq;  	per_cpu(virq_to_irq, cpu)[virq] = irq; + +	return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0);  } 
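The flat evtchn_to_irq[NR_EVENT_CHANNELS] array is replaced above by a two-level table so the much larger FIFO port space does not have to be allocated up front: EVTCHN_ROW()/EVTCHN_COL() split a port into a page index and an offset, and a row page is allocated (get_zeroed_page() and then filled with -1) only when a port in that row is first bound. The following compressed user-space sketch mirrors that lookup/update logic; malloc stands in for get_zeroed_page(), the static pointer array stands in for the kcalloc()'d one in xen_init_IRQ(), and the channel limit is illustrative.

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE       4096UL
    #define EVTCHN_PER_ROW  (PAGE_SIZE / sizeof(int))          /* 1024 irq slots per row page */
    #define EVTCHN_ROW(e)   ((e) / EVTCHN_PER_ROW)
    #define EVTCHN_COL(e)   ((e) % EVTCHN_PER_ROW)
    #define MAX_CHANNELS    (1U << 17)                         /* illustrative FIFO-sized limit */

    static int *evtchn_to_irq[MAX_CHANNELS / EVTCHN_PER_ROW];  /* kcalloc()'d in the kernel */

    static int set_evtchn_to_irq(unsigned evtchn, int irq)
    {
            unsigned row = EVTCHN_ROW(evtchn), col = EVTCHN_COL(evtchn), i;

            if (evtchn >= MAX_CHANNELS)
                    return -1;                                 /* -EINVAL in the patch */

            if (!evtchn_to_irq[row]) {
                    if (irq == -1)                             /* unallocated rows read as -1 anyway */
                            return 0;
                    evtchn_to_irq[row] = malloc(PAGE_SIZE);    /* get_zeroed_page() in the kernel */
                    if (!evtchn_to_irq[row])
                            return -1;                         /* -ENOMEM */
                    for (i = 0; i < EVTCHN_PER_ROW; i++)
                            evtchn_to_irq[row][i] = -1;        /* clear_evtchn_to_irq_row() */
            }
            evtchn_to_irq[row][col] = irq;
            return 0;
    }

    static int get_evtchn_to_irq(unsigned evtchn)
    {
            if (evtchn >= MAX_CHANNELS || !evtchn_to_irq[EVTCHN_ROW(evtchn)])
                    return -1;
            return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)];
    }

    int main(void)
    {
            printf("%d\n", get_evtchn_to_irq(5000));   /* -1: row never allocated */
            set_evtchn_to_irq(5000, 42);               /* lazily allocates row 5000/1024 = 4 */
            printf("%d\n", get_evtchn_to_irq(5000));   /* 42 */
            return 0;
    }

The cost is one extra pointer dereference per lookup; the gain is that a guest which only binds a handful of low-numbered ports commits only the rows it actually touches instead of a table sized for the full 2^17-port FIFO space.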
-static void xen_irq_info_pirq_init(unsigned irq, -				   unsigned short evtchn, -				   unsigned short pirq, -				   unsigned short gsi, +static int xen_irq_info_pirq_setup(unsigned irq, +				   unsigned evtchn, +				   unsigned pirq, +				   unsigned gsi,  				   uint16_t domid,  				   unsigned char flags)  {  	struct irq_info *info = info_for_irq(irq); -	xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); -  	info->u.pirq.pirq = pirq;  	info->u.pirq.gsi = gsi;  	info->u.pirq.domid = domid;  	info->u.pirq.flags = flags; + +	return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ +	set_evtchn_to_irq(info->evtchn, -1); +	info->evtchn = 0;  }  /*   * Accessors for packed IRQ information.   */ -static unsigned int evtchn_from_irq(unsigned irq) +unsigned int evtchn_from_irq(unsigned irq)  {  	if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq)))  		return 0; @@ -240,10 +254,15 @@ static unsigned int evtchn_from_irq(unsigned irq)  unsigned irq_from_evtchn(unsigned int evtchn)  { -	return evtchn_to_irq[evtchn]; +	return get_evtchn_to_irq(evtchn);  }  EXPORT_SYMBOL_GPL(irq_from_evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ +	return per_cpu(virq_to_irq, cpu)[virq]; +} +  static enum ipi_vector ipi_from_irq(unsigned irq)  {  	struct irq_info *info = info_for_irq(irq); @@ -279,14 +298,14 @@ static enum xen_irq_type type_from_irq(unsigned irq)  	return info_for_irq(irq)->type;  } -static unsigned cpu_from_irq(unsigned irq) +unsigned cpu_from_irq(unsigned irq)  {  	return info_for_irq(irq)->cpu;  } -static unsigned int cpu_from_evtchn(unsigned int evtchn) +unsigned int cpu_from_evtchn(unsigned int evtchn)  { -	int irq = evtchn_to_irq[evtchn]; +	int irq = get_evtchn_to_irq(evtchn);  	unsigned ret = 0;  	if (irq != -1) @@ -310,67 +329,28 @@ static bool pirq_needs_eoi_flag(unsigned irq)  	return info->u.pirq.flags & PIRQ_NEEDS_EOI;  } -static inline xen_ulong_t active_evtchns(unsigned int cpu, -					 struct shared_info *sh, -					 unsigned int idx) -{ -	return sh->evtchn_pending[idx] & -		per_cpu(cpu_evtchn_mask, cpu)[idx] & -		~sh->evtchn_mask[idx]; -} -  static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)  { -	int irq = evtchn_to_irq[chn]; +	int irq = get_evtchn_to_irq(chn); +	struct irq_info *info = info_for_irq(irq);  	BUG_ON(irq == -1);  #ifdef CONFIG_SMP -	cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); -#endif - -	clear_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu_from_irq(irq)))); -	set_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu))); - -	info_for_irq(irq)->cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ -	int i; -#ifdef CONFIG_SMP -	struct irq_info *info; - -	/* By default all event channels notify CPU#0. */ -	list_for_each_entry(info, &xen_irq_list_head, list) { -		struct irq_desc *desc = irq_to_desc(info->irq); -		cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); -	} +	cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu));  #endif +	xen_evtchn_port_bind_to_cpu(info, cpu); -	for_each_possible_cpu(i) -		memset(per_cpu(cpu_evtchn_mask, i), -		       (i == 0) ? 
~0 : 0, NR_EVENT_CHANNELS/8); -} - -static inline void clear_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	sync_clear_bit(port, BM(&s->evtchn_pending[0])); +	info->cpu = cpu;  } -static inline void set_evtchn(int port) +static void xen_evtchn_mask_all(void)  { -	struct shared_info *s = HYPERVISOR_shared_info; -	sync_set_bit(port, BM(&s->evtchn_pending[0])); -} +	unsigned int evtchn; -static inline int test_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	return sync_test_bit(port, BM(&s->evtchn_pending[0])); +	for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) +		mask_evtchn(evtchn);  } -  /**   * notify_remote_via_irq - send event to remote end of event channel via irq   * @irq: irq of event channel to send event to @@ -388,71 +368,12 @@ void notify_remote_via_irq(int irq)  }  EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void mask_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	sync_set_bit(port, BM(&s->evtchn_mask[0])); -} - -static void unmask_evtchn(int port) -{ -	struct shared_info *s = HYPERVISOR_shared_info; -	unsigned int cpu = get_cpu(); -	int do_hypercall = 0, evtchn_pending = 0; - -	BUG_ON(!irqs_disabled()); - -	if (unlikely((cpu != cpu_from_evtchn(port)))) -		do_hypercall = 1; -	else { -		/* -		 * Need to clear the mask before checking pending to -		 * avoid a race with an event becoming pending. -		 * -		 * EVTCHNOP_unmask will only trigger an upcall if the -		 * mask bit was set, so if a hypercall is needed -		 * remask the event. -		 */ -		sync_clear_bit(port, BM(&s->evtchn_mask[0])); -		evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); - -		if (unlikely(evtchn_pending && xen_hvm_domain())) { -			sync_set_bit(port, BM(&s->evtchn_mask[0])); -			do_hypercall = 1; -		} -	} - -	/* Slow path (hypercall) if this is a non-local port or if this is -	 * an hvm domain and an event is pending (hvm domains don't have -	 * their own implementation of irq_enable). */ -	if (do_hypercall) { -		struct evtchn_unmask unmask = { .port = port }; -		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); -	} else { -		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - -		/* -		 * The following is basically the equivalent of -		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose -		 * the interrupt edge' if the channel is masked. -		 */ -		if (evtchn_pending && -		    !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, -					   BM(&vcpu_info->evtchn_pending_sel))) -			vcpu_info->evtchn_upcall_pending = 1; -	} - -	put_cpu(); -} -  static void xen_irq_init(unsigned irq)  {  	struct irq_info *info;  #ifdef CONFIG_SMP -	struct irq_desc *desc = irq_to_desc(irq); -  	/* By default all event channels notify CPU#0. */ -	cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); +	cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0));  #endif  	info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -467,29 +388,22 @@ static void xen_irq_init(unsigned irq)  	list_add_tail(&info->list, &xen_irq_list_head);  } -static int __must_check xen_allocate_irq_dynamic(void) +static int __must_check xen_allocate_irqs_dynamic(int nvec)  { -	int first = 0; -	int irq; +	int i, irq = irq_alloc_descs(-1, 0, nvec, -1); -#ifdef CONFIG_X86_IO_APIC -	/* -	 * For an HVM guest or domain 0 which see "real" (emulated or -	 * actual respectively) GSIs we allocate dynamic IRQs -	 * e.g. those corresponding to event channels or MSIs -	 * etc. from the range above those "real" GSIs to avoid -	 * collisions. 
-	 */ -	if (xen_initial_domain() || xen_hvm_domain()) -		first = get_nr_irqs_gsi(); -#endif +	if (irq >= 0) { +		for (i = 0; i < nvec; i++) +			xen_irq_init(irq + i); +	} -	irq = irq_alloc_desc_from(first, -1); +	return irq; +} -	if (irq >= 0) -		xen_irq_init(irq); +static inline int __must_check xen_allocate_irq_dynamic(void) +{ -	return irq; +	return xen_allocate_irqs_dynamic(1);  }  static int __must_check xen_allocate_irq_gsi(unsigned gsi) @@ -538,6 +452,15 @@ static void xen_free_irq(unsigned irq)  	irq_free_desc(irq);  } +static void xen_evtchn_close(unsigned int port) +{ +	struct evtchn_close close; + +	close.port = port; +	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) +		BUG(); +} +  static void pirq_query_unmask(int irq)  {  	struct physdev_irq_status_query irq_status; @@ -554,13 +477,6 @@ static void pirq_query_unmask(int irq)  		info->u.pirq.flags |= PIRQ_NEEDS_EOI;  } -static bool probing_irq(int irq) -{ -	struct irq_desc *desc = irq_to_desc(irq); - -	return desc && desc->action == NULL; -} -  static void eoi_pirq(struct irq_data *data)  {  	int evtchn = evtchn_from_irq(data->irq); @@ -602,15 +518,20 @@ static unsigned int __startup_pirq(unsigned int irq)  					BIND_PIRQ__WILL_SHARE : 0;  	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);  	if (rc != 0) { -		if (!probing_irq(irq)) -			pr_info("Failed to obtain physical IRQ %d\n", irq); +		pr_warn("Failed to obtain physical IRQ %d\n", irq);  		return 0;  	}  	evtchn = bind_pirq.port;  	pirq_query_unmask(irq); -	evtchn_to_irq[evtchn] = irq; +	rc = set_evtchn_to_irq(evtchn, irq); +	if (rc != 0) { +		pr_err("irq%d: Failed to set port to irq mapping (%d)\n", +		       irq, rc); +		xen_evtchn_close(evtchn); +		return 0; +	}  	bind_evtchn_to_cpu(evtchn, 0);  	info->evtchn = evtchn; @@ -628,10 +549,9 @@ static unsigned int startup_pirq(struct irq_data *data)  static void shutdown_pirq(struct irq_data *data)  { -	struct evtchn_close close;  	unsigned int irq = data->irq;  	struct irq_info *info = info_for_irq(irq); -	int evtchn = evtchn_from_irq(irq); +	unsigned evtchn = evtchn_from_irq(irq);  	BUG_ON(info->type != IRQT_PIRQ); @@ -639,14 +559,8 @@ static void shutdown_pirq(struct irq_data *data)  		return;  	mask_evtchn(evtchn); - -	close.port = evtchn; -	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) -		BUG(); - -	bind_evtchn_to_cpu(evtchn, 0); -	evtchn_to_irq[evtchn] = -1; -	info->evtchn = 0; +	xen_evtchn_close(evtchn); +	xen_irq_info_cleanup(info);  }  static void enable_pirq(struct irq_data *data) @@ -675,6 +589,41 @@ int xen_irq_from_gsi(unsigned gsi)  }  EXPORT_SYMBOL_GPL(xen_irq_from_gsi); +static void __unbind_from_irq(unsigned int irq) +{ +	int evtchn = evtchn_from_irq(irq); +	struct irq_info *info = irq_get_handler_data(irq); + +	if (info->refcnt > 0) { +		info->refcnt--; +		if (info->refcnt != 0) +			return; +	} + +	if (VALID_EVTCHN(evtchn)) { +		unsigned int cpu = cpu_from_irq(irq); + +		xen_evtchn_close(evtchn); + +		switch (type_from_irq(irq)) { +		case IRQT_VIRQ: +			per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; +			break; +		case IRQT_IPI: +			per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; +			break; +		default: +			break; +		} + +		xen_irq_info_cleanup(info); +	} + +	BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + +	xen_free_irq(irq); +} +  /*   * Do not make any assumptions regarding the relationship between the   * IRQ number returned here and the Xen pirq argument. 
@@ -690,6 +639,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi,  {  	int irq = -1;  	struct physdev_irq irq_op; +	int ret;  	mutex_lock(&irq_mapping_update_lock); @@ -717,8 +667,13 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi,  		goto out;  	} -	xen_irq_info_pirq_init(irq, 0, pirq, gsi, DOMID_SELF, +	ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF,  			       shareable ? PIRQ_SHAREABLE : 0); +	if (ret < 0) { +		__unbind_from_irq(irq); +		irq = ret; +		goto out; +	}  	pirq_query_unmask(irq);  	/* We try to use the handler with the appropriate semantic for the @@ -765,20 +720,25 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)  }  int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, -			     int pirq, const char *name, domid_t domid) +			     int pirq, int nvec, const char *name, domid_t domid)  { -	int irq, ret; +	int i, irq, ret;  	mutex_lock(&irq_mapping_update_lock); -	irq = xen_allocate_irq_dynamic(); +	irq = xen_allocate_irqs_dynamic(nvec);  	if (irq < 0)  		goto out; -	irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, -			name); +	for (i = 0; i < nvec; i++) { +		irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + +		ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, +					      i == 0 ? 0 : PIRQ_MSI_GROUP); +		if (ret < 0) +			goto error_irq; +	} -	xen_irq_info_pirq_init(irq, 0, pirq, 0, domid, 0);  	ret = irq_set_msi_desc(irq, msidesc);  	if (ret < 0)  		goto error_irq; @@ -786,26 +746,27 @@ out:  	mutex_unlock(&irq_mapping_update_lock);  	return irq;  error_irq: +	for (; i >= 0; i--) +		__unbind_from_irq(irq + i);  	mutex_unlock(&irq_mapping_update_lock); -	xen_free_irq(irq);  	return ret;  }  #endif  int xen_destroy_irq(int irq)  { -	struct irq_desc *desc;  	struct physdev_unmap_pirq unmap_irq;  	struct irq_info *info = info_for_irq(irq);  	int rc = -ENOENT;  	mutex_lock(&irq_mapping_update_lock); -	desc = irq_to_desc(irq); -	if (!desc) -		goto out; - -	if (xen_initial_domain()) { +	/* +	 * If trying to remove a vector in a MSI group different +	 * than the first one skip the PIRQ unmap unless this vector +	 * is the first one in the group. +	 */ +	if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) {  		unmap_irq.pirq = info->u.pirq.pirq;  		unmap_irq.domid = info->u.pirq.domid;  		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); @@ -857,13 +818,18 @@ int xen_pirq_from_irq(unsigned irq)  	return pirq_from_irq(irq);  }  EXPORT_SYMBOL_GPL(xen_pirq_from_irq); +  int bind_evtchn_to_irq(unsigned int evtchn)  {  	int irq; +	int ret; + +	if (evtchn >= xen_evtchn_max_channels()) +		return -ENOMEM;  	mutex_lock(&irq_mapping_update_lock); -	irq = evtchn_to_irq[evtchn]; +	irq = get_evtchn_to_irq(evtchn);  	if (irq == -1) {  		irq = xen_allocate_irq_dynamic(); @@ -873,7 +839,14 @@ int bind_evtchn_to_irq(unsigned int evtchn)  		irq_set_chip_and_handler_name(irq, &xen_dynamic_chip,  					      handle_edge_irq, "event"); -		xen_irq_info_evtchn_init(irq, evtchn); +		ret = xen_irq_info_evtchn_setup(irq, evtchn); +		if (ret < 0) { +			__unbind_from_irq(irq); +			irq = ret; +			goto out; +		} +		/* New interdomain events are bound to VCPU 0. 
*/ +		bind_evtchn_to_cpu(evtchn, 0);  	} else {  		struct irq_info *info = info_for_irq(irq);  		WARN_ON(info == NULL || info->type != IRQT_EVTCHN); @@ -890,6 +863,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)  {  	struct evtchn_bind_ipi bind_ipi;  	int evtchn, irq; +	int ret;  	mutex_lock(&irq_mapping_update_lock); @@ -909,8 +883,12 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)  			BUG();  		evtchn = bind_ipi.port; -		xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - +		ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); +		if (ret < 0) { +			__unbind_from_irq(irq); +			irq = ret; +			goto out; +		}  		bind_evtchn_to_cpu(evtchn, cpu);  	} else {  		struct irq_info *info = info_for_irq(irq); @@ -943,7 +921,7 @@ static int find_virq(unsigned int virq, unsigned int cpu)  	int port, rc = -ENOENT;  	memset(&status, 0, sizeof(status)); -	for (port = 0; port <= NR_EVENT_CHANNELS; port++) { +	for (port = 0; port < xen_evtchn_max_channels(); port++) {  		status.dom = DOMID_SELF;  		status.port = port;  		rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -959,6 +937,19 @@ static int find_virq(unsigned int virq, unsigned int cpu)  	return rc;  } +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. + */ +unsigned xen_evtchn_nr_channels(void) +{ +        return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); +  int bind_virq_to_irq(unsigned int virq, unsigned int cpu)  {  	struct evtchn_bind_virq bind_virq; @@ -989,7 +980,12 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu)  			evtchn = ret;  		} -		xen_irq_info_virq_init(cpu, irq, evtchn, virq); +		ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); +		if (ret < 0) { +			__unbind_from_irq(irq); +			irq = ret; +			goto out; +		}  		bind_evtchn_to_cpu(evtchn, cpu);  	} else { @@ -1005,50 +1001,8 @@ out:  static void unbind_from_irq(unsigned int irq)  { -	struct evtchn_close close; -	int evtchn = evtchn_from_irq(irq); -	struct irq_info *info = irq_get_handler_data(irq); - -	if (WARN_ON(!info)) -		return; -  	mutex_lock(&irq_mapping_update_lock); - -	if (info->refcnt > 0) { -		info->refcnt--; -		if (info->refcnt != 0) -			goto done; -	} - -	if (VALID_EVTCHN(evtchn)) { -		close.port = evtchn; -		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) -			BUG(); - -		switch (type_from_irq(irq)) { -		case IRQT_VIRQ: -			per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) -				[virq_from_irq(irq)] = -1; -			break; -		case IRQT_IPI: -			per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) -				[ipi_from_irq(irq)] = -1; -			break; -		default: -			break; -		} - -		/* Closed ports are implicitly re-bound to VCPU0. */ -		bind_evtchn_to_cpu(evtchn, 0); - -		evtchn_to_irq[evtchn] = -1; -	} - -	BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - -	xen_free_irq(irq); - - done: +	__unbind_from_irq(irq);  	mutex_unlock(&irq_mapping_update_lock);  } @@ -1148,9 +1102,26 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id)  }  EXPORT_SYMBOL_GPL(unbind_from_irqhandler); +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. 
+ */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ +	struct evtchn_set_priority set_priority; + +	set_priority.port = evtchn_from_irq(irq); +	set_priority.priority = priority; + +	return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, +					   &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); +  int evtchn_make_refcounted(unsigned int evtchn)  { -	int irq = evtchn_to_irq[evtchn]; +	int irq = get_evtchn_to_irq(evtchn);  	struct irq_info *info;  	if (irq == -1) @@ -1175,12 +1146,12 @@ int evtchn_get(unsigned int evtchn)  	struct irq_info *info;  	int err = -ENOENT; -	if (evtchn >= NR_EVENT_CHANNELS) +	if (evtchn >= xen_evtchn_max_channels())  		return -EINVAL;  	mutex_lock(&irq_mapping_update_lock); -	irq = evtchn_to_irq[evtchn]; +	irq = get_evtchn_to_irq(evtchn);  	if (irq == -1)  		goto done; @@ -1204,7 +1175,7 @@ EXPORT_SYMBOL_GPL(evtchn_get);  void evtchn_put(unsigned int evtchn)  { -	int irq = evtchn_to_irq[evtchn]; +	int irq = get_evtchn_to_irq(evtchn);  	if (WARN_ON(irq == -1))  		return;  	unbind_from_irq(irq); @@ -1228,222 +1199,21 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)  	notify_remote_via_irq(irq);  } -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ -	struct shared_info *sh = HYPERVISOR_shared_info; -	int cpu = smp_processor_id(); -	xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); -	int i; -	unsigned long flags; -	static DEFINE_SPINLOCK(debug_lock); -	struct vcpu_info *v; - -	spin_lock_irqsave(&debug_lock, flags); - -	printk("\nvcpu %d\n  ", cpu); - -	for_each_online_cpu(i) { -		int pending; -		v = per_cpu(xen_vcpu, i); -		pending = (get_irq_regs() && i == cpu) -			? xen_irqs_disabled(get_irq_regs()) -			: v->evtchn_upcall_mask; -		printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n  ", i, -		       pending, v->evtchn_upcall_pending, -		       (int)(sizeof(v->evtchn_pending_sel)*2), -		       v->evtchn_pending_sel); -	} -	v = per_cpu(xen_vcpu, cpu); - -	printk("\npending:\n   "); -	for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) -		printk("%0*"PRI_xen_ulong"%s", -		       (int)sizeof(sh->evtchn_pending[0])*2, -		       sh->evtchn_pending[i], -		       i % 8 == 0 ? "\n   " : " "); -	printk("\nglobal mask:\n   "); -	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) -		printk("%0*"PRI_xen_ulong"%s", -		       (int)(sizeof(sh->evtchn_mask[0])*2), -		       sh->evtchn_mask[i], -		       i % 8 == 0 ? "\n   " : " "); - -	printk("\nglobally unmasked:\n   "); -	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) -		printk("%0*"PRI_xen_ulong"%s", -		       (int)(sizeof(sh->evtchn_mask[0])*2), -		       sh->evtchn_pending[i] & ~sh->evtchn_mask[i], -		       i % 8 == 0 ? "\n   " : " "); - -	printk("\nlocal cpu%d mask:\n   ", cpu); -	for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) -		printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), -		       cpu_evtchn[i], -		       i % 8 == 0 ? "\n   " : " "); - -	printk("\nlocally unmasked:\n   "); -	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { -		xen_ulong_t pending = sh->evtchn_pending[i] -			& ~sh->evtchn_mask[i] -			& cpu_evtchn[i]; -		printk("%0*"PRI_xen_ulong"%s", -		       (int)(sizeof(sh->evtchn_mask[0])*2), -		       pending, i % 8 == 0 ? 
"\n   " : " "); -	} - -	printk("\npending list:\n"); -	for (i = 0; i < NR_EVENT_CHANNELS; i++) { -		if (sync_test_bit(i, BM(sh->evtchn_pending))) { -			int word_idx = i / BITS_PER_EVTCHN_WORD; -			printk("  %d: event %d -> irq %d%s%s%s\n", -			       cpu_from_evtchn(i), i, -			       evtchn_to_irq[i], -			       sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) -					     ? "" : " l2-clear", -			       !sync_test_bit(i, BM(sh->evtchn_mask)) -					     ? "" : " globally-masked", -			       sync_test_bit(i, BM(cpu_evtchn)) -					     ? "" : " locally-masked"); -		} -	} - -	spin_unlock_irqrestore(&debug_lock, flags); - -	return IRQ_HANDLED; -} -  static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) - -/* - * Search the CPUs pending events bitmasks.  For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching.  The first level is - * a bitset of words which contain pending event bits.  The second - * level is a bitset of pending events themselves. - */  static void __xen_evtchn_do_upcall(void)  { -	int start_word_idx, start_bit_idx; -	int word_idx, bit_idx; -	int i, irq; -	int cpu = get_cpu(); -	struct shared_info *s = HYPERVISOR_shared_info;  	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); +	int cpu = get_cpu();  	unsigned count;  	do { -		xen_ulong_t pending_words; -		xen_ulong_t pending_bits; -		struct irq_desc *desc; -  		vcpu_info->evtchn_upcall_pending = 0;  		if (__this_cpu_inc_return(xed_nesting_count) - 1)  			goto out; -		/* -		 * Master flag must be cleared /before/ clearing -		 * selector flag. xchg_xen_ulong must contain an -		 * appropriate barrier. -		 */ -		if ((irq = per_cpu(virq_to_irq, cpu)[VIRQ_TIMER]) != -1) { -			int evtchn = evtchn_from_irq(irq); -			word_idx = evtchn / BITS_PER_LONG; -			pending_bits = evtchn % BITS_PER_LONG; -			if (active_evtchns(cpu, s, word_idx) & (1ULL << pending_bits)) { -				desc = irq_to_desc(irq); -				if (desc) -					generic_handle_irq_desc(irq, desc); -			} -		} - -		pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); - -		start_word_idx = __this_cpu_read(current_word_idx); -		start_bit_idx = __this_cpu_read(current_bit_idx); - -		word_idx = start_word_idx; - -		for (i = 0; pending_words != 0; i++) { -			xen_ulong_t words; - -			words = MASK_LSBS(pending_words, word_idx); - -			/* -			 * If we masked out all events, wrap to beginning. -			 */ -			if (words == 0) { -				word_idx = 0; -				bit_idx = 0; -				continue; -			} -			word_idx = EVTCHN_FIRST_BIT(words); - -			pending_bits = active_evtchns(cpu, s, word_idx); -			bit_idx = 0; /* usually scan entire word from start */ -			/* -			 * We scan the starting word in two parts. -			 * -			 * 1st time: start in the middle, scanning the -			 * upper bits. -			 * -			 * 2nd time: scan the whole word (not just the -			 * parts skipped in the first pass) -- if an -			 * event in the previously scanned bits is -			 * pending again it would just be scanned on -			 * the next loop anyway. -			 */ -			if (word_idx == start_word_idx) { -				if (i == 0) -					bit_idx = start_bit_idx; -			} - -			do { -				xen_ulong_t bits; -				int port; - -				bits = MASK_LSBS(pending_bits, bit_idx); - -				/* If we masked out all events, move on. 
*/ -				if (bits == 0) -					break; - -				bit_idx = EVTCHN_FIRST_BIT(bits); - -				/* Process port. */ -				port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; -				irq = evtchn_to_irq[port]; - -				if (irq != -1) { -					desc = irq_to_desc(irq); -					if (desc) -						generic_handle_irq_desc(irq, desc); -				} - -				bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; - -				/* Next caller starts at last processed + 1 */ -				__this_cpu_write(current_word_idx, -						 bit_idx ? word_idx : -						 (word_idx+1) % BITS_PER_EVTCHN_WORD); -				__this_cpu_write(current_bit_idx, bit_idx); -			} while (bit_idx != 0); - -			/* Scan start_l1i twice; all others once. */ -			if ((word_idx != start_word_idx) || (i != 0)) -				pending_words &= ~(1UL << word_idx); - -			word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; -		} +		xen_evtchn_handle_events(cpu);  		BUG_ON(!irqs_disabled()); @@ -1463,6 +1233,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)  	irq_enter();  #ifdef CONFIG_X86  	exit_idle(); +	inc_irq_stat(irq_hv_callback_count);  #endif  	__xen_evtchn_do_upcall(); @@ -1492,12 +1263,12 @@ void rebind_evtchn_irq(int evtchn, int irq)  	mutex_lock(&irq_mapping_update_lock);  	/* After resume the irq<->evtchn mappings are all cleared out */ -	BUG_ON(evtchn_to_irq[evtchn] != -1); +	BUG_ON(get_evtchn_to_irq(evtchn) != -1);  	/* Expect irq to have been bound before,  	   so there should be a proper type */  	BUG_ON(info->type == IRQT_UNBOUND); -	xen_irq_info_evtchn_init(irq, evtchn); +	(void)xen_irq_info_evtchn_setup(irq, evtchn);  	mutex_unlock(&irq_mapping_update_lock); @@ -1511,7 +1282,6 @@ void rebind_evtchn_irq(int evtchn, int irq)  /* Rebind an evtchn so that it gets delivered to a specific cpu */  static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)  { -	struct shared_info *s = HYPERVISOR_shared_info;  	struct evtchn_bind_vcpu bind_vcpu;  	int evtchn = evtchn_from_irq(irq);  	int masked; @@ -1534,7 +1304,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)  	 * Mask the event while changing the VCPU binding to prevent  	 * it being delivered on an unexpected VCPU.  	 
*/ -	masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); +	masked = test_and_set_mask(evtchn);  	/*  	 * If this fails, it usually just indicates that we're dealing with a @@ -1553,27 +1323,11 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)  static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,  			    bool force)  { -	unsigned tcpu = cpumask_first(dest); +	unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);  	return rebind_irq_to_cpu(data->irq, tcpu);  } -int resend_irq_on_evtchn(unsigned int irq) -{ -	int masked, evtchn = evtchn_from_irq(irq); -	struct shared_info *s = HYPERVISOR_shared_info; - -	if (!VALID_EVTCHN(evtchn)) -		return 1; - -	masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); -	sync_set_bit(evtchn, BM(s->evtchn_pending)); -	if (!masked) -		unmask_evtchn(evtchn); - -	return 1; -} -  static void enable_dynirq(struct irq_data *data)  {  	int evtchn = evtchn_from_irq(data->irq); @@ -1608,21 +1362,18 @@ static void mask_ack_dynirq(struct irq_data *data)  static int retrigger_dynirq(struct irq_data *data)  { -	int evtchn = evtchn_from_irq(data->irq); -	struct shared_info *sh = HYPERVISOR_shared_info; -	int ret = 0; +	unsigned int evtchn = evtchn_from_irq(data->irq); +	int masked; -	if (VALID_EVTCHN(evtchn)) { -		int masked; +	if (!VALID_EVTCHN(evtchn)) +		return 0; -		masked = sync_test_and_set_bit(evtchn, BM(sh->evtchn_mask)); -		sync_set_bit(evtchn, BM(sh->evtchn_pending)); -		if (!masked) -			unmask_evtchn(evtchn); -		ret = 1; -	} +	masked = test_and_set_mask(evtchn); +	set_evtchn(evtchn); +	if (!masked) +		unmask_evtchn(evtchn); -	return ret; +	return 1;  }  static void restore_pirqs(void) @@ -1683,7 +1434,7 @@ static void restore_cpu_virqs(unsigned int cpu)  		evtchn = bind_virq.port;  		/* Record the new mapping. */ -		xen_irq_info_virq_init(cpu, irq, evtchn, virq); +		(void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq);  		bind_evtchn_to_cpu(evtchn, cpu);  	}  } @@ -1707,7 +1458,7 @@ static void restore_cpu_ipis(unsigned int cpu)  		evtchn = bind_ipi.port;  		/* Record the new mapping. */ -		xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); +		(void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);  		bind_evtchn_to_cpu(evtchn, cpu);  	}  } @@ -1784,21 +1535,18 @@ EXPORT_SYMBOL_GPL(xen_test_irq_shared);  void xen_irq_resume(void)  { -	unsigned int cpu, evtchn; +	unsigned int cpu;  	struct irq_info *info; -	init_evtchn_cpu_bindings(); -  	/* New event-channel space is not 'live' yet. */ -	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) -		mask_evtchn(evtchn); +	xen_evtchn_mask_all(); +	xen_evtchn_resume();  	/* No IRQ <-> event-channel mappings. */  	list_for_each_entry(info, &xen_irq_list_head, list)  		info->evtchn = 0; /* zap event-channel binding */ -	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) -		evtchn_to_irq[evtchn] = -1; +	clear_evtchn_to_irq_all();  	for_each_possible_cpu(cpu) {  		restore_cpu_virqs(cpu); @@ -1889,27 +1637,40 @@ void xen_callback_vector(void)  void xen_callback_vector(void) {}  #endif +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." 
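retrigger_dynirq() and rebind_irq_to_cpu() above now go through the ABI-neutral helpers (test_and_set_mask(), set_evtchn(), unmask_evtchn()) instead of poking the shared_info bitmaps directly, but the resend pattern is unchanged: mask the port while remembering whether it was already masked, mark it pending, and only unmask if this caller did the masking, so a port that was deliberately masked elsewhere stays masked with the event held pending. A stand-alone sketch of that sequence, with the pending/mask state reduced to plain booleans instead of sync_* bitops and hypercalls:

    #include <stdbool.h>
    #include <stdio.h>

    /* Per-port state; bits in shared_info or FIFO event words in the kernel. */
    static bool pending[64], masked[64];

    static bool test_and_set_mask(unsigned port)
    {
            bool was = masked[port];
            masked[port] = true;
            return was;
    }

    static void set_evtchn(unsigned port) { pending[port] = true; }

    static void unmask_evtchn(unsigned port)
    {
            masked[port] = false;
            if (pending[port])
                    printf("port %u: upcall delivered\n", port);
    }

    /* Same shape as retrigger_dynirq() in the patch. */
    static int retrigger(unsigned port)
    {
            bool was_masked = test_and_set_mask(port); /* freeze the port while we poke it */

            set_evtchn(port);                          /* mark it pending */
            if (!was_masked)                           /* only unmask if we were the masker */
                    unmask_evtchn(port);
            return 1;
    }

    int main(void)
    {
            retrigger(7);        /* port was unmasked: event fires immediately */
            masked[9] = true;    /* e.g. the irq is currently disabled */
            retrigger(9);        /* stays pending until its owner unmasks it */
            return 0;
    }

The same mask-then-touch discipline appears in rebind_irq_to_cpu(), where the port is masked across the EVTCHNOP_bind_vcpu call so the event cannot be delivered on an unexpected VCPU mid-move.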
+ +static bool fifo_events = true; +module_param(fifo_events, bool, 0); +  void __init xen_init_IRQ(void)  { -	int i; +	int ret = -EINVAL; -	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), -				    GFP_KERNEL); -	BUG_ON(!evtchn_to_irq); -	for (i = 0; i < NR_EVENT_CHANNELS; i++) -		evtchn_to_irq[i] = -1; +	if (fifo_events) +		ret = xen_evtchn_fifo_init(); +	if (ret < 0) +		xen_evtchn_2l_init(); -	init_evtchn_cpu_bindings(); +	evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), +				sizeof(*evtchn_to_irq), GFP_KERNEL); +	BUG_ON(!evtchn_to_irq);  	/* No event channels are 'live' right now. */ -	for (i = 0; i < NR_EVENT_CHANNELS; i++) -		mask_evtchn(i); +	xen_evtchn_mask_all();  	pirq_needs_eoi = pirq_needs_eoi_flag;  #ifdef CONFIG_X86 -	if (xen_hvm_domain()) { +	if (xen_pv_domain()) { +		irq_ctx_init(smp_processor_id()); +		if (xen_initial_domain()) +			pci_xen_initial_domain(); +	} +	if (xen_feature(XENFEAT_hvm_callback_vector))  		xen_callback_vector(); + +	if (xen_hvm_domain()) {  		native_init_IRQ();  		/* pci_xen_hvm_init must be called after native_init_IRQ so that  		 * __acpi_register_gsi can point at the right function */ @@ -1918,13 +1679,10 @@ void __init xen_init_IRQ(void)  		int rc;  		struct physdev_pirq_eoi_gmfn eoi_gmfn; -		irq_ctx_init(smp_processor_id()); -		if (xen_initial_domain()) -			pci_xen_initial_domain(); -  		pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);  		eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map);  		rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); +		/* TODO: No PVH support for PIRQ EOI */  		if (rc != 0) {  			free_page((unsigned long) pirq_eoi_map);  			pirq_eoi_map = NULL; diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 00000000000..84b4bfb8434 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,443 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { +	uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned on non-x86 + * platforms. + */ +#if !defined(CONFIG_X86) && BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ +    (((unsigned long)w & 0x4UL) ? (EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(unsigned port) +{ +	unsigned i = port / EVENT_WORDS_PER_PAGE; + +	return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ +	return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ +	return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ +	unsigned i; + +	for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { +		if (!event_array[i]) +			break; +		free_page((unsigned long)event_array[i]); +		event_array[i] = NULL; +	} +} + +static void init_array_page(event_word_t *array_page) +{ +	unsigned i; + +	for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) +		array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ +	unsigned port = info->evtchn; +	unsigned new_array_pages; +	int ret; + +	new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + +	if (new_array_pages > MAX_EVENT_ARRAY_PAGES) +		return -EINVAL; + +	while (event_array_pages < new_array_pages) { +		void *array_page; +		struct evtchn_expand_array expand_array; + +		/* Might already have a page if we've resumed. */ +		array_page = event_array[event_array_pages]; +		if (!array_page) { +			array_page = (void *)__get_free_page(GFP_KERNEL); +			if (array_page == NULL) { +				ret = -ENOMEM; +				goto error; +			} +			event_array[event_array_pages] = array_page; +		} + +		/* Mask all events in this page before adding it. 
*/ +		init_array_page(array_page); + +		expand_array.array_gfn = virt_to_mfn(array_page); + +		ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); +		if (ret < 0) +			goto error; + +		event_array_pages++; +	} +	return 0; + +  error: +	if (event_array_pages == 0) +		panic("xen: unable to expand event array with initial page (%d)\n", ret); +	else +		pr_err("unable to expand event array (%d)\n", ret); +	free_unused_array_pages(); +	return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ +	/* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); +	return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED, spinning if BUSY is set. + */ +static void clear_masked(volatile event_word_t *word) +{ +	event_word_t new, old, w; + +	w = *word; + +	do { +		old = w & ~(1 << EVTCHN_FIFO_BUSY); +		new = old & ~(1 << EVTCHN_FIFO_MASKED); +		w = sync_cmpxchg(word, old, new); +	} while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ +	event_word_t *word = event_word_from_port(port); + +	BUG_ON(!irqs_disabled()); + +	clear_masked(word); +	if (evtchn_fifo_is_pending(port)) { +		struct evtchn_unmask unmask = { .port = port }; +		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); +	} +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ +	event_word_t new, old, w; + +	w = *word; + +	do { +		old = w; +		new = (w & ~((1 << EVTCHN_FIFO_LINKED) +			     | EVTCHN_FIFO_LINK_MASK)); +	} while ((w = sync_cmpxchg(word, old, new)) != old); + +	return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ +	int irq; + +	irq = get_evtchn_to_irq(port); +	if (irq != -1) +		generic_handle_irq(irq); +} + +static void consume_one_event(unsigned cpu, +			      struct evtchn_fifo_control_block *control_block, +			      unsigned priority, unsigned long *ready) +{ +	struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); +	uint32_t head; +	unsigned port; +	event_word_t *word; + +	head = q->head[priority]; + +	/* +	 * Reached the tail last time?  Read the new HEAD from the +	 * control block. +	 */ +	if (head == 0) { +		rmb(); /* Ensure word is up-to-date before reading head. */ +		head = control_block->head[priority]; +	} + +	port = head; +	word = event_word_from_port(port); +	head = clear_linked(word); + +	/* +	 * If the link is non-zero, there are more events in the +	 * queue, otherwise the queue is empty. +	 * +	 * If the queue is empty, clear this priority from our local +	 * copy of the ready word. 
+	 */ +	if (head == 0) +		clear_bit(priority, ready); + +	if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) +		handle_irq_for_port(port); + +	q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ +	struct evtchn_fifo_control_block *control_block; +	unsigned long ready; +	unsigned q; + +	control_block = per_cpu(cpu_control_block, cpu); + +	ready = xchg(&control_block->ready, 0); + +	while (ready) { +		q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); +		consume_one_event(cpu, control_block, q, &ready); +		ready |= xchg(&control_block->ready, 0); +	} +} + +static void evtchn_fifo_resume(void) +{ +	unsigned cpu; + +	for_each_possible_cpu(cpu) { +		void *control_block = per_cpu(cpu_control_block, cpu); +		struct evtchn_init_control init_control; +		int ret; + +		if (!control_block) +			continue; + +		/* +		 * If this CPU is offline, take the opportunity to +		 * free the control block while it is not being +		 * used. +		 */ +		if (!cpu_online(cpu)) { +			free_page((unsigned long)control_block); +			per_cpu(cpu_control_block, cpu) = NULL; +			continue; +		} + +		init_control.control_gfn = virt_to_mfn(control_block); +		init_control.offset = 0; +		init_control.vcpu = cpu; + +		ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, +						  &init_control); +		if (ret < 0) +			BUG(); +	} + +	/* +	 * The event array starts out as empty again and is extended +	 * as normal when events are bound.  The existing pages will +	 * be reused. +	 */ +	event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { +	.max_channels      = evtchn_fifo_max_channels, +	.nr_channels       = evtchn_fifo_nr_channels, +	.setup             = evtchn_fifo_setup, +	.bind_to_cpu       = evtchn_fifo_bind_to_cpu, +	.clear_pending     = evtchn_fifo_clear_pending, +	.set_pending       = evtchn_fifo_set_pending, +	.is_pending        = evtchn_fifo_is_pending, +	.test_and_set_mask = evtchn_fifo_test_and_set_mask, +	.mask              = evtchn_fifo_mask, +	.unmask            = evtchn_fifo_unmask, +	.handle_events     = evtchn_fifo_handle_events, +	.resume            = evtchn_fifo_resume, +}; + +static int evtchn_fifo_init_control_block(unsigned cpu) +{ +	struct page *control_block = NULL; +	struct evtchn_init_control init_control; +	int ret = -ENOMEM; + +	control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); +	if (control_block == NULL) +		goto error; + +	init_control.control_gfn = virt_to_mfn(page_address(control_block)); +	init_control.offset      = 0; +	init_control.vcpu        = cpu; + +	ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); +	if (ret < 0) +		goto error; + +	per_cpu(cpu_control_block, cpu) = page_address(control_block); + +	return 0; + +  error: +	__free_page(control_block); +	return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, +						  unsigned long action, +						  void *hcpu) +{ +	int cpu = (long)hcpu; +	int ret = 0; + +	switch (action) { +	case CPU_UP_PREPARE: +		if (!per_cpu(cpu_control_block, cpu)) +			ret = evtchn_fifo_init_control_block(cpu); +		break; +	default: +		break; +	} +	return ret < 0 ? 
NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { +	.notifier_call	= evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ +	int cpu = get_cpu(); +	int ret; + +	ret = evtchn_fifo_init_control_block(cpu); +	if (ret < 0) +		goto out; + +	pr_info("Using FIFO-based ABI\n"); + +	evtchn_ops = &evtchn_ops_fifo; + +	register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: +	put_cpu(); +	return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 00000000000..50c2050a1e3 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,151 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later.  See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { +	IRQT_UNBOUND = 0, +	IRQT_PIRQ, +	IRQT_VIRQ, +	IRQT_IPI, +	IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + *    PIRQ - vector, with MSB being "needs EOI", or physical IRQ of the HVM + *           guest, or GSI (real passthrough IRQ) of the device. + *    VIRQ - virq number + *    IPI - IPI vector + *    EVTCHN - + */ +struct irq_info { +	struct list_head list; +	int refcnt; +	enum xen_irq_type type;	/* type */ +	unsigned irq; +	unsigned int evtchn;	/* event channel */ +	unsigned short cpu;	/* cpu bound */ + +	union { +		unsigned short virq; +		enum ipi_vector ipi; +		struct { +			unsigned short pirq; +			unsigned short gsi; +			unsigned char vector; +			unsigned char flags; +			uint16_t domid; +		} pirq; +	} u; +}; + +#define PIRQ_NEEDS_EOI	(1 << 0) +#define PIRQ_SHAREABLE	(1 << 1) +#define PIRQ_MSI_GROUP	(1 << 2) + +struct evtchn_ops { +	unsigned (*max_channels)(void); +	unsigned (*nr_channels)(void); + +	int (*setup)(struct irq_info *info); +	void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + +	void (*clear_pending)(unsigned port); +	void (*set_pending)(unsigned port); +	bool (*is_pending)(unsigned port); +	bool (*test_and_set_mask)(unsigned port); +	void (*mask)(unsigned port); +	void (*unmask)(unsigned port); + +	void (*handle_events)(unsigned cpu); +	void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ +	return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used.
+ */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ +	if (evtchn_ops->setup) +		return evtchn_ops->setup(info); +	return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, +					       unsigned cpu) +{ +	evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ +	evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ +	evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ +	return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ +	return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ +	return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ +	return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ +	return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ +	if (evtchn_ops->resume) +		evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 8b3a69a06c3..00f40f051d9 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -305,7 +305,7 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)  	if (rc < 0)  		goto err; -	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, +	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0,  				       u->name, evtchn);  	if (rc < 0)  		goto err; @@ -417,7 +417,7 @@ static long evtchn_ioctl(struct file *file,  			break;  		rc = -EINVAL; -		if (unbind.port >= NR_EVENT_CHANNELS) +		if (unbind.port >= xen_evtchn_nr_channels())  			break;  		rc = -ENOTCONN; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index e41c79c986e..073b4a19a8b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -846,7 +846,7 @@ static int __init gntdev_init(void)  	if (!xen_domain())  		return -ENODEV; -	use_ptemod = xen_pv_domain(); +	use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap);  	err = misc_register(&gntdev_miscdev);  	if (err != 0) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index c4d2298893b..eeba7544f0c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -49,6 +49,7 @@  #include <xen/grant_table.h>  #include <xen/interface/memory.h>  #include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h>  #include <asm/xen/hypercall.h>  #include <asm/xen/interface.h> @@ -61,12 +62,10 @@  static grant_ref_t **gnttab_list;  static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames;  static int gnttab_free_count;  static grant_ref_t gnttab_free_head;  static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames;  static union {  	struct grant_entry_v1 *v1; @@ -826,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void)  unsigned int gnttab_max_grant_frames(void)  {  	unsigned int xen_max = __max_nr_grant_frames(); +	static unsigned int boot_max_nr_grant_frames; + +	/* First time, initialize it properly. 
*/ +	if (!boot_max_nr_grant_frames) +		boot_max_nr_grant_frames = __max_nr_grant_frames();  	if (xen_max > boot_max_nr_grant_frames)  		return boot_max_nr_grant_frames; @@ -833,6 +837,51 @@ unsigned int gnttab_max_grant_frames(void)  }  EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ +	xen_pfn_t *pfn; +	unsigned int max_nr_gframes = __max_nr_grant_frames(); +	unsigned int i; +	void *vaddr; + +	if (xen_auto_xlat_grant_frames.count) +		return -EINVAL; + +	vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); +	if (vaddr == NULL) { +		pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", +			&addr); +		return -ENOMEM; +	} +	pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); +	if (!pfn) { +		xen_unmap(vaddr); +		return -ENOMEM; +	} +	for (i = 0; i < max_nr_gframes; i++) +		pfn[i] = PFN_DOWN(addr) + i; + +	xen_auto_xlat_grant_frames.vaddr = vaddr; +	xen_auto_xlat_grant_frames.pfn = pfn; +	xen_auto_xlat_grant_frames.count = max_nr_gframes; + +	return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ +	if (!xen_auto_xlat_grant_frames.count) +		return; +	kfree(xen_auto_xlat_grant_frames.pfn); +	xen_unmap(xen_auto_xlat_grant_frames.vaddr); + +	xen_auto_xlat_grant_frames.pfn = NULL; +	xen_auto_xlat_grant_frames.count = 0; +	xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); +  /* Handling of paged out grant targets (GNTST_eagain) */  #define MAX_DELAY 256  static inline void @@ -884,9 +933,6 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,  		    struct page **pages, unsigned int count)  {  	int i, ret; -	bool lazy = false; -	pte_t *pte; -	unsigned long mfn;  	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);  	if (ret) @@ -898,36 +944,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,  			gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i,  						&map_ops[i].status, __func__); -	if (xen_feature(XENFEAT_auto_translated_physmap)) -		return ret; - -	if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { -		arch_enter_lazy_mmu_mode(); -		lazy = true; -	} - -	for (i = 0; i < count; i++) { -		/* Do not add to override if the map failed. */ -		if (map_ops[i].status) -			continue; - -		if (map_ops[i].flags & GNTMAP_contains_pte) { -			pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + -				(map_ops[i].host_addr & ~PAGE_MASK)); -			mfn = pte_mfn(*pte); -		} else { -			mfn = PFN_DOWN(map_ops[i].dev_bus_addr); -		} -		ret = m2p_add_override(mfn, pages[i], kmap_ops ? -				       &kmap_ops[i] : NULL); -		if (ret) -			return ret; -	} - -	if (lazy) -		arch_leave_lazy_mmu_mode(); - -	return ret; +	return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count);  }  EXPORT_SYMBOL_GPL(gnttab_map_refs); @@ -935,32 +952,13 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,  		      struct gnttab_map_grant_ref *kmap_ops,  		      struct page **pages, unsigned int count)  { -	int i, ret; -	bool lazy = false; +	int ret;  	ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);  	if (ret)  		return ret; -	if (xen_feature(XENFEAT_auto_translated_physmap)) -		return ret; - -	if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { -		arch_enter_lazy_mmu_mode(); -		lazy = true; -	} - -	for (i = 0; i < count; i++) { -		ret = m2p_remove_override(pages[i], kmap_ops ? 
-				       &kmap_ops[i] : NULL); -		if (ret) -			return ret; -	} - -	if (lazy) -		arch_leave_lazy_mmu_mode(); - -	return ret; +	return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count);  }  EXPORT_SYMBOL_GPL(gnttab_unmap_refs); @@ -1043,10 +1041,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)  	unsigned int nr_gframes = end_idx + 1;  	int rc; -	if (xen_hvm_domain()) { +	if (xen_feature(XENFEAT_auto_translated_physmap)) {  		struct xen_add_to_physmap xatp;  		unsigned int i = end_idx;  		rc = 0; +		BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes);  		/*  		 * Loop backwards, so that the first hypercall has the largest  		 * index, ensuring that the table will grow only once. @@ -1055,7 +1054,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)  			xatp.domid = DOMID_SELF;  			xatp.idx = i;  			xatp.space = XENMAPSPACE_grant_table; -			xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; +			xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i];  			rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);  			if (rc != 0) {  				pr_warn("grant table add_to_physmap failed, err=%d\n", @@ -1118,10 +1117,8 @@ static void gnttab_request_version(void)  	int rc;  	struct gnttab_set_version gsv; -	if (xen_hvm_domain()) -		gsv.version = 1; -	else -		gsv.version = 2; +	gsv.version = 1; +  	rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1);  	if (rc == 0 && gsv.version == 2) {  		grant_table_version = 2; @@ -1152,21 +1149,15 @@ static int gnttab_setup(void)  	if (max_nr_gframes < nr_grant_frames)  		return -ENOSYS; -	if (xen_pv_domain()) -		return gnttab_map(0, nr_grant_frames - 1); - -	if (gnttab_shared.addr == NULL) { -		gnttab_shared.addr = xen_remap(xen_hvm_resume_frames, -						PAGE_SIZE * max_nr_gframes); +	if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { +		gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr;  		if (gnttab_shared.addr == NULL) { -			pr_warn("Failed to ioremap gnttab share frames!\n"); +			pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", +				(unsigned long)xen_auto_xlat_grant_frames.vaddr);  			return -ENOMEM;  		}  	} - -	gnttab_map(0, nr_grant_frames - 1); - -	return 0; +	return gnttab_map(0, nr_grant_frames - 1);  }  int gnttab_resume(void) @@ -1177,7 +1168,8 @@ int gnttab_resume(void)  int gnttab_suspend(void)  { -	gnttab_interface->unmap_frames(); +	if (!xen_feature(XENFEAT_auto_translated_physmap)) +		gnttab_interface->unmap_frames();  	return 0;  } @@ -1203,19 +1195,20 @@ static int gnttab_expand(unsigned int req_entries)  int gnttab_init(void)  {  	int i; +	unsigned long max_nr_grant_frames;  	unsigned int max_nr_glist_frames, nr_glist_frames;  	unsigned int nr_init_grefs;  	int ret;  	gnttab_request_version(); +	max_nr_grant_frames = gnttab_max_grant_frames();  	nr_grant_frames = 1; -	boot_max_nr_grant_frames = __max_nr_grant_frames();  	/* Determine the maximum number of frames required for the  	 * grant reference free list on the current hypervisor.  	 
*/  	BUG_ON(grefs_per_grant_frame == 0); -	max_nr_glist_frames = (boot_max_nr_grant_frames * +	max_nr_glist_frames = (max_nr_grant_frames *  			       grefs_per_grant_frame / RPP);  	gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), @@ -1232,6 +1225,11 @@ int gnttab_init(void)  		}  	} +	ret = arch_gnttab_init(max_nr_grant_frames, +			       nr_status_frames(max_nr_grant_frames)); +	if (ret < 0) +		goto ini_nomem; +  	if (gnttab_setup() < 0) {  		ret = -ENODEV;  		goto ini_nomem; @@ -1268,5 +1266,6 @@ static int __gnttab_init(void)  	return gnttab_init();  } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 624e8dc2453..5f1e1f3cd18 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -41,32 +41,23 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID;  struct suspend_info {  	int cancelled; -	unsigned long arg; /* extra hypercall argument */ -	void (*pre)(void); -	void (*post)(int cancelled);  }; -#ifdef CONFIG_HIBERNATE_CALLBACKS -static void xen_hvm_post_suspend(int cancelled) -{ -	xen_arch_hvm_post_suspend(cancelled); -	gnttab_resume(); -} +static RAW_NOTIFIER_HEAD(xen_resume_notifier); -static void xen_pre_suspend(void) +void xen_resume_notifier_register(struct notifier_block *nb)  { -	xen_mm_pin_all(); -	gnttab_suspend(); -	xen_arch_pre_suspend(); +	raw_notifier_chain_register(&xen_resume_notifier, nb);  } +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); -static void xen_post_suspend(int cancelled) +void xen_resume_notifier_unregister(struct notifier_block *nb)  { -	xen_arch_post_suspend(cancelled); -	gnttab_resume(); -	xen_mm_unpin_all(); +	raw_notifier_chain_unregister(&xen_resume_notifier, nb);  } +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); +#ifdef CONFIG_HIBERNATE_CALLBACKS  static int xen_suspend(void *data)  {  	struct suspend_info *si = data; @@ -80,22 +71,23 @@ static int xen_suspend(void *data)  		return err;  	} -	if (si->pre) -		si->pre(); +	gnttab_suspend(); +	xen_arch_pre_suspend();  	/*  	 * This hypercall returns 1 if suspend was cancelled  	 * or the domain was merely checkpointed, and 0 if it  	 * is resuming in a new domain.  	 */ -	si->cancelled = HYPERVISOR_suspend(si->arg); +	si->cancelled = HYPERVISOR_suspend(xen_pv_domain() +                                           ? virt_to_mfn(xen_start_info) +                                           : 0); -	if (si->post) -		si->post(si->cancelled); +	xen_arch_post_suspend(si->cancelled); +	gnttab_resume();  	if (!si->cancelled) {  		xen_irq_resume(); -		xen_console_resume();  		xen_timer_resume();  	} @@ -140,18 +132,14 @@ static void do_suspend(void)  	si.cancelled = 1; -	if (xen_hvm_domain()) { -		si.arg = 0UL; -		si.pre = NULL; -		si.post = &xen_hvm_post_suspend; -	} else { -		si.arg = virt_to_mfn(xen_start_info); -		si.pre = &xen_pre_suspend; -		si.post = &xen_post_suspend; -	} -  	err = stop_machine(xen_suspend, &si, cpumask_of(0)); +	/* Resume console as early as possible. */ +	if (!si.cancelled) +		xen_console_resume(); + +	raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); +  	dpm_resume_start(si.cancelled ? 
PMSG_THAW : PMSG_RESTORE);  	if (err) { @@ -182,10 +170,32 @@ struct shutdown_handler {  	void (*cb)(void);  }; +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ +	switch (code) { +	case SYS_DOWN: +	case SYS_HALT: +	case SYS_POWER_OFF: +		shutting_down = SHUTDOWN_POWEROFF; +	default: +		break; +	} +	return NOTIFY_DONE; +}  static void do_poweroff(void)  { -	shutting_down = SHUTDOWN_POWEROFF; -	orderly_poweroff(false); +	switch (system_state) { +	case SYSTEM_BOOTING: +		orderly_poweroff(true); +		break; +	case SYSTEM_RUNNING: +		orderly_poweroff(false); +		break; +	default: +		/* Don't do it when we are halting/rebooting. */ +		pr_info("Ignoring Xen toolstack shutdown.\n"); +		break; +	}  }  static void do_reboot(void) @@ -291,6 +301,10 @@ static struct xenbus_watch shutdown_watch = {  	.callback = shutdown_handler  }; +static struct notifier_block xen_reboot_nb = { +	.notifier_call = poweroff_nb, +}; +  static int setup_shutdown_watcher(void)  {  	int err; @@ -301,6 +315,7 @@ static int setup_shutdown_watcher(void)  		return err;  	} +  #ifdef CONFIG_MAGIC_SYSRQ  	err = register_xenbus_watch(&sysrq_watch);  	if (err) { @@ -329,6 +344,7 @@ int xen_setup_shutdown_event(void)  	if (!xen_domain())  		return -ENODEV;  	register_xenstore_notifier(&xenstore_notifier); +	register_reboot_notifier(&xen_reboot_nb);  	return 0;  } diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 18fff88254e..dd9c249ea31 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -26,6 +26,9 @@  #include <asm/xen/hypervisor.h>  #include <asm/xen/hypercall.h>  #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> +#endif  static bool __read_mostly pci_seg_supported = true; @@ -58,12 +61,12 @@ static int xen_add_device(struct device *dev)  			add.flags = XEN_PCI_DEV_EXTFN;  #ifdef CONFIG_ACPI -		handle = DEVICE_ACPI_HANDLE(&pci_dev->dev); +		handle = ACPI_HANDLE(&pci_dev->dev);  		if (!handle && pci_dev->bus->bridge) -			handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge); +			handle = ACPI_HANDLE(pci_dev->bus->bridge);  #ifdef CONFIG_PCI_IOV  		if (!handle && pci_dev->is_virtfn) -			handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge); +			handle = ACPI_HANDLE(physfn->bus->bridge);  #endif  		if (handle) {  			acpi_status status; @@ -192,3 +195,49 @@ static int __init register_xen_pci_notifier(void)  }  arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ +	struct pci_mmcfg_region *cfg; +	int rc; + +	if (!xen_initial_domain()) +		return 0; + +	if ((pci_probe & PCI_PROBE_MMCONF) == 0) +		return 0; + +	if (list_empty(&pci_mmcfg_list)) +		return 0; + +	/* Check whether they are in the right area. */ +	list_for_each_entry(cfg, &pci_mmcfg_list, list) { +		struct physdev_pci_mmcfg_reserved r; + +		r.address = cfg->address; +		r.segment = cfg->segment; +		r.start_bus = cfg->start_bus; +		r.end_bus = cfg->end_bus; +		r.flags = XEN_PCI_MMCFG_RESERVED; + +		rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); +		switch (rc) { +		case 0: +		case -ENOSYS: +			continue; + +		default: +			pr_warn("Failed to report MMCONFIG reservation" +				" state for %s to hypervisor" +				" (%d)\n", +				cfg->name, rc); +		} +	} +	return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. 
+ */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index 79e1dff7ed4..0aac403d53f 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -40,6 +40,7 @@  #include <linux/capability.h>  #include <xen/xen.h> +#include <xen/acpi.h>  #include <xen/xenbus.h>  #include <xen/events.h>  #include <xen/interface/platform.h> diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 99db9e1eb8b..3454973dc3b 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc;  static unsigned long platform_mmiolen;  static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len)  {  	unsigned long addr; @@ -84,7 +84,7 @@ static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)  static int xen_allocate_irq(struct pci_dev *pdev)  {  	return request_irq(pdev->irq, do_hvm_evtchn_intr, -			IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, +			IRQF_NOBALANCING | IRQF_TRIGGER_RISING,  			"xen-platform-pci", pdev);  } @@ -108,6 +108,7 @@ static int platform_pci_init(struct pci_dev *pdev,  	long ioaddr;  	long mmio_addr, mmio_len;  	unsigned int max_nr_gframes; +	unsigned long grant_frames;  	if (!xen_domain())  		return -ENODEV; @@ -154,13 +155,17 @@ static int platform_pci_init(struct pci_dev *pdev,  	}  	max_nr_gframes = gnttab_max_grant_frames(); -	xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); -	ret = gnttab_init(); +	grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); +	ret = gnttab_setup_auto_xlat_frames(grant_frames);  	if (ret)  		goto out; +	ret = gnttab_init(); +	if (ret) +		goto grant_out;  	xenbus_probe(NULL);  	return 0; - +grant_out: +	gnttab_free_auto_xlat_frames();  out:  	pci_release_region(pdev, 0);  mem_out: diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 8e74590fa1b..569a13b9e85 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -533,12 +533,17 @@ static void privcmd_close(struct vm_area_struct *vma)  {  	struct page **pages = vma->vm_private_data;  	int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +	int rc;  	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)  		return; -	xen_unmap_domain_mfn_range(vma, numpgs, pages); -	free_xenballooned_pages(numpgs, pages); +	rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); +	if (rc == 0) +		free_xenballooned_pages(numpgs, pages); +	else +		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n", +			numpgs, rc);  	kfree(pages);  } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 1b2277c311d..ebd8f218a78 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -42,12 +42,31 @@  #include <xen/page.h>  #include <xen/xen-ops.h>  #include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h>  /*   * Used to do a quick range check in swiotlb_tbl_unmap_single and   * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this   * API.   */ +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, +					    gfp_t gfp) +{ +	unsigned long dma_mask = 0; + +	dma_mask = dev->coherent_dma_mask; +	if (!dma_mask) +		dma_mask = (gfp & GFP_DMA) ? 
DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + +	return dma_mask; +} +#endif +  static char *xen_io_tlb_start, *xen_io_tlb_end;  static unsigned long xen_io_tlb_nslabs;  /* @@ -56,17 +75,35 @@ static unsigned long xen_io_tlb_nslabs;  static u64 start_dma_addr; -static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +/* + * Both of these functions should avoid PFN_PHYS because phys_addr_t + * can be 32bit when dma_addr_t is 64bit leading to a loss in + * information if the shift is done before casting to 64bit. + */ +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)  { -	return phys_to_machine(XPADDR(paddr)).maddr; +	unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr)); +	dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT; + +	dma |= paddr & ~PAGE_MASK; + +	return dma;  } -static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)  { -	return machine_to_phys(XMADDR(baddr)).paddr; +	unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr)); +	dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; +	phys_addr_t paddr = dma; + +	BUG_ON(paddr != dma); /* truncation has occurred, should never happen */ + +	paddr |= baddr & ~PAGE_MASK; + +	return paddr;  } -static dma_addr_t xen_virt_to_bus(void *address) +static inline dma_addr_t xen_virt_to_bus(void *address)  {  	return xen_phys_to_bus(virt_to_phys(address));  } @@ -89,7 +126,7 @@ static int check_pages_physically_contiguous(unsigned long pfn,  	return 1;  } -static int range_straddles_page_boundary(phys_addr_t p, size_t size) +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)  {  	unsigned long pfn = PFN_DOWN(p);  	unsigned int offset = p & ~PAGE_MASK; @@ -126,6 +163,8 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)  {  	int i, rc;  	int dma_bits; +	dma_addr_t dma_handle; +	phys_addr_t p = virt_to_phys(buf);  	dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; @@ -135,9 +174,9 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)  		do {  			rc = xen_create_contiguous_region( -				(unsigned long)buf + (i << IO_TLB_SHIFT), +				p + (i << IO_TLB_SHIFT),  				get_order(slabs << IO_TLB_SHIFT), -				dma_bits); +				dma_bits, &dma_handle);  		} while (rc && dma_bits++ < max_dma_bits);  		if (rc)  			return rc; @@ -263,7 +302,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,  	void *ret;  	int order = get_order(size);  	u64 dma_mask = DMA_BIT_MASK(32); -	unsigned long vstart;  	phys_addr_t phys;  	dma_addr_t dev_addr; @@ -278,8 +316,12 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,  	if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))  		return ret; -	vstart = __get_free_pages(flags, order); -	ret = (void *)vstart; +	/* On ARM this function returns an ioremap'ped virtual address for +	 * which virt_to_phys doesn't return the corresponding physical +	 * address. In fact on ARM virt_to_phys only works for kernel direct +	 * mapped RAM memory. Also see comment below. +	 */ +	ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs);  	if (!ret)  		return ret; @@ -287,18 +329,21 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,  	if (hwdev && hwdev->coherent_dma_mask)  		dma_mask = dma_alloc_coherent_mask(hwdev, flags); -	phys = virt_to_phys(ret); +	/* At this point dma_handle is the physical address, next we are +	 * going to set it to the machine address. +	 * Do not use virt_to_phys(ret) because on ARM it doesn't correspond +	 * to *dma_handle. 
*/ +	phys = *dma_handle;  	dev_addr = xen_phys_to_bus(phys);  	if (((dev_addr + size - 1 <= dma_mask)) &&  	    !range_straddles_page_boundary(phys, size))  		*dma_handle = dev_addr;  	else { -		if (xen_create_contiguous_region(vstart, order, -						 fls64(dma_mask)) != 0) { -			free_pages(vstart, order); +		if (xen_create_contiguous_region(phys, order, +						 fls64(dma_mask), dma_handle) != 0) { +			xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs);  			return NULL;  		} -		*dma_handle = virt_to_machine(ret).maddr;  	}  	memset(ret, 0, size);  	return ret; @@ -319,13 +364,15 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,  	if (hwdev && hwdev->coherent_dma_mask)  		dma_mask = hwdev->coherent_dma_mask; -	phys = virt_to_phys(vaddr); +	/* do not use virt_to_phys because on ARM it doesn't return you the +	 * physical address */ +	phys = xen_bus_to_phys(dev_addr);  	if (((dev_addr + size - 1 > dma_mask)) ||  	    range_straddles_page_boundary(phys, size)) -		xen_destroy_contiguous_region((unsigned long)vaddr, order); +		xen_destroy_contiguous_region(phys, order); -	free_pages((unsigned long)vaddr, order); +	xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);  }  EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -352,16 +399,25 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,  	 * buffering it.  	 */  	if (dma_capable(dev, dev_addr, size) && -	    !range_straddles_page_boundary(phys, size) && !swiotlb_force) +	    !range_straddles_page_boundary(phys, size) && !swiotlb_force) { +		/* we are not interested in the dma_addr returned by +		 * xen_dma_map_page, only in the potential cache flushes executed +		 * by the function. */ +		xen_dma_map_page(dev, page, offset, size, dir, attrs);  		return dev_addr; +	}  	/*  	 * Oh well, have to allocate and map a bounce buffer.  	 */ +	trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); +  	map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);  	if (map == SWIOTLB_MAP_ERROR)  		return DMA_ERROR_CODE; +	xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), +					map & ~PAGE_MASK, size, dir, attrs);  	dev_addr = xen_phys_to_bus(map);  	/* @@ -384,12 +440,15 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);   * whatever the device wrote there.   */  static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, -			     size_t size, enum dma_data_direction dir) +			     size_t size, enum dma_data_direction dir, +				 struct dma_attrs *attrs)  {  	phys_addr_t paddr = xen_bus_to_phys(dev_addr);  	BUG_ON(dir == DMA_NONE); +	xen_dma_unmap_page(hwdev, paddr, size, dir, attrs); +  	/* NOTE: We use dev_addr here, not paddr! */  	if (is_xen_swiotlb_buffer(dev_addr)) {  		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); @@ -412,7 +471,7 @@ void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,  			    size_t size, enum dma_data_direction dir,  			    struct dma_attrs *attrs)  { -	xen_unmap_single(hwdev, dev_addr, size, dir); +	xen_unmap_single(hwdev, dev_addr, size, dir, attrs);  }  EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); @@ -435,11 +494,15 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,  	BUG_ON(dir == DMA_NONE); +	if (target == SYNC_FOR_CPU) +		xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir); +  	/* NOTE: We use dev_addr here, not paddr! 
*/ -	if (is_xen_swiotlb_buffer(dev_addr)) { +	if (is_xen_swiotlb_buffer(dev_addr))  		swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); -		return; -	} + +	if (target == SYNC_FOR_DEVICE) +		xen_dma_sync_single_for_cpu(hwdev, paddr, size, dir);  	if (dir != DMA_FROM_DEVICE)  		return; @@ -502,16 +565,31 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,  								 sg->length,  								 dir);  			if (map == SWIOTLB_MAP_ERROR) { +				dev_warn(hwdev, "swiotlb buffer is full\n");  				/* Don't panic here, we expect map_sg users  				   to do proper error handling. */  				xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,  							   attrs);  				sg_dma_len(sgl) = 0; -				return DMA_ERROR_CODE; +				return 0;  			} +			xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), +						map & ~PAGE_MASK, +						sg->length, +						dir, +						attrs);  			sg->dma_address = xen_phys_to_bus(map); -		} else +		} else { +			/* we are not interested in the dma_addr returned by +			 * xen_dma_map_page, only in the potential cache flushes executed +			 * by the function. */ +			xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), +						paddr & ~PAGE_MASK, +						sg->length, +						dir, +						attrs);  			sg->dma_address = dev_addr; +		}  		sg_dma_len(sg) = sg->length;  	}  	return nelems; @@ -533,7 +611,7 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,  	BUG_ON(dir == DMA_NONE);  	for_each_sg(sgl, sg, nelems, i) -		xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir); +		xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs);  }  EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); @@ -593,3 +671,15 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)  	return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;  }  EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ +	if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) +		return -EIO; + +	*dev->dma_mask = dma_mask; + +	return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c index 8dae6c13063..3e62ee4b3b6 100644 --- a/drivers/xen/xen-acpi-cpuhotplug.c +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -24,10 +24,7 @@  #include <linux/cpu.h>  #include <linux/acpi.h>  #include <linux/uaccess.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h>  #include <acpi/processor.h> -  #include <xen/acpi.h>  #include <xen/interface/platform.h>  #include <asm/xen/hypercall.h> @@ -269,7 +266,8 @@ static void acpi_processor_hotplug_notify(acpi_handle handle,  		if (!is_processor_present(handle))  			break; -		if (!acpi_bus_get_device(handle, &device)) +		acpi_bus_get_device(handle, &device); +		if (acpi_device_enumerated(device))  			break;  		result = acpi_bus_scan(handle); @@ -277,8 +275,9 @@ static void acpi_processor_hotplug_notify(acpi_handle handle,  			pr_err(PREFIX "Unable to add the device\n");  			break;  		} -		result = acpi_bus_get_device(handle, &device); -		if (result) { +		device = NULL; +		acpi_bus_get_device(handle, &device); +		if (!acpi_device_enumerated(device)) {  			pr_err(PREFIX "Missing device object\n");  			break;  		} @@ -314,7 +313,7 @@ static void acpi_processor_hotplug_notify(acpi_handle handle,  		goto out;  	} -	(void) acpi_evaluate_hotplug_ost(handle, event, ost_code, NULL); +	(void) acpi_evaluate_ost(handle, event, ost_code, NULL);  out:  	acpi_scan_lock_release(); diff --git 
a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c index 9083f1e474f..34e40b733f9 100644 --- a/drivers/xen/xen-acpi-memhotplug.c +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -22,7 +22,6 @@  #include <linux/init.h>  #include <linux/types.h>  #include <linux/acpi.h> -#include <acpi/acpi_drivers.h>  #include <xen/acpi.h>  #include <xen/interface/platform.h>  #include <asm/xen/hypercall.h> @@ -169,7 +168,7 @@ static int acpi_memory_get_device(acpi_handle handle,  	acpi_scan_lock_acquire();  	acpi_bus_get_device(handle, &device); -	if (device) +	if (acpi_device_enumerated(device))  		goto end;  	/* @@ -182,8 +181,9 @@  		result = -EINVAL;  		goto out;  	} -	result = acpi_bus_get_device(handle, &device); -	if (result) { +	device = NULL; +	acpi_bus_get_device(handle, &device); +	if (!acpi_device_enumerated(device)) {  		pr_warn(PREFIX "Missing device object\n");  		result = -EINVAL;  		goto out; @@ -285,7 +285,7 @@ static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data)  		return;  	} -	(void) acpi_evaluate_hotplug_ost(handle, event, ost_code, NULL); +	(void) acpi_evaluate_ost(handle, event, ost_code, NULL);  	return;  } diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c index 59708fdd068..f83b754505f 100644 --- a/drivers/xen/xen-acpi-pad.c +++ b/drivers/xen/xen-acpi-pad.c @@ -18,11 +18,10 @@  #include <linux/kernel.h>  #include <linux/types.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> -#include <asm/xen/hypercall.h> +#include <linux/acpi.h>  #include <xen/interface/version.h>  #include <xen/xen-ops.h> +#include <asm/xen/hypercall.h>  #define ACPI_PROCESSOR_AGGREGATOR_CLASS	"acpi_pad"  #define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" @@ -78,27 +77,14 @@ static int acpi_pad_pur(acpi_handle handle)  	return num;  } -/* Notify firmware how many CPUs are idle */ -static void acpi_pad_ost(acpi_handle handle, int stat, -	uint32_t idle_nums) -{ -	union acpi_object params[3] = { -		{.type = ACPI_TYPE_INTEGER,}, -		{.type = ACPI_TYPE_INTEGER,}, -		{.type = ACPI_TYPE_BUFFER,}, -	}; -	struct acpi_object_list arg_list = {3, params}; - -	params[0].integer.value = ACPI_PROCESSOR_AGGREGATOR_NOTIFY; -	params[1].integer.value =  stat; -	params[2].buffer.length = 4; -	params[2].buffer.pointer = (void *)&idle_nums; -	acpi_evaluate_object(handle, "_OST", &arg_list, NULL); -} -  static void acpi_pad_handle_notify(acpi_handle handle)  {  	int idle_nums; +	struct acpi_buffer param = { +		.length = 4, +		.pointer = (void *)&idle_nums, +	}; +  	mutex_lock(&xen_cpu_lock);  	idle_nums = acpi_pad_pur(handle); @@ -110,7 +96,8 @@ static void acpi_pad_handle_notify(acpi_handle handle)  	idle_nums = xen_acpi_pad_idle_cpus(idle_nums)  		    ?: xen_acpi_pad_idle_cpus_num();  	if (idle_nums >= 0) -		acpi_pad_ost(handle, 0, idle_nums); +		acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, +				  0, &param);  	mutex_unlock(&xen_cpu_lock);  } diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 13bc6c31c06..59fc190f1e9 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -27,12 +27,10 @@  #include <linux/init.h>  #include <linux/module.h>  #include <linux/types.h> -#include <linux/syscore_ops.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> +#include <linux/acpi.h>  #include <acpi/processor.h> -  #include <xen/xen.h> +#include <xen/xen-ops.h>  #include <xen/interface/platform.h>  #include 
<asm/xen/hypercall.h> @@ -129,7 +127,7 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr)  			pr_debug("     C%d: %s %d uS\n",  				 cx->type, cx->desc, (u32)cx->latency);  		} -	} else if (ret != -EINVAL) +	} else if ((ret != -EINVAL) && (ret != -ENOSYS))  		/* EINVAL means the ACPI ID is incorrect - meaning the ACPI  		 * table is referencing a non-existing CPU - which can happen  		 * with broken ACPI tables. */ @@ -261,7 +259,7 @@ static int push_pxx_to_hypervisor(struct acpi_processor *_pr)  			(u32) perf->states[i].power,  			(u32) perf->states[i].transition_latency);  		} -	} else if (ret != -EINVAL) +	} else if ((ret != -EINVAL) && (ret != -ENOSYS))  		/* EINVAL means the ACPI ID is incorrect - meaning the ACPI  		 * table is referencing a non-existing CPU - which can happen  		 * with broken ACPI tables. */ @@ -497,14 +495,15 @@ static int xen_upload_processor_pm_data(void)  	return rc;  } -static void xen_acpi_processor_resume(void) +static int xen_acpi_processor_resume(struct notifier_block *nb, +				     unsigned long action, void *data)  {  	bitmap_zero(acpi_ids_done, nr_acpi_bits); -	xen_upload_processor_pm_data(); +	return xen_upload_processor_pm_data();  } -static struct syscore_ops xap_syscore_ops = { -	.resume	= xen_acpi_processor_resume, +struct notifier_block xen_acpi_processor_resume_nb = { +	.notifier_call = xen_acpi_processor_resume,  };  static int __init xen_acpi_processor_init(void) @@ -557,7 +556,7 @@ static int __init xen_acpi_processor_init(void)  	if (rc)  		goto err_unregister; -	register_syscore_ops(&xap_syscore_ops); +	xen_resume_notifier_register(&xen_acpi_processor_resume_nb);  	return 0;  err_unregister: @@ -576,7 +575,7 @@ static void __exit xen_acpi_processor_exit(void)  {  	int i; -	unregister_syscore_ops(&xap_syscore_ops); +	xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb);  	kfree(acpi_ids_done);  	kfree(acpi_id_present);  	kfree(acpi_id_cst_present); diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 62fcd485f0a..d57a173685f 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -242,6 +242,15 @@ struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev,  	return found_dev;  } +/* + * Called when: + *  - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device + *  - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove + *  - 'echo BDF > unbind' on pciback module with no guest attached. See pcistub_remove + *  - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + *  As such we have to be careful. + */  void pcistub_put_pci_dev(struct pci_dev *dev)  {  	struct pcistub_device *psdev, *found_psdev = NULL; @@ -272,16 +281,16 @@ void pcistub_put_pci_dev(struct pci_dev *dev)  	 * and want to inhibit the user from fiddling with 'reset'  	 */  	pci_reset_function(dev); -	pci_restore_state(psdev->dev); +	pci_restore_state(dev);  	/* This disables the device. */ -	xen_pcibk_reset_device(found_psdev->dev); +	xen_pcibk_reset_device(dev);  	/* And cleanup up our emulated fields. 
*/ -	xen_pcibk_config_free_dyn_fields(found_psdev->dev); -	xen_pcibk_config_reset_dev(found_psdev->dev); +	xen_pcibk_config_reset_dev(dev); +	xen_pcibk_config_free_dyn_fields(dev); -	xen_unregister_device_domain_owner(found_psdev->dev); +	xen_unregister_device_domain_owner(dev);  	spin_lock_irqsave(&found_psdev->lock, flags);  	found_psdev->pdev = NULL; @@ -493,6 +502,8 @@ static int pcistub_seize(struct pci_dev *dev)  	return err;  } +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */  static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id)  {  	int err = 0; @@ -520,6 +531,8 @@ out:  	return err;  } +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */  static void pcistub_remove(struct pci_dev *dev)  {  	struct pcistub_device *psdev, *found_psdev = NULL; @@ -551,6 +564,8 @@ static void pcistub_remove(struct pci_dev *dev)  			pr_warn("****** shutdown driver domain before binding device\n");  			pr_warn("****** to other drivers or domains\n"); +			/* N.B. This ends up calling pcistub_put_pci_dev which ends up +			 * doing the FLR. */  			xen_pcibk_release_pci_dev(found_psdev->pdev,  						found_psdev->dev);  		} diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 64eb0cd8b8a..c4a0666de6f 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -213,12 +213,11 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,  		entries[i].vector = op->msix_entries[i].vector;  	} -	result = pci_enable_msix(dev, entries, op->value); - +	result = pci_enable_msix_exact(dev, entries, op->value);  	if (result == 0) {  		for (i = 0; i < op->value; i++) {  			op->msix_entries[i].entry = entries[i].entry; -			if (entries[i].vector) +			if (entries[i].vector) {  				op->msix_entries[i].vector =  					xen_pirq_from_irq(entries[i].vector);  				if (unlikely(verbose_request)) @@ -226,6 +225,7 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,  						"MSI-X[%d]: %d\n",  						pci_name(dev), i,  						op->msix_entries[i].vector); +			}  		}  	} else  		pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", @@ -348,9 +348,9 @@ void xen_pcibk_do_op(struct work_struct *data)  	notify_remote_via_irq(pdev->evtchn_irq);  	/* Mark that we're done. */ -	smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ +	smp_mb__before_atomic(); /* /after/ clearing PCIF_active */  	clear_bit(_PDEVF_op_active, &pdev->flags); -	smp_mb__after_clear_bit(); /* /before/ final check for work */ +	smp_mb__after_atomic(); /* /before/ final check for work */  	/* Check to see if the driver domain tried to start another request in  	 * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index 3165ce361b0..51afff96c51 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -137,6 +137,8 @@ unlock:  	/* Publish this device. 
 */  	if (!err)  		err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); +	else +		kfree(dev_entry);  out:  	return err; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index a9ed867afab..4a7e6e0a5f4 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -93,6 +93,8 @@ static void free_pdev(struct xen_pcibk_device *pdev)  	xen_pcibk_disconnect(pdev); +	/* N.B. This calls pcistub_put_pci_dev which does the FLR on all +	 * of the PCIe devices. */  	xen_pcibk_release_devices(pdev);  	dev_set_drvdata(&pdev->xdev->dev, NULL); @@ -286,6 +288,8 @@ static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev,  	dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);  	xen_unregister_device_domain_owner(dev); +	/* N.B. This ends up calling pcistub_put_pci_dev which ends up +	 * doing the FLR. */  	xen_pcibk_release_pci_dev(pdev, dev);  out: diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 21e18c18c7a..3b2bffde534 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -170,11 +170,13 @@ static void frontswap_selfshrink(void)  		tgt_frontswap_pages = cur_frontswap_pages -  			(cur_frontswap_pages / frontswap_hysteresis);  	frontswap_shrink(tgt_frontswap_pages); +	frontswap_inertia_counter = frontswap_inertia;  }  #endif /* CONFIG_FRONTSWAP */  #define MB2PAGES(mb)	((mb) << (20 - PAGE_SHIFT)) +#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT))  /*   * Use current balloon size, the goal (vm_committed_as), and hysteresis @@ -525,6 +527,7 @@ EXPORT_SYMBOL(register_xen_selfballooning);  int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink)  {  	bool enable = false; +	unsigned long reserve_pages;  	if (!xen_domain())  		return -ENODEV; @@ -549,6 +552,26 @@ int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink)  	if (!enable)  		return -ENODEV; +	/* +	 * Give selfballoon_reserved_mb a default value (10% of total ram pages) +	 * to make selfballoon not so aggressive. +	 * +	 * There are mainly two reasons: +	 * 1) The original goal_page didn't consider some pages used by kernel +	 *    space, like slab pages and memory used by device drivers. +	 * +	 * 2) The balloon driver may not give back memory to the guest OS fast +	 *    enough when the workload suddenly acquires a lot of physical memory. +	 * +	 * In both cases, the guest OS will suffer from memory pressure and +	 * the OOM killer may be triggered. +	 * By reserving an extra 10% of total ram pages, we can keep the system +	 * much more reliable and responsive in some cases. +	 */ +	if (!selfballoon_reserved_mb) { +		reserve_pages = totalram_pages / 10; +		selfballoon_reserved_mb = PAGES2MB(reserve_pages); +	}  	schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ);  	return 0; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index ec097d6f964..439c9dca9ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -45,6 +45,7 @@  #include <xen/grant_table.h>  #include <xen/xenbus.h>  #include <xen/xen.h> +#include <xen/features.h>  #include "xenbus_probe.h" @@ -400,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);  /** - * Bind to an existing interdomain event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore.
- */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ -	struct evtchn_bind_interdomain bind_interdomain; -	int err; - -	bind_interdomain.remote_dom = dev->otherend_id; -	bind_interdomain.remote_port = remote_port; - -	err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, -					  &bind_interdomain); -	if (err) -		xenbus_dev_fatal(dev, err, -				 "binding to event channel %d from domain %d", -				 remote_port, dev->otherend_id); -	else -		*port = bind_interdomain.local_port; - -	return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/**   * Free an existing event channel. Returns 0 on success or -errno on error.   */  int xenbus_free_evtchn(struct xenbus_device *dev, int port) @@ -743,7 +717,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = {  void __init xenbus_ring_ops_init(void)  { -	if (xen_pv_domain()) +	if (!xen_feature(XENFEAT_auto_translated_physmap))  		ring_ops = &ring_ops_pv;  	else  		ring_ops = &ring_ops_hvm; diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 38e92b770e9..3c0a74b3e9b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -384,12 +384,14 @@ static ssize_t nodename_show(struct device *dev,  {  	return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);  } +static DEVICE_ATTR_RO(nodename);  static ssize_t devtype_show(struct device *dev,  			    struct device_attribute *attr, char *buf)  {  	return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);  } +static DEVICE_ATTR_RO(devtype);  static ssize_t modalias_show(struct device *dev,  			     struct device_attribute *attr, char *buf) @@ -397,14 +399,24 @@ static ssize_t modalias_show(struct device *dev,  	return sprintf(buf, "%s:%s\n", dev->bus->name,  		       to_xenbus_device(dev)->devicetype);  } +static DEVICE_ATTR_RO(modalias); -struct device_attribute xenbus_dev_attrs[] = { -	__ATTR_RO(nodename), -	__ATTR_RO(devtype), -	__ATTR_RO(modalias), -	__ATTR_NULL +static struct attribute *xenbus_dev_attrs[] = { +	&dev_attr_nodename.attr, +	&dev_attr_devtype.attr, +	&dev_attr_modalias.attr, +	NULL,  }; -EXPORT_SYMBOL_GPL(xenbus_dev_attrs); + +static const struct attribute_group xenbus_dev_group = { +	.attrs = xenbus_dev_attrs, +}; + +const struct attribute_group *xenbus_dev_groups[] = { +	&xenbus_dev_group, +	NULL, +}; +EXPORT_SYMBOL_GPL(xenbus_dev_groups);  int xenbus_probe_node(struct xen_bus_type *bus,  		      const char *type, diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h index 146f857a36f..1085ec294a1 100644 --- a/drivers/xen/xenbus/xenbus_probe.h +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -54,7 +54,7 @@ enum xenstore_init {  	XS_LOCAL,  }; -extern struct device_attribute xenbus_dev_attrs[]; +extern const struct attribute_group *xenbus_dev_groups[];  extern int xenbus_match(struct device *_dev, struct device_driver *_drv);  extern int xenbus_dev_probe(struct device *_dev); diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 998bbbab816..5125dce11a6 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -200,7 +200,7 @@ static struct xen_bus_type xenbus_backend = {  		.probe		= xenbus_dev_probe,  		.remove		= xenbus_dev_remove,  		.shutdown	= xenbus_dev_shutdown, -		.dev_attrs	= xenbus_dev_attrs, +		.dev_groups	= xenbus_dev_groups,  	},  }; diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 
34b20bfa4e8..cb385c10d2b 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -154,7 +154,7 @@ static struct xen_bus_type xenbus_frontend = {  		.probe		= xenbus_frontend_dev_probe,  		.remove		= xenbus_dev_remove,  		.shutdown	= xenbus_dev_shutdown, -		.dev_attrs	= xenbus_dev_attrs, +		.dev_groups	= xenbus_dev_groups,  		.pm		= &xenbus_pm_ops,  	}, @@ -496,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init);  #ifndef MODULE  static int __init boot_wait_for_devices(void)  { -	if (xen_hvm_domain() && !xen_platform_pci_unplug) +	if (!xen_has_pv_devices())  		return -ENODEV;  	ready_to_wait_for_devices = 1; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index b6d5fff43d1..ba804f3d827 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -50,6 +50,7 @@  #include <xen/xenbus.h>  #include <xen/xen.h>  #include "xenbus_comms.h" +#include "xenbus_probe.h"  struct xs_stored_msg {  	struct list_head list; @@ -139,6 +140,29 @@ static int get_error(const char *errorstring)  	return xsd_errors[i].errnum;  } +static bool xenbus_ok(void) +{ +	switch (xen_store_domain_type) { +	case XS_LOCAL: +		switch (system_state) { +		case SYSTEM_POWER_OFF: +		case SYSTEM_RESTART: +		case SYSTEM_HALT: +			return false; +		default: +			break; +		} +		return true; +	case XS_PV: +	case XS_HVM: +		/* FIXME: Could check that the remote domain is alive, +		 * but it is normally initial domain. */ +		return true; +	default: +		break; +	} +	return false; +}  static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)  {  	struct xs_stored_msg *msg; @@ -148,9 +172,20 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)  	while (list_empty(&xs_state.reply_list)) {  		spin_unlock(&xs_state.reply_lock); -		/* XXX FIXME: Avoid synchronous wait for response here. */ -		wait_event(xs_state.reply_waitq, -			   !list_empty(&xs_state.reply_list)); +		if (xenbus_ok()) +			/* XXX FIXME: Avoid synchronous wait for response here. */ +			wait_event_timeout(xs_state.reply_waitq, +					   !list_empty(&xs_state.reply_list), +					   msecs_to_jiffies(500)); +		else { +			/* +			 * If we are in the process of being shut-down there is +			 * no point of trying to contact XenBus - it is either +			 * killed (xenstored application) or the other domain +			 * has been killed or is unreachable. +			 */ +			return ERR_PTR(-EIO); +		}  		spin_lock(&xs_state.reply_lock);  	} @@ -215,6 +250,9 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)  	mutex_unlock(&xs_state.request_mutex); +	if (IS_ERR(ret)) +		return ret; +  	if ((msg->type == XS_TRANSACTION_END) ||  	    ((req_msg.type == XS_TRANSACTION_START) &&  	     (msg->type == XS_ERROR))) diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c deleted file mode 100644 index 4793fc59454..00000000000 --- a/drivers/xen/xencomm.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA - * - * Copyright (C) IBM Corp. 2006 - * - * Authors: Hollis Blanchard <hollisb@us.ibm.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/mm.h> -#include <linux/slab.h> -#include <asm/page.h> -#include <xen/xencomm.h> -#include <xen/interface/xen.h> -#include <asm/xen/xencomm.h>	/* for xencomm_is_phys_contiguous() */ - -static int xencomm_init(struct xencomm_desc *desc, -			void *buffer, unsigned long bytes) -{ -	unsigned long recorded = 0; -	int i = 0; - -	while ((recorded < bytes) && (i < desc->nr_addrs)) { -		unsigned long vaddr = (unsigned long)buffer + recorded; -		unsigned long paddr; -		int offset; -		int chunksz; - -		offset = vaddr % PAGE_SIZE; /* handle partial pages */ -		chunksz = min(PAGE_SIZE - offset, bytes - recorded); - -		paddr = xencomm_vtop(vaddr); -		if (paddr == ~0UL) { -			printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n", -			       __func__, vaddr); -			return -EINVAL; -		} - -		desc->address[i++] = paddr; -		recorded += chunksz; -	} - -	if (recorded < bytes) { -		printk(KERN_DEBUG -		       "%s: could only translate %ld of %ld bytes\n", -		       __func__, recorded, bytes); -		return -ENOSPC; -	} - -	/* mark remaining addresses invalid (just for safety) */ -	while (i < desc->nr_addrs) -		desc->address[i++] = XENCOMM_INVALID; - -	desc->magic = XENCOMM_MAGIC; - -	return 0; -} - -static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, -					  void *buffer, unsigned long bytes) -{ -	struct xencomm_desc *desc; -	unsigned long buffer_ulong = (unsigned long)buffer; -	unsigned long start = buffer_ulong & PAGE_MASK; -	unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; -	unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; -	unsigned long size = sizeof(*desc) + -		sizeof(desc->address[0]) * nr_addrs; - -	/* -	 * slab allocator returns at least sizeof(void*) aligned pointer. -	 * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might -	 * cross page boundary. -	 */ -	if (sizeof(*desc) > sizeof(void *)) { -		unsigned long order = get_order(size); -		desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, -							       order); -		if (desc == NULL) -			return NULL; - -		desc->nr_addrs = -			((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / -			sizeof(*desc->address); -	} else { -		desc = kmalloc(size, gfp_mask); -		if (desc == NULL) -			return NULL; - -		desc->nr_addrs = nr_addrs; -	} -	return desc; -} - -void xencomm_free(struct xencomm_handle *desc) -{ -	if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { -		struct xencomm_desc *desc__ = (struct xencomm_desc *)desc; -		if (sizeof(*desc__) > sizeof(void *)) { -			unsigned long size = sizeof(*desc__) + -				sizeof(desc__->address[0]) * desc__->nr_addrs; -			unsigned long order = get_order(size); -			free_pages((unsigned long)__va(desc), order); -		} else -			kfree(__va(desc)); -	} -} - -static int xencomm_create(void *buffer, unsigned long bytes, -			  struct xencomm_desc **ret, gfp_t gfp_mask) -{ -	struct xencomm_desc *desc; -	int rc; - -	pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); - -	if (bytes == 0) { -		/* don't create a descriptor; Xen recognizes NULL. 
*/ -		BUG_ON(buffer != NULL); -		*ret = NULL; -		return 0; -	} - -	BUG_ON(buffer == NULL); /* 'bytes' is non-zero */ - -	desc = xencomm_alloc(gfp_mask, buffer, bytes); -	if (!desc) { -		printk(KERN_DEBUG "%s failure\n", "xencomm_alloc"); -		return -ENOMEM; -	} - -	rc = xencomm_init(desc, buffer, bytes); -	if (rc) { -		printk(KERN_DEBUG "%s failure: %d\n", "xencomm_init", rc); -		xencomm_free((struct xencomm_handle *)__pa(desc)); -		return rc; -	} - -	*ret = desc; -	return 0; -} - -static struct xencomm_handle *xencomm_create_inline(void *ptr) -{ -	unsigned long paddr; - -	BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); - -	paddr = (unsigned long)xencomm_pa(ptr); -	BUG_ON(paddr & XENCOMM_INLINE_FLAG); -	return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); -} - -/* "mini" routine, for stack-based communications: */ -static int xencomm_create_mini(void *buffer, -	unsigned long bytes, struct xencomm_mini *xc_desc, -	struct xencomm_desc **ret) -{ -	int rc = 0; -	struct xencomm_desc *desc; -	BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0); - -	desc = (void *)xc_desc; - -	desc->nr_addrs = XENCOMM_MINI_ADDRS; - -	rc = xencomm_init(desc, buffer, bytes); -	if (!rc) -		*ret = desc; - -	return rc; -} - -struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) -{ -	int rc; -	struct xencomm_desc *desc; - -	if (xencomm_is_phys_contiguous((unsigned long)ptr)) -		return xencomm_create_inline(ptr); - -	rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); - -	if (rc || desc == NULL) -		return NULL; - -	return xencomm_pa(desc); -} - -struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, -			struct xencomm_mini *xc_desc) -{ -	int rc; -	struct xencomm_desc *desc = NULL; - -	if (xencomm_is_phys_contiguous((unsigned long)ptr)) -		return xencomm_create_inline(ptr); - -	rc = xencomm_create_mini(ptr, bytes, xc_desc, -				&desc); - -	if (rc) -		return NULL; - -	return xencomm_pa(desc); -}  | 
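A note on the sysfs conversion in xenbus_probe.c above: the legacy per-device dev_attrs array is replaced by the dev_groups mechanism. DEVICE_ATTR_RO() generates a dev_attr_<name> descriptor from a <name>_show() callback, the descriptors are collected into a NULL-terminated attribute array wrapped in an attribute_group, and the NULL-terminated list of groups is hung off the bus type. Below is a minimal sketch of that pattern for a hypothetical "example" bus; the bus name and the show callback body are illustrative and not part of this patch.

#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t nodename_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	/* Illustrative body only; xenbus prints the device's xenstore node here. */
	return sprintf(buf, "%s\n", dev_name(dev));
}
static DEVICE_ATTR_RO(nodename);	/* defines dev_attr_nodename */

static struct attribute *example_dev_attrs[] = {
	&dev_attr_nodename.attr,
	NULL,
};

static const struct attribute_group example_dev_group = {
	.attrs = example_dev_attrs,
};

static const struct attribute_group *example_dev_groups[] = {
	&example_dev_group,
	NULL,
};

static struct bus_type example_bus_type = {
	.name		= "example",
	.dev_groups	= example_dev_groups,	/* was .dev_attrs before the conversion */
};

The ATTRIBUTE_GROUPS() helper in <linux/sysfs.h> can generate the group and group-list boilerplate, but it produces static symbols; the patch open-codes and exports xenbus_dev_groups, presumably because the same group list is shared by the frontend and backend bus types.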

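The xenbus_xs.c hunk above keeps xenstore requests from hanging forever once the store can no longer answer: xenbus_ok() inspects xen_store_domain_type and, for a local xenstored, system_state, while read_reply() now sleeps in 500 ms slices so that condition is re-evaluated between waits and -EIO is returned instead of blocking across shutdown. Below is a self-contained sketch of that bounded-wait pattern; the demo_* names are hypothetical stand-ins for xs_state and xenbus_ok(), not kernel symbols.

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/err.h>

struct demo_state {
	struct list_head reply_list;	/* assumed initialised elsewhere */
	spinlock_t reply_lock;
	wait_queue_head_t reply_waitq;
};

/* False once the local store can no longer answer (cf. xenbus_ok()). */
static bool demo_store_ok(void)
{
	switch (system_state) {
	case SYSTEM_POWER_OFF:
	case SYSTEM_RESTART:
	case SYSTEM_HALT:
		return false;
	default:
		return true;
	}
}

static struct list_head *demo_read_reply(struct demo_state *st)
{
	struct list_head *msg;

	spin_lock(&st->reply_lock);
	while (list_empty(&st->reply_list)) {
		spin_unlock(&st->reply_lock);

		if (!demo_store_ok())
			return ERR_PTR(-EIO);	/* give up rather than hang */

		/* Sleep at most 500 ms, then recheck whether the store is alive. */
		wait_event_timeout(st->reply_waitq,
				   !list_empty(&st->reply_list),
				   msecs_to_jiffies(500));

		spin_lock(&st->reply_lock);
	}

	msg = st->reply_list.next;
	list_del(msg);
	spin_unlock(&st->reply_lock);

	return msg;
}

Callers then need an IS_ERR() check on the result, which is why xenbus_dev_request_and_reply() above gains the IS_ERR(ret) early return after dropping the request mutex.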