Diffstat (limited to 'arch/x86/xen')
-rw-r--r--  arch/x86/xen/Kconfig                 |   26
-rw-r--r--  arch/x86/xen/Makefile                |    7
-rw-r--r--  arch/x86/xen/apic.c                  |   34
-rw-r--r--  arch/x86/xen/debugfs.c               |  104
-rw-r--r--  arch/x86/xen/debugfs.h               |    4
-rw-r--r--  arch/x86/xen/enlighten.c             |  774
-rw-r--r--  arch/x86/xen/grant-table.c           |  188
-rw-r--r--  arch/x86/xen/irq.c                   |   51
-rw-r--r--  arch/x86/xen/mmu.c                   | 1565
-rw-r--r--  arch/x86/xen/mmu.h                   |   37
-rw-r--r--  arch/x86/xen/multicalls.c            |  177
-rw-r--r--  arch/x86/xen/multicalls.h            |   10
-rw-r--r--  arch/x86/xen/p2m.c                   | 1340
-rw-r--r--  arch/x86/xen/pci-swiotlb-xen.c       |   62
-rw-r--r--  arch/x86/xen/platform-pci-unplug.c   |   84
-rw-r--r--  arch/x86/xen/setup.c                 |  529
-rw-r--r--  arch/x86/xen/smp.c                   |  459
-rw-r--r--  arch/x86/xen/smp.h                   |   11
-rw-r--r--  arch/x86/xen/spinlock.c              |  447
-rw-r--r--  arch/x86/xen/suspend.c               |   26
-rw-r--r--  arch/x86/xen/time.c                  |  152
-rw-r--r--  arch/x86/xen/trace.c                 |   62
-rw-r--r--  arch/x86/xen/vga.c                   |   74
-rw-r--r--  arch/x86/xen/xen-asm.S               |    2
-rw-r--r--  arch/x86/xen/xen-asm_32.S            |   45
-rw-r--r--  arch/x86/xen/xen-head.S              |   81
-rw-r--r--  arch/x86/xen/xen-ops.h               |   46
27 files changed, 4519 insertions(+), 1878 deletions(-)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 5b54892e4bc..e88fda867a3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -4,10 +4,11 @@  config XEN  	bool "Xen guest support" -	select PARAVIRT +	depends on PARAVIRT  	select PARAVIRT_CLOCK -	depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS) -	depends on X86_CMPXCHG && X86_TSC +	select XEN_HAVE_PVMMU +	depends on X86_64 || (X86_32 && X86_PAE) +	depends on X86_TSC  	help  	  This is the Linux Xen port.  Enabling this will allow the  	  kernel to boot in a paravirtualized environment under the @@ -18,19 +19,14 @@ config XEN_DOM0  	depends on XEN && PCI_XEN && SWIOTLB_XEN  	depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI -# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST -# name in tools. -config XEN_PRIVILEGED_GUEST -	def_bool XEN_DOM0 -  config XEN_PVHVM  	def_bool y -	depends on XEN -	depends on X86_LOCAL_APIC +	depends on XEN && PCI && X86_LOCAL_APIC  config XEN_MAX_DOMAIN_MEMORY         int -       default 128 +       default 500 if X86_64 +       default 64 if X86_32         depends on XEN         help           This only affects the sizing of some bss arrays, the unused @@ -38,7 +34,8 @@ config XEN_MAX_DOMAIN_MEMORY  config XEN_SAVE_RESTORE         bool -       depends on XEN && PM +       depends on XEN +       select HIBERNATE_CALLBACKS         default y  config XEN_DEBUG_FS @@ -48,3 +45,8 @@ config XEN_DEBUG_FS  	help  	  Enable statistics output and various tuning options in debugfs.  	  Enabling this option may incur a significant performance overhead. + +config XEN_PVH +	bool "Support for running as a PVH guest" +	depends on X86_64 && XEN && XEN_PVHVM +	def_bool n diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 77938515891..96ab2c09cb6 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,10 +12,13 @@ CFLAGS_mmu.o			:= $(nostackp)  obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \  			time.o xen-asm.o xen-asm_$(BITS).o \ -			grant-table.o suspend.o platform-pci-unplug.o +			grant-table.o suspend.o platform-pci-unplug.o \ +			p2m.o + +obj-$(CONFIG_EVENT_TRACING) += trace.o  obj-$(CONFIG_SMP)		+= smp.o  obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o  obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o - +obj-$(CONFIG_XEN_DOM0)		+= apic.o vga.o  obj-$(CONFIG_SWIOTLB_XEN)	+= pci-swiotlb-xen.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c new file mode 100644 index 00000000000..7005ced5d1a --- /dev/null +++ b/arch/x86/xen/apic.c @@ -0,0 +1,34 @@ +#include <linux/init.h> + +#include <asm/x86_init.h> +#include <asm/apic.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/interface/physdev.h> +#include "xen-ops.h" + +static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) +{ +	struct physdev_apic apic_op; +	int ret; + +	apic_op.apic_physbase = mpc_ioapic_addr(apic); +	apic_op.reg = reg; +	ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); +	if (!ret) +		return apic_op.value; + +	/* fallback to return an emulated IO_APIC values */ +	if (reg == 0x1) +		return 0x00170020; +	else if (reg == 0x0) +		return apic << 24; + +	return 0xfd; +} + +void __init xen_init_apic(void) +{ +	x86_io_apic_ops.read = xen_io_apic_read; +} diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 7c0fedd98ea..c8377fb26cd 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -19,107 +19,3 @@ struct dentry * __init xen_init_debugfs(void)  	return d_xen_debug;  } -struct array_data -{ -	void *array; -	
unsigned elements; -}; - -static int u32_array_open(struct inode *inode, struct file *file) -{ -	file->private_data = NULL; -	return nonseekable_open(inode, file); -} - -static size_t format_array(char *buf, size_t bufsize, const char *fmt, -			   u32 *array, unsigned array_size) -{ -	size_t ret = 0; -	unsigned i; - -	for(i = 0; i < array_size; i++) { -		size_t len; - -		len = snprintf(buf, bufsize, fmt, array[i]); -		len++;	/* ' ' or '\n' */ -		ret += len; - -		if (buf) { -			buf += len; -			bufsize -= len; -			buf[-1] = (i == array_size-1) ? '\n' : ' '; -		} -	} - -	ret++;		/* \0 */ -	if (buf) -		*buf = '\0'; - -	return ret; -} - -static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) -{ -	size_t len = format_array(NULL, 0, fmt, array, array_size); -	char *ret; - -	ret = kmalloc(len, GFP_KERNEL); -	if (ret == NULL) -		return NULL; - -	format_array(ret, len, fmt, array, array_size); -	return ret; -} - -static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, -			      loff_t *ppos) -{ -	struct inode *inode = file->f_path.dentry->d_inode; -	struct array_data *data = inode->i_private; -	size_t size; - -	if (*ppos == 0) { -		if (file->private_data) { -			kfree(file->private_data); -			file->private_data = NULL; -		} - -		file->private_data = format_array_alloc("%u", data->array, data->elements); -	} - -	size = 0; -	if (file->private_data) -		size = strlen(file->private_data); - -	return simple_read_from_buffer(buf, len, ppos, file->private_data, size); -} - -static int xen_array_release(struct inode *inode, struct file *file) -{ -	kfree(file->private_data); - -	return 0; -} - -static const struct file_operations u32_array_fops = { -	.owner	= THIS_MODULE, -	.open	= u32_array_open, -	.release= xen_array_release, -	.read	= u32_array_read, -	.llseek = no_llseek, -}; - -struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, -					    struct dentry *parent, -					    u32 *array, unsigned elements) -{ -	struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); - -	if (data == NULL) -		return NULL; - -	data->array = array; -	data->elements = elements; - -	return debugfs_create_file(name, mode, parent, data, &u32_array_fops); -} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h index e2813208483..12ebf3325c7 100644 --- a/arch/x86/xen/debugfs.h +++ b/arch/x86/xen/debugfs.h @@ -3,8 +3,4 @@  struct dentry * __init xen_init_debugfs(void); -struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, -					    struct dentry *parent, -					    u32 *array, unsigned elements); -  #endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 235c0f4d386..ffb101e4573 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,17 +31,21 @@  #include <linux/pci.h>  #include <linux/gfp.h>  #include <linux/memblock.h> +#include <linux/edd.h>  #include <xen/xen.h> +#include <xen/events.h>  #include <xen/interface/xen.h>  #include <xen/interface/version.h>  #include <xen/interface/physdev.h>  #include <xen/interface/vcpu.h>  #include <xen/interface/memory.h> +#include <xen/interface/xen-mca.h>  #include <xen/features.h>  #include <xen/page.h>  #include <xen/hvm.h>  #include <xen/hvc-console.h> +#include <xen/acpi.h>  #include <asm/paravirt.h>  #include <asm/apic.h> @@ -62,19 +66,58 @@  #include <asm/reboot.h>  #include <asm/stackprotector.h>  #include <asm/hypervisor.h> +#include <asm/mwait.h> +#include <asm/pci_x86.h> +#include <asm/pat.h> + +#ifdef CONFIG_ACPI 
+#include <linux/acpi.h> +#include <asm/acpi.h> +#include <acpi/pdc_intel.h> +#include <acpi/processor.h> +#include <xen/interface/platform.h> +#endif  #include "xen-ops.h"  #include "mmu.h" +#include "smp.h"  #include "multicalls.h"  EXPORT_SYMBOL_GPL(hypercall_page); +/* + * Pointer to the xen_vcpu_info structure or + * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info + * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info + * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point + * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to + * acknowledge pending events. + * Also more subtly it is used by the patched version of irq enable/disable + * e.g. xen_irq_enable_direct and xen_iret in PV mode. + * + * The desire to be able to do those mask/unmask operations as a single + * instruction by using the per-cpu offset held in %gs is the real reason + * vcpu info is in a per-cpu pointer and the original reason for this + * hypercall. + * + */  DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); + +/* + * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info + * hypercall. This can be used both in PV and PVHVM mode. The structure + * overrides the default per_cpu(xen_vcpu, cpu) value. + */  DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);  enum xen_domain_type xen_domain_type = XEN_NATIVE;  EXPORT_SYMBOL_GPL(xen_domain_type); +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned long  machine_to_phys_nr; +EXPORT_SYMBOL(machine_to_phys_nr); +  struct start_info *xen_start_info;  EXPORT_SYMBOL_GPL(xen_start_info); @@ -90,7 +133,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);   * Point at some empty memory to start with. We map the real shared_info   * page as soon as fixmap is up and running.   */ -struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;  /*   * Flag to determine whether vcpu info placement is available on all @@ -107,6 +150,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;   */  static int have_vcpu_info_placement = 1; +struct tls_descs { +	struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed.  Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. + */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +  static void clamp_max_cpus(void)  {  #ifdef CONFIG_SMP @@ -123,6 +179,21 @@ static void xen_vcpu_setup(int cpu)  	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); +	/* +	 * This path is called twice on PVHVM - first during bootup via +	 * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being +	 * hotplugged: cpu_up -> xen_hvm_cpu_notify. +	 * As we can only do the VCPUOP_register_vcpu_info once lets +	 * not over-write its result. +	 * +	 * For PV it is called during restore (xen_vcpu_restore) and bootup +	 * (xen_setup_vcpu_info_placement). The hotplug mechanism does not +	 * use this function. 
+	 */ +	if (xen_hvm_domain()) { +		if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) +			return; +	}  	if (cpu < MAX_VIRT_CPUS)  		per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; @@ -138,7 +209,12 @@ static void xen_vcpu_setup(int cpu)  	/* Check to see if the hypervisor will put the vcpu_info  	   structure where we want it, which allows direct access via -	   a percpu-variable. */ +	   a percpu-variable. +	   N.B. This hypercall can _only_ be called once per CPU. Subsequent +	   calls will error out with -EINVAL. This is due to the fact that +	   hypervisor has no unregister variant and this hypercall does not +	   allow to over-write info.mfn and info.offset. +	 */  	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);  	if (err) { @@ -161,10 +237,11 @@ void xen_vcpu_restore(void)  {  	int cpu; -	for_each_online_cpu(cpu) { +	for_each_possible_cpu(cpu) {  		bool other_cpu = (cpu != smp_processor_id()); +		bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL); -		if (other_cpu && +		if (other_cpu && is_up &&  		    HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))  			BUG(); @@ -173,7 +250,7 @@ void xen_vcpu_restore(void)  		if (have_vcpu_info_placement)  			xen_vcpu_setup(cpu); -		if (other_cpu && +		if (other_cpu && is_up &&  		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))  			BUG();  	} @@ -185,23 +262,46 @@ static void __init xen_banner(void)  	struct xen_extraversion extra;  	HYPERVISOR_xen_version(XENVER_extraversion, &extra); -	printk(KERN_INFO "Booting paravirtualized kernel on %s\n", -	       pv_info.name); +	pr_info("Booting paravirtualized kernel %son %s\n", +		xen_feature(XENFEAT_auto_translated_physmap) ? +			"with PVH extensions " : "", pv_info.name);  	printk(KERN_INFO "Xen version: %d.%d%s%s\n",  	       version >> 16, version & 0xffff, extra.extraversion,  	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");  } +/* Check if running on Xen version (major, minor) or later */ +bool +xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ +	unsigned int version; + +	if (!xen_domain()) +		return false; + +	version = HYPERVISOR_xen_version(XENVER_version, NULL); +	if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || +		((version >> 16) > major)) +		return true; +	return false; +} + +#define CPUID_THERM_POWER_LEAF 6 +#define APERFMPERF_PRESENT 0  static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;  static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; +static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; +static __read_mostly unsigned int cpuid_leaf5_ecx_val; +static __read_mostly unsigned int cpuid_leaf5_edx_val; +  static void xen_cpuid(unsigned int *ax, unsigned int *bx,  		      unsigned int *cx, unsigned int *dx)  {  	unsigned maskebx = ~0;  	unsigned maskecx = ~0;  	unsigned maskedx = ~0; - +	unsigned setecx = 0;  	/*  	 * Mask out inconvenient features, to try and disable as many  	 * unsupported kernel subsystems as possible. @@ -209,9 +309,23 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,  	switch (*ax) {  	case 1:  		maskecx = cpuid_leaf1_ecx_mask; +		setecx = cpuid_leaf1_ecx_set_mask;  		maskedx = cpuid_leaf1_edx_mask;  		break; +	case CPUID_MWAIT_LEAF: +		/* Synthesize the values.. 
*/ +		*ax = 0; +		*bx = 0; +		*cx = cpuid_leaf5_ecx_val; +		*dx = cpuid_leaf5_edx_val; +		return; + +	case CPUID_THERM_POWER_LEAF: +		/* Disabling APERFMPERF for kernel usage */ +		maskecx = ~(1 << APERFMPERF_PRESENT); +		break; +  	case 0xb:  		/* Suppress extended topology stuff */  		maskebx = 0; @@ -227,41 +341,110 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,  	*bx &= maskebx;  	*cx &= maskecx; +	*cx |= setecx;  	*dx &= maskedx; +  } -static __init void xen_init_cpuid_mask(void) +static bool __init xen_check_mwait(void)  { +#ifdef CONFIG_ACPI +	struct xen_platform_op op = { +		.cmd			= XENPF_set_processor_pminfo, +		.u.set_pminfo.id	= -1, +		.u.set_pminfo.type	= XEN_PM_PDC, +	}; +	uint32_t buf[3];  	unsigned int ax, bx, cx, dx; +	unsigned int mwait_mask; + +	/* We need to determine whether it is OK to expose the MWAIT +	 * capability to the kernel to harvest deeper than C3 states from ACPI +	 * _CST using the processor_harvest_xen.c module. For this to work, we +	 * need to gather the MWAIT_LEAF values (which the cstate.c code +	 * checks against). The hypervisor won't expose the MWAIT flag because +	 * it would break backwards compatibility; so we will find out directly +	 * from the hardware and hypercall. +	 */ +	if (!xen_initial_domain()) +		return false; + +	/* +	 * When running under platform earlier than Xen4.2, do not expose +	 * mwait, to avoid the risk of loading native acpi pad driver +	 */ +	if (!xen_running_on_version_or_later(4, 2)) +		return false; + +	ax = 1; +	cx = 0; + +	native_cpuid(&ax, &bx, &cx, &dx); + +	mwait_mask = (1 << (X86_FEATURE_EST % 32)) | +		     (1 << (X86_FEATURE_MWAIT % 32)); + +	if ((cx & mwait_mask) != mwait_mask) +		return false; + +	/* We need to emulate the MWAIT_LEAF and for that we need both +	 * ecx and edx. The hypercall provides only partial information. +	 */ + +	ax = CPUID_MWAIT_LEAF; +	bx = 0; +	cx = 0; +	dx = 0; + +	native_cpuid(&ax, &bx, &cx, &dx); + +	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, +	 * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. 
+	 */ +	buf[0] = ACPI_PDC_REVISION_ID; +	buf[1] = 1; +	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + +	set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + +	if ((HYPERVISOR_dom0_op(&op) == 0) && +	    (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { +		cpuid_leaf5_ecx_val = cx; +		cpuid_leaf5_edx_val = dx; +	} +	return true; +#else +	return false; +#endif +} +static void __init xen_init_cpuid_mask(void) +{ +	unsigned int ax, bx, cx, dx; +	unsigned int xsave_mask;  	cpuid_leaf1_edx_mask = -		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */ -		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */ -		  (1 << X86_FEATURE_MTRR) |  /* disable MTRR */ +		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */  		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */  	if (!xen_initial_domain())  		cpuid_leaf1_edx_mask &= -			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */ -			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */ +			~((1 << X86_FEATURE_ACPI));  /* disable ACPI */ + +	cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));  	ax = 1;  	cx = 0; -	xen_cpuid(&ax, &bx, &cx, &dx); - -	/* cpuid claims we support xsave; try enabling it to see what happens */ -	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { -		unsigned long cr4; +	cpuid(1, &ax, &bx, &cx, &dx); -		set_in_cr4(X86_CR4_OSXSAVE); -		 -		cr4 = read_cr4(); +	xsave_mask = +		(1 << (X86_FEATURE_XSAVE % 32)) | +		(1 << (X86_FEATURE_OSXSAVE % 32)); -		if ((cr4 & X86_CR4_OSXSAVE) == 0) -			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); - -		clear_in_cr4(X86_CR4_OSXSAVE); -	} +	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */ +	if ((cx & xsave_mask) != xsave_mask) +		cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ +	if (xen_check_mwait()) +		cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));  }  static void xen_set_debugreg(int reg, unsigned long val) @@ -343,6 +526,8 @@ static void xen_set_ldt(const void *addr, unsigned entries)  	struct mmuext_op *op;  	struct multicall_space mcs = xen_mc_entry(sizeof(*op)); +	trace_xen_cpu_set_ldt(addr, entries); +  	op = mcs.args;  	op->cmd = MMUEXT_SET_LDT;  	op->arg1.linear_addr = (unsigned long)addr; @@ -402,7 +587,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)  /*   * load_gdt for early boot, when the gdt is only mapped once   */ -static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) +static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)  {  	unsigned long va = dtr->address;  	unsigned int size = dtr->size + 1; @@ -437,12 +622,28 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)  		BUG();  } +static inline bool desc_equal(const struct desc_struct *d1, +			      const struct desc_struct *d2) +{ +	return d1->a == d2->a && d1->b == d2->b; +} +  static void load_TLS_descriptor(struct thread_struct *t,  				unsigned int cpu, unsigned int i)  { -	struct desc_struct *gdt = get_cpu_gdt_table(cpu); -	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); -	struct multicall_space mc = __xen_mc_entry(0); +	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; +	struct desc_struct *gdt; +	xmaddr_t maddr; +	struct multicall_space mc; + +	if (desc_equal(shadow, &t->tls_array[i])) +		return; + +	*shadow = t->tls_array[i]; + +	gdt = get_cpu_gdt_table(cpu); +	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); +	mc = __xen_mc_entry(0);  	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);  } @@ -498,6 +699,8 @@ static void 
xen_write_ldt_entry(struct desc_struct *dt, int entrynum,  	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);  	u64 entry = *(u64 *)ptr; +	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); +  	preempt_disable();  	xen_mc_flush(); @@ -522,8 +725,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,  	/*  	 * Look for known traps using IST, and substitute them  	 * appropriately.  The debugger ones are the only ones we care -	 * about.  Xen will handle faults like double_fault and -	 * machine_check, so we should never see them.  Warn if +	 * about.  Xen will handle faults like double_fault, +	 * so we should never see them.  Warn if  	 * there's an unexpected IST-using fault handler.  	 */  	if (addr == (unsigned long)debug) @@ -532,15 +735,23 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,  		addr = (unsigned long)xen_int3;  	else if (addr == (unsigned long)stack_segment)  		addr = (unsigned long)xen_stack_segment; -	else if (addr == (unsigned long)double_fault || -		 addr == (unsigned long)nmi) { +	else if (addr == (unsigned long)double_fault) {  		/* Don't need to handle these */  		return 0;  #ifdef CONFIG_X86_MCE  	} else if (addr == (unsigned long)machine_check) { -		return 0; +		/* +		 * when xen hypervisor inject vMCE to guest, +		 * use native mce handler to handle it +		 */ +		;  #endif -	} else { +	} else if (addr == (unsigned long)nmi) +		/* +		 * Use the native version as well. +		 */ +		; +	else {  		/* Some other trap using IST? */  		if (WARN_ON(val->ist != 0))  			return 0; @@ -567,10 +778,12 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)  	unsigned long p = (unsigned long)&dt[entrynum];  	unsigned long start, end; +	trace_xen_cpu_write_idt_entry(dt, entrynum, g); +  	preempt_disable(); -	start = __get_cpu_var(idt_desc).address; -	end = start + __get_cpu_var(idt_desc).size + 1; +	start = __this_cpu_read(idt_desc.address); +	end = start + __this_cpu_read(idt_desc.size) + 1;  	xen_mc_flush(); @@ -621,6 +834,8 @@ static void xen_load_idt(const struct desc_ptr *desc)  	static DEFINE_SPINLOCK(lock);  	static struct trap_info traps[257]; +	trace_xen_cpu_load_idt(desc); +  	spin_lock(&lock);  	__get_cpu_var(idt_desc) = *desc; @@ -639,6 +854,8 @@ static void xen_load_idt(const struct desc_ptr *desc)  static void xen_write_gdt_entry(struct desc_struct *dt, int entry,  				const void *desc, int type)  { +	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); +  	preempt_disable();  	switch (type) { @@ -664,9 +881,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,   * Version of write_gdt_entry for use at early boot-time needed to   * update an entry as simply as possible.   
*/ -static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, +static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,  					    const void *desc, int type)  { +	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); +  	switch (type) {  	case DESC_LDT:  	case DESC_TSS: @@ -686,7 +905,9 @@ static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,  static void xen_load_sp0(struct tss_struct *tss,  			 struct thread_struct *thread)  { -	struct multicall_space mcs = xen_mc_entry(0); +	struct multicall_space mcs; + +	mcs = xen_mc_entry(0);  	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);  	xen_mc_issue(PARAVIRT_LAZY_CPU);  } @@ -705,9 +926,40 @@ static void xen_io_delay(void)  }  #ifdef CONFIG_X86_LOCAL_APIC +static unsigned long xen_set_apic_id(unsigned int x) +{ +	WARN_ON(1); +	return x; +} +static unsigned int xen_get_apic_id(unsigned long x) +{ +	return ((x)>>24) & 0xFFu; +}  static u32 xen_apic_read(u32 reg)  { -	return 0; +	struct xen_platform_op op = { +		.cmd = XENPF_get_cpuinfo, +		.interface_version = XENPF_INTERFACE_VERSION, +		.u.pcpu_info.xen_cpuid = 0, +	}; +	int ret = 0; + +	/* Shouldn't need this as APIC is turned off for PV, and we only +	 * get called on the bootup processor. But just in case. */ +	if (!xen_initial_domain() || smp_processor_id()) +		return 0; + +	if (reg == APIC_LVR) +		return 0x10; + +	if (reg != APIC_ID) +		return 0; + +	ret = HYPERVISOR_dom0_op(&op); +	if (ret) +		return 0; + +	return op.u.pcpu_info.apic_id << 24;  }  static void xen_apic_write(u32 reg, u32 val) @@ -745,6 +997,16 @@ static void set_xen_basic_apic_ops(void)  	apic->icr_write = xen_apic_icr_write;  	apic->wait_icr_idle = xen_apic_wait_icr_idle;  	apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; +	apic->set_apic_id = xen_set_apic_id; +	apic->get_apic_id = xen_get_apic_id; + +#ifdef CONFIG_SMP +	apic->send_IPI_allbutself = xen_send_IPI_allbutself; +	apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself; +	apic->send_IPI_mask = xen_send_IPI_mask; +	apic->send_IPI_all = xen_send_IPI_all; +	apic->send_IPI_self = xen_send_IPI_self; +#endif  }  #endif @@ -764,11 +1026,11 @@ static DEFINE_PER_CPU(unsigned long, xen_cr0_value);  static unsigned long xen_read_cr0(void)  { -	unsigned long cr0 = percpu_read(xen_cr0_value); +	unsigned long cr0 = this_cpu_read(xen_cr0_value);  	if (unlikely(cr0 == 0)) {  		cr0 = native_read_cr0(); -		percpu_write(xen_cr0_value, cr0); +		this_cpu_write(xen_cr0_value, cr0);  	}  	return cr0; @@ -778,7 +1040,7 @@ static void xen_write_cr0(unsigned long cr0)  {  	struct multicall_space mcs; -	percpu_write(xen_cr0_value, cr0); +	this_cpu_write(xen_cr0_value, cr0);  	/* Only pay attention to cr0.TS; everything else is  	   ignored. 
*/ @@ -796,7 +1058,16 @@ static void xen_write_cr4(unsigned long cr4)  	native_write_cr4(cr4);  } - +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) +{ +	return 0; +} +static inline void xen_write_cr8(unsigned long val) +{ +	BUG_ON(val); +} +#endif  static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)  {  	int ret; @@ -863,7 +1134,7 @@ void xen_setup_shared_info(void)  	xen_setup_mfn_list_list();  } -/* This is called once we have the cpu_possible_map */ +/* This is called once we have the cpu_possible_mask */  void xen_setup_vcpu_info_placement(void)  {  	int cpu; @@ -872,8 +1143,9 @@ void xen_setup_vcpu_info_placement(void)  		xen_vcpu_setup(cpu);  	/* xen_vcpu_setup managed to place the vcpu_info within the -	   percpu area for all cpus, so make use of it */ -	if (have_vcpu_info_placement) { +	 * percpu area for all cpus, so make use of it. Note that for +	 * PVH we want to use native IRQ mechanism. */ +	if (have_vcpu_info_placement && !xen_pvh_domain()) {  		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);  		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);  		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); @@ -935,18 +1207,22 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,  	return ret;  } -static const struct pv_info xen_info __initdata = { +static const struct pv_info xen_info __initconst = {  	.paravirt_enabled = 1,  	.shared_kernel_pmd = 0, +#ifdef CONFIG_X86_64 +	.extra_user_64bit_cs = FLAT_USER_CS64, +#endif +  	.name = "Xen",  }; -static const struct pv_init_ops xen_init_ops __initdata = { +static const struct pv_init_ops xen_init_ops __initconst = {  	.patch = xen_patch,  }; -static const struct pv_cpu_ops xen_cpu_ops __initdata = { +static const struct pv_cpu_ops xen_cpu_ops __initconst = {  	.cpuid = xen_cpuid,  	.set_debugreg = xen_set_debugreg, @@ -961,13 +1237,21 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {  	.read_cr4_safe = native_read_cr4_safe,  	.write_cr4 = xen_write_cr4, +#ifdef CONFIG_X86_64 +	.read_cr8 = xen_read_cr8, +	.write_cr8 = xen_write_cr8, +#endif +  	.wbinvd = native_wbinvd,  	.read_msr = native_read_msr_safe,  	.write_msr = xen_write_msr_safe, +  	.read_tsc = native_read_tsc,  	.read_pmc = native_read_pmc, +	.read_tscp = native_read_tscp, +  	.iret = xen_iret,  	.irq_enable_sysexit = xen_sysexit,  #ifdef CONFIG_X86_64 @@ -987,7 +1271,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {  	.alloc_ldt = xen_alloc_ldt,  	.free_ldt = xen_free_ldt, -	.store_gdt = native_store_gdt,  	.store_idt = native_store_idt,  	.store_tr = xen_store_tr, @@ -1006,7 +1289,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {  	.end_context_switch = xen_end_context_switch,  }; -static const struct pv_apic_ops xen_apic_ops __initdata = { +static const struct pv_apic_ops xen_apic_ops __initconst = {  #ifdef CONFIG_X86_LOCAL_APIC  	.startup_ipi_hook = paravirt_nop,  #endif @@ -1016,10 +1299,6 @@ static void xen_reboot(int reason)  {  	struct sched_shutdown r = { .reason = reason }; -#ifdef CONFIG_SMP -	stop_other_cpus(); -#endif -  	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))  		BUG();  } @@ -1039,6 +1318,13 @@ static void xen_machine_halt(void)  	xen_reboot(SHUTDOWN_poweroff);  } +static void xen_machine_power_off(void) +{ +	if (pm_power_off) +		pm_power_off(); +	xen_reboot(SHUTDOWN_poweroff); +} +  static void xen_crash_shutdown(struct pt_regs *regs)  {  	xen_reboot(SHUTDOWN_crash); @@ -1053,6 +1339,7 @@ 
xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)  static struct notifier_block xen_panic_block = {  	.notifier_call= xen_panic_event, +	.priority = INT_MIN  };  int xen_panic_handler_init(void) @@ -1061,22 +1348,111 @@ int xen_panic_handler_init(void)  	return 0;  } -static const struct machine_ops __initdata xen_machine_ops = { +static const struct machine_ops xen_machine_ops __initconst = {  	.restart = xen_restart,  	.halt = xen_machine_halt, -	.power_off = xen_machine_halt, +	.power_off = xen_machine_power_off,  	.shutdown = xen_machine_halt,  	.crash_shutdown = xen_crash_shutdown,  	.emergency_restart = xen_emergency_restart,  }; +static void __init xen_boot_params_init_edd(void) +{ +#if IS_ENABLED(CONFIG_EDD) +	struct xen_platform_op op; +	struct edd_info *edd_info; +	u32 *mbr_signature; +	unsigned nr; +	int ret; + +	edd_info = boot_params.eddbuf; +	mbr_signature = boot_params.edd_mbr_sig_buffer; + +	op.cmd = XENPF_firmware_info; + +	op.u.firmware_info.type = XEN_FW_DISK_INFO; +	for (nr = 0; nr < EDDMAXNR; nr++) { +		struct edd_info *info = edd_info + nr; + +		op.u.firmware_info.index = nr; +		info->params.length = sizeof(info->params); +		set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, +				     &info->params); +		ret = HYPERVISOR_dom0_op(&op); +		if (ret) +			break; + +#define C(x) info->x = op.u.firmware_info.u.disk_info.x +		C(device); +		C(version); +		C(interface_support); +		C(legacy_max_cylinder); +		C(legacy_max_head); +		C(legacy_sectors_per_track); +#undef C +	} +	boot_params.eddbuf_entries = nr; + +	op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; +	for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { +		op.u.firmware_info.index = nr; +		ret = HYPERVISOR_dom0_op(&op); +		if (ret) +			break; +		mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; +	} +	boot_params.edd_mbr_sig_buf_entries = nr; +#endif +} +  /*   * Set up the GDT and segment registers for -fstack-protector.  Until   * we do this, we have to be careful not to call any stack-protected   * function, which is most of the kernel. + * + * Note, that it is __ref because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions.   */ -static void __init xen_setup_stackprotector(void) +static void __ref xen_setup_gdt(int cpu)  { +	if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_X86_64 +		unsigned long dummy; + +		load_percpu_segment(cpu); /* We need to access per-cpu area */ +		switch_to_new_gdt(cpu); /* GDT and GS set */ + +		/* We are switching of the Xen provided GDT to our HVM mode +		 * GDT. The new GDT has  __KERNEL_CS with CS.L = 1 +		 * and we are jumping to reload it. +		 */ +		asm volatile ("pushq %0\n" +			      "leaq 1f(%%rip),%0\n" +			      "pushq %0\n" +			      "lretq\n" +			      "1:\n" +			      : "=&r" (dummy) : "0" (__KERNEL_CS)); + +		/* +		 * While not needed, we also set the %es, %ds, and %fs +		 * to zero. We don't care about %ss as it is NULL. +		 * Strictly speaking this is not needed as Xen zeros those +		 * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) +		 * +		 * Linux zeros them in cpu_init() and in secondary_startup_64 +		 * (for BSP). +		 */ +		loadsegment(es, 0); +		loadsegment(ds, 0); +		loadsegment(fs, 0); +#else +		/* PVH: TODO Implement. */ +		BUG(); +#endif +		return; /* PVH does not need any PV GDT ops. 
*/ +	}  	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;  	pv_cpu_ops.load_gdt = xen_load_gdt_boot; @@ -1087,23 +1463,84 @@ static void __init xen_setup_stackprotector(void)  	pv_cpu_ops.load_gdt = xen_load_gdt;  } +/* + * A PV guest starts with default flags that are not set for PVH, set them + * here asap. + */ +static void xen_pvh_set_cr_flags(int cpu) +{ + +	/* Some of these are setup in 'secondary_startup_64'. The others: +	 * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests +	 * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ +	write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); + +	if (!cpu) +		return; +	/* +	 * For BSP, PSE PGE are set in probe_page_size_mask(), for APs +	 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. +	*/ +	if (cpu_has_pse) +		set_in_cr4(X86_CR4_PSE); + +	if (cpu_has_pge) +		set_in_cr4(X86_CR4_PGE); +} + +/* + * Note, that it is ref - because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions. + */ +void __ref xen_pvh_secondary_vcpu_init(int cpu) +{ +	xen_setup_gdt(cpu); +	xen_pvh_set_cr_flags(cpu); +} + +static void __init xen_pvh_early_guest_init(void) +{ +	if (!xen_feature(XENFEAT_auto_translated_physmap)) +		return; + +	if (!xen_feature(XENFEAT_hvm_callback_vector)) +		return; + +	xen_have_vector_callback = 1; +	xen_pvh_set_cr_flags(0); + +#ifdef CONFIG_X86_32 +	BUG(); /* PVH: Implement proper support. */ +#endif +} +  /* First C function to be called on Xen boot */ -asmlinkage void __init xen_start_kernel(void) +asmlinkage __visible void __init xen_start_kernel(void)  { -	pgd_t *pgd; +	struct physdev_set_iopl set_iopl; +	int rc;  	if (!xen_start_info)  		return;  	xen_domain_type = XEN_PV_DOMAIN; +	xen_setup_features(); +	xen_pvh_early_guest_init(); +	xen_setup_machphys_mapping(); +  	/* Install Xen paravirt ops */  	pv_info = xen_info;  	pv_init_ops = xen_init_ops; -	pv_cpu_ops = xen_cpu_ops;  	pv_apic_ops = xen_apic_ops; +	if (!xen_pvh_domain()) +		pv_cpu_ops = xen_cpu_ops; -	x86_init.resources.memory_setup = xen_memory_setup; +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		x86_init.resources.memory_setup = xen_auto_xlated_memory_setup; +	else +		x86_init.resources.memory_setup = xen_memory_setup;  	x86_init.oem.arch_setup = xen_arch_setup;  	x86_init.oem.banner = xen_banner; @@ -1117,7 +1554,9 @@ asmlinkage void __init xen_start_kernel(void)  	/* Prevent unwanted bits from being set in PTEs. */  	__supported_pte_mask &= ~_PAGE_GLOBAL; +#if 0  	if (!xen_initial_domain()) +#endif  		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);  	__supported_pte_mask |= _PAGE_IOMAP; @@ -1131,17 +1570,14 @@ asmlinkage void __init xen_start_kernel(void)  	/* Work out if we support NX */  	x86_configure_nx(); -	xen_setup_features(); -  	/* Get mfn list */ -	if (!xen_feature(XENFEAT_auto_translated_physmap)) -		xen_build_dynamic_phys_to_machine(); +	xen_build_dynamic_phys_to_machine();  	/*  	 * Set up kernel GDT and segment registers, mainly so that  	 * -fstack-protector code can be executed.  	 
*/ -	xen_setup_stackprotector(); +	xen_setup_gdt(0);  	xen_init_irq_ops();  	xen_init_cpuid_mask(); @@ -1169,30 +1605,35 @@ asmlinkage void __init xen_start_kernel(void)  	xen_smp_init(); -	pgd = (pgd_t *)xen_start_info->pt_base; - -	if (!xen_initial_domain()) -		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - -	__supported_pte_mask |= _PAGE_IOMAP; +#ifdef CONFIG_ACPI_NUMA +	/* +	 * The pages we from Xen are not related to machine pages, so +	 * any NUMA information the kernel tries to get from ACPI will +	 * be meaningless.  Prevent it from trying. +	 */ +	acpi_numa = -1; +#endif +#ifdef CONFIG_X86_PAT +	/* +	 * For right now disable the PAT. We should remove this once +	 * git commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1 +	 * (xen/pat: Disable PAT support for now) is reverted. +	 */ +	pat_enabled = 0; +#endif  	/* Don't do the full vcpu_info placement stuff until we have a  	   possible map and a non-dummy shared_info. */  	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];  	local_irq_disable(); -	early_boot_irqs_off(); - -	memblock_init(); +	early_boot_irqs_disabled = true;  	xen_raw_console_write("mapping kernel into physical memory\n"); -	pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); -	xen_ident_map_ISA(); +	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);  	/* Allocate and initialize top and mid mfn levels for p2m structure */  	xen_build_mfn_list_list(); -	init_mm.pgd = pgd; -  	/* keep using Xen gdt for now; no urgent need to change it */  #ifdef CONFIG_X86_32 @@ -1202,14 +1643,26 @@ asmlinkage void __init xen_start_kernel(void)  #else  	pv_info.kernel_rpl = 0;  #endif -  	/* set the limit of our address space */  	xen_reserve_top(); +	/* PVH: runs at default kernel iopl of 0 */ +	if (!xen_pvh_domain()) { +		/* +		 * We used to do this in xen_arch_setup, but that is too late +		 * on AMD were early_cpu_init (run before ->arch_setup()) calls +		 * early_amd_init which pokes 0xcf8 port. +		 */ +		set_iopl.iopl = 1; +		rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +		if (rc != 0) +			xen_raw_printk("physdev_op failed %d\n", rc); +	} +  #ifdef CONFIG_X86_32  	/* set up basic CPUID stuff */  	cpu_detect(&new_cpu_data); -	new_cpu_data.hard_math = 1; +	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);  	new_cpu_data.wp_works_ok = 1;  	new_cpu_data.x86_capability[0] = cpuid_edx(1);  #endif @@ -1228,11 +1681,39 @@ asmlinkage void __init xen_start_kernel(void)  		if (pci_xen)  			x86_init.pci.arch_init = pci_xen_init;  	} else { +		const struct dom0_vga_console_info *info = +			(void *)((char *)xen_start_info + +				 xen_start_info->console.dom0.info_off); +		struct xen_platform_op op = { +			.cmd = XENPF_firmware_info, +			.interface_version = XENPF_INTERFACE_VERSION, +			.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, +		}; + +		xen_init_vga(info, xen_start_info->console.dom0.info_size); +		xen_start_info->console.domU.mfn = 0; +		xen_start_info->console.domU.evtchn = 0; + +		if (HYPERVISOR_dom0_op(&op) == 0) +			boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; + +		xen_init_apic(); +  		/* Make sure ACS will be enabled */  		pci_request_acs(); -	} -		 +		xen_acpi_sleep_register(); + +		/* Avoid searching for BIOS MP tables */ +		x86_init.mpparse.find_smp_config = x86_init_noop; +		x86_init.mpparse.get_smp_config = x86_init_uint_noop; + +		xen_boot_params_init_edd(); +	} +#ifdef CONFIG_PCI +	/* PCI BIOS service won't work from a PV guest. 
*/ +	pci_probe &= ~PCI_PROBE_BIOS; +#endif  	xen_raw_console_write("about to get started...\n");  	xen_setup_runstate_info(0); @@ -1245,53 +1726,7 @@ asmlinkage void __init xen_start_kernel(void)  #endif  } -static uint32_t xen_cpuid_base(void) -{ -	uint32_t base, eax, ebx, ecx, edx; -	char signature[13]; - -	for (base = 0x40000000; base < 0x40010000; base += 0x100) { -		cpuid(base, &eax, &ebx, &ecx, &edx); -		*(uint32_t *)(signature + 0) = ebx; -		*(uint32_t *)(signature + 4) = ecx; -		*(uint32_t *)(signature + 8) = edx; -		signature[12] = 0; - -		if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) -			return base; -	} - -	return 0; -} - -static int init_hvm_pv_info(int *major, int *minor) -{ -	uint32_t eax, ebx, ecx, edx, pages, msr, base; -	u64 pfn; - -	base = xen_cpuid_base(); -	cpuid(base + 1, &eax, &ebx, &ecx, &edx); - -	*major = eax >> 16; -	*minor = eax & 0xffff; -	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); - -	cpuid(base + 2, &pages, &msr, &ecx, &edx); - -	pfn = __pa(hypercall_page); -	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - -	xen_setup_features(); - -	pv_info = xen_info; -	pv_info.kernel_rpl = 0; - -	xen_domain_type = XEN_HVM_DOMAIN; - -	return 0; -} - -void xen_hvm_init_shared_info(void) +void __ref xen_hvm_init_shared_info(void)  {  	int cpu;  	struct xen_add_to_physmap xatp; @@ -1318,18 +1753,50 @@ void xen_hvm_init_shared_info(void)  	 * online but xen_hvm_init_shared_info is run at resume time too and  	 * in that case multiple vcpus might be online. */  	for_each_online_cpu(cpu) { +		/* Leave it to be NULL. */ +		if (cpu >= MAX_VIRT_CPUS) +			continue;  		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];  	}  }  #ifdef CONFIG_XEN_PVHVM -static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, -				    unsigned long action, void *hcpu) +static void __init init_hvm_pv_info(void) +{ +	int major, minor; +	uint32_t eax, ebx, ecx, edx, pages, msr, base; +	u64 pfn; + +	base = xen_cpuid_base(); +	cpuid(base + 1, &eax, &ebx, &ecx, &edx); + +	major = eax >> 16; +	minor = eax & 0xffff; +	printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + +	cpuid(base + 2, &pages, &msr, &ecx, &edx); + +	pfn = __pa(hypercall_page); +	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + +	xen_setup_features(); + +	pv_info.name = "Xen HVM"; + +	xen_domain_type = XEN_HVM_DOMAIN; +} + +static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, +			      void *hcpu)  {  	int cpu = (long)hcpu;  	switch (action) {  	case CPU_UP_PREPARE: -		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; +		xen_vcpu_setup(cpu); +		if (xen_have_vector_callback) { +			if (xen_feature(XENFEAT_hvm_safe_pvclock)) +				xen_setup_timer(cpu); +		}  		break;  	default:  		break; @@ -1337,46 +1804,53 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,  	return NOTIFY_OK;  } -static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { +static struct notifier_block xen_hvm_cpu_notifier = {  	.notifier_call	= xen_hvm_cpu_notify,  };  static void __init xen_hvm_guest_init(void)  { -	int r; -	int major, minor; - -	r = init_hvm_pv_info(&major, &minor); -	if (r < 0) -		return; +	init_hvm_pv_info();  	xen_hvm_init_shared_info(); +	xen_panic_handler_init(); +  	if (xen_feature(XENFEAT_hvm_callback_vector))  		xen_have_vector_callback = 1; +	xen_hvm_smp_init();  	register_cpu_notifier(&xen_hvm_cpu_notifier);  	xen_unplug_emulated_devices(); -	have_vcpu_info_placement = 0;  	x86_init.irqs.intr_init = xen_init_IRQ;  	
xen_hvm_init_time_ops();  	xen_hvm_init_mmu_ops();  } -static bool __init xen_hvm_platform(void) +static uint32_t __init xen_hvm_platform(void)  {  	if (xen_pv_domain()) -		return false; +		return 0; -	if (!xen_cpuid_base()) -		return false; +	return xen_cpuid_base(); +} +bool xen_hvm_need_lapic(void) +{ +	if (xen_pv_domain()) +		return false; +	if (!xen_hvm_domain()) +		return false; +	if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) +		return false;  	return true;  } +EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); -const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { +const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {  	.name			= "Xen HVM",  	.detect			= xen_hvm_platform,  	.init_platform		= xen_hvm_guest_init, +	.x2apic_available	= xen_x2apic_para_available,  };  EXPORT_SYMBOL(x86_hyper_xen_hvm);  #endif diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 49ba9b5224d..ebfa9b2c871 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -36,56 +36,190 @@  #include <linux/sched.h>  #include <linux/mm.h> +#include <linux/slab.h>  #include <linux/vmalloc.h>  #include <xen/interface/xen.h>  #include <xen/page.h>  #include <xen/grant_table.h> +#include <xen/xen.h>  #include <asm/pgtable.h> -static int map_pte_fn(pte_t *pte, struct page *pmd_page, -		      unsigned long addr, void *data) +static struct gnttab_vm_area { +	struct vm_struct *area; +	pte_t **ptes; +} gnttab_shared_vm_area, gnttab_status_vm_area; + +int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, +			   unsigned long max_nr_gframes, +			   void **__shared)  { -	unsigned long **frames = (unsigned long **)data; +	void *shared = *__shared; +	unsigned long addr; +	unsigned long i; + +	if (shared == NULL) +		*__shared = shared = gnttab_shared_vm_area.area->addr; + +	addr = (unsigned long)shared; + +	for (i = 0; i < nr_gframes; i++) { +		set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i], +			   mfn_pte(frames[i], PAGE_KERNEL)); +		addr += PAGE_SIZE; +	} -	set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL)); -	(*frames)++;  	return 0;  } -static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, -			unsigned long addr, void *data) +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, +			   unsigned long max_nr_gframes, +			   grant_status_t **__shared)  { +	grant_status_t *shared = *__shared; +	unsigned long addr; +	unsigned long i; + +	if (shared == NULL) +		*__shared = shared = gnttab_status_vm_area.area->addr; + +	addr = (unsigned long)shared; + +	for (i = 0; i < nr_gframes; i++) { +		set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i], +			   mfn_pte(frames[i], PAGE_KERNEL)); +		addr += PAGE_SIZE; +	} -	set_pte_at(&init_mm, addr, pte, __pte(0));  	return 0;  } -int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, -			   unsigned long max_nr_gframes, -			   struct grant_entry **__shared) +void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) +{ +	pte_t **ptes; +	unsigned long addr; +	unsigned long i; + +	if (shared == gnttab_status_vm_area.area->addr) +		ptes = gnttab_status_vm_area.ptes; +	else +		ptes = gnttab_shared_vm_area.ptes; + +	addr = (unsigned long)shared; + +	for (i = 0; i < nr_gframes; i++) { +		set_pte_at(&init_mm, addr, ptes[i], __pte(0)); +		addr += PAGE_SIZE; +	} +} + +static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) +{ +	area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL); +	if (area->ptes == NULL) +		return 
-ENOMEM; + +	area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes); +	if (area->area == NULL) { +		kfree(area->ptes); +		return -ENOMEM; +	} + +	return 0; +} + +static void arch_gnttab_vfree(struct gnttab_vm_area *area) +{ +	free_vm_area(area->area); +	kfree(area->ptes); +} + +int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) +{ +	int ret; + +	if (!xen_pv_domain()) +		return 0; + +	ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); +	if (ret < 0) +		return ret; + +	/* +	 * Always allocate the space for the status frames in case +	 * we're migrated to a host with V2 support. +	 */ +	ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status); +	if (ret < 0) +		goto err; + +	return 0; +  err: +	arch_gnttab_vfree(&gnttab_shared_vm_area); +	return -ENOMEM; +} + +#ifdef CONFIG_XEN_PVH +#include <xen/balloon.h> +#include <xen/events.h> +#include <linux/slab.h> +static int __init xlated_setup_gnttab_pages(void)  { +	struct page **pages; +	xen_pfn_t *pfns;  	int rc; -	struct grant_entry *shared = *__shared; - -	if (shared == NULL) { -		struct vm_struct *area = -			xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes); -		BUG_ON(area == NULL); -		shared = area->addr; -		*__shared = shared; +	unsigned int i; +	unsigned long nr_grant_frames = gnttab_max_grant_frames(); + +	BUG_ON(nr_grant_frames == 0); +	pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL); +	if (!pages) +		return -ENOMEM; + +	pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL); +	if (!pfns) { +		kfree(pages); +		return -ENOMEM; +	} +	rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); +	if (rc) { +		pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, +			nr_grant_frames, rc); +		kfree(pages); +		kfree(pfns); +		return rc;  	} +	for (i = 0; i < nr_grant_frames; i++) +		pfns[i] = page_to_pfn(pages[i]); -	rc = apply_to_page_range(&init_mm, (unsigned long)shared, -				 PAGE_SIZE * nr_gframes, -				 map_pte_fn, &frames); -	return rc; +	rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames, +				    &xen_auto_xlat_grant_frames.vaddr); + +	if (rc) { +		pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__, +			nr_grant_frames, rc); +		free_xenballooned_pages(nr_grant_frames, pages); +		kfree(pages); +		kfree(pfns); +		return rc; +	} +	kfree(pages); + +	xen_auto_xlat_grant_frames.pfn = pfns; +	xen_auto_xlat_grant_frames.count = nr_grant_frames; + +	return 0;  } -void arch_gnttab_unmap_shared(struct grant_entry *shared, -			      unsigned long nr_gframes) +static int __init xen_pvh_gnttab_setup(void)  { -	apply_to_page_range(&init_mm, (unsigned long)shared, -			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); +	if (!xen_pvh_domain()) +		return -ENODEV; + +	return xlated_setup_gnttab_pages();  } +/* Call it _before_ __gnttab_init as we need to initialize the + * xen_auto_xlat_grant_frames first. 
*/ +core_initcall(xen_pvh_gnttab_setup); +#endif diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 9d30105a0c4..a1207cb6472 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -5,6 +5,8 @@  #include <xen/interface/xen.h>  #include <xen/interface/sched.h>  #include <xen/interface/vcpu.h> +#include <xen/features.h> +#include <xen/events.h>  #include <asm/xen/hypercall.h>  #include <asm/xen/hypervisor.h> @@ -21,12 +23,12 @@ void xen_force_evtchn_callback(void)  	(void)HYPERVISOR_xen_version(0, NULL);  } -static unsigned long xen_save_fl(void) +asmlinkage __visible unsigned long xen_save_fl(void)  {  	struct vcpu_info *vcpu;  	unsigned long flags; -	vcpu = percpu_read(xen_vcpu); +	vcpu = this_cpu_read(xen_vcpu);  	/* flag has opposite sense of mask */  	flags = !vcpu->evtchn_upcall_mask; @@ -39,54 +41,51 @@ static unsigned long xen_save_fl(void)  }  PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); -static void xen_restore_fl(unsigned long flags) +__visible void xen_restore_fl(unsigned long flags)  {  	struct vcpu_info *vcpu;  	/* convert from IF type flag */  	flags = !(flags & X86_EFLAGS_IF); -	/* There's a one instruction preempt window here.  We need to -	   make sure we're don't switch CPUs between getting the vcpu -	   pointer and updating the mask. */ +	/* See xen_irq_enable() for why preemption must be disabled. */  	preempt_disable(); -	vcpu = percpu_read(xen_vcpu); +	vcpu = this_cpu_read(xen_vcpu);  	vcpu->evtchn_upcall_mask = flags; -	preempt_enable_no_resched(); - -	/* Doesn't matter if we get preempted here, because any -	   pending event will get dealt with anyway. */  	if (flags == 0) { -		preempt_check_resched();  		barrier(); /* unmask then check (avoid races) */  		if (unlikely(vcpu->evtchn_upcall_pending))  			xen_force_evtchn_callback(); -	} +		preempt_enable(); +	} else +		preempt_enable_no_resched();  }  PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); -static void xen_irq_disable(void) +asmlinkage __visible void xen_irq_disable(void)  {  	/* There's a one instruction preempt window here.  We need to  	   make sure we're don't switch CPUs between getting the vcpu  	   pointer and updating the mask. */  	preempt_disable(); -	percpu_read(xen_vcpu)->evtchn_upcall_mask = 1; +	this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;  	preempt_enable_no_resched();  }  PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); -static void xen_irq_enable(void) +asmlinkage __visible void xen_irq_enable(void)  {  	struct vcpu_info *vcpu; -	/* We don't need to worry about being preempted here, since -	   either a) interrupts are disabled, so no preemption, or b) -	   the caller is confused and is trying to re-enable interrupts -	   on an indeterminate processor. */ +	/* +	 * We may be preempted as soon as vcpu->evtchn_upcall_mask is +	 * cleared, so disable preemption to ensure we check for +	 * events on the VCPU we are still running on. 
+	 */ +	preempt_disable(); -	vcpu = percpu_read(xen_vcpu); +	vcpu = this_cpu_read(xen_vcpu);  	vcpu->evtchn_upcall_mask = 0;  	/* Doesn't matter if we get preempted here, because any @@ -95,6 +94,8 @@ static void xen_irq_enable(void)  	barrier(); /* unmask then check (avoid races) */  	if (unlikely(vcpu->evtchn_upcall_pending))  		xen_force_evtchn_callback(); + +	preempt_enable();  }  PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); @@ -113,7 +114,7 @@ static void xen_halt(void)  		xen_safe_halt();  } -static const struct pv_irq_ops xen_irq_ops __initdata = { +static const struct pv_irq_ops xen_irq_ops __initconst = {  	.save_fl = PV_CALLEE_SAVE(xen_save_fl),  	.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),  	.irq_disable = PV_CALLEE_SAVE(xen_irq_disable), @@ -126,8 +127,10 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {  #endif  }; -void __init xen_init_irq_ops() +void __init xen_init_irq_ops(void)  { -	pv_irq_ops = xen_irq_ops; +	/* For PVH we use default pv_irq_ops settings. */ +	if (!xen_feature(XENFEAT_hvm_callback_vector)) +		pv_irq_ops = xen_irq_ops;  	x86_init.irqs.intr_init = xen_init_IRQ;  } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 21ed8d7f75a..e8a1201c329 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -46,6 +46,10 @@  #include <linux/module.h>  #include <linux/gfp.h>  #include <linux/memblock.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> + +#include <trace/events/xen.h>  #include <asm/pgtable.h>  #include <asm/tlbflush.h> @@ -58,6 +62,7 @@  #include <asm/page.h>  #include <asm/init.h>  #include <asm/pat.h> +#include <asm/smp.h>  #include <asm/xen/hypercall.h>  #include <asm/xen/hypervisor.h> @@ -74,68 +79,13 @@  #include "mmu.h"  #include "debugfs.h" -#define MMU_UPDATE_HISTO	30 -  /*   * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and driver_pages, and - * balloon lists. + * Also protects non-atomic updates of current_pages and balloon lists.   */  DEFINE_SPINLOCK(xen_reservation_lock); -#ifdef CONFIG_XEN_DEBUG_FS - -static struct { -	u32 pgd_update; -	u32 pgd_update_pinned; -	u32 pgd_update_batched; - -	u32 pud_update; -	u32 pud_update_pinned; -	u32 pud_update_batched; - -	u32 pmd_update; -	u32 pmd_update_pinned; -	u32 pmd_update_batched; - -	u32 pte_update; -	u32 pte_update_pinned; -	u32 pte_update_batched; - -	u32 mmu_update; -	u32 mmu_update_extended; -	u32 mmu_update_histo[MMU_UPDATE_HISTO]; - -	u32 prot_commit; -	u32 prot_commit_batched; - -	u32 set_pte_at; -	u32 set_pte_at_batched; -	u32 set_pte_at_pinned; -	u32 set_pte_at_current; -	u32 set_pte_at_kernel; -} mmu_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ -	if (unlikely(zero_stats)) { -		memset(&mmu_stats, 0, sizeof(mmu_stats)); -		zero_stats = 0; -	} -} - -#define ADD_STATS(elem, val)			\ -	do { check_zero(); mmu_stats.elem += (val); } while(0) - -#else  /* !CONFIG_XEN_DEBUG_FS */ - -#define ADD_STATS(elem, val)	do { (void)(val); } while(0) - -#endif /* CONFIG_XEN_DEBUG_FS */ - - +#ifdef CONFIG_X86_32  /*   * Identity map, in addition to plain kernel map.  This needs to be   * large enough to allocate page table pages to allocate the rest. 
@@ -143,7 +93,7 @@ static inline void check_zero(void)   */  #define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)  static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); - +#endif  #ifdef CONFIG_X86_64  /* l3 pud for userspace vsyscall mapping */  static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; @@ -173,371 +123,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */   */  #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -/* - * Xen leaves the responsibility for maintaining p2m mappings to the - * guests themselves, but it must also access and update the p2m array - * during suspend/resume when all the pages are reallocated. - * - * The p2m table is logically a flat array, but we implement it as a - * three-level tree to allow the address space to be sparse. - * - *                               Xen - *                                | - *     p2m_top              p2m_top_mfn - *       /  \                   /   \ - * p2m_mid p2m_mid	p2m_mid_mfn p2m_mid_mfn - *    / \      / \         /           / - *  p2m p2m p2m p2m p2m p2m p2m ... - * - * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. - * - * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the - * maximum representable pseudo-physical address space is: - *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages - * - * P2M_PER_PAGE depends on the architecture, as a mfn is always - * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively.  - */ - -unsigned long xen_max_p2m_pfn __read_mostly; - -#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -/* Placeholders for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); - -RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); - -static inline unsigned p2m_top_index(unsigned long pfn) -{ -	BUG_ON(pfn >= MAX_P2M_PFN); -	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); -} - -static inline unsigned p2m_mid_index(unsigned long pfn) -{ -	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; -} - -static inline unsigned p2m_index(unsigned long pfn) -{ -	return pfn % P2M_PER_PAGE; -} - -static void p2m_top_init(unsigned long ***top) -{ -	unsigned i; - -	for (i = 0; i < P2M_TOP_PER_PAGE; i++) -		top[i] = p2m_mid_missing; -} - -static void p2m_top_mfn_init(unsigned long *top) -{ -	unsigned i; - -	for (i = 0; i < P2M_TOP_PER_PAGE; i++) -		top[i] = virt_to_mfn(p2m_mid_missing_mfn); -} - -static void p2m_top_mfn_p_init(unsigned long **top) -{ -	unsigned i; - -	for (i = 0; i < P2M_TOP_PER_PAGE; i++) -		top[i] = p2m_mid_missing_mfn; -} - -static void p2m_mid_init(unsigned long **mid) -{ -	unsigned i; - -	for (i = 0; i < P2M_MID_PER_PAGE; i++) -		mid[i] = p2m_missing; -} - -static void p2m_mid_mfn_init(unsigned long 
*mid) -{ -	unsigned i; - -	for (i = 0; i < P2M_MID_PER_PAGE; i++) -		mid[i] = virt_to_mfn(p2m_missing); -} - -static void p2m_init(unsigned long *p2m) -{ -	unsigned i; - -	for (i = 0; i < P2M_MID_PER_PAGE; i++) -		p2m[i] = INVALID_P2M_ENTRY; -} - -/* - * Build the parallel p2m_top_mfn and p2m_mid_mfn structures - * - * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() - *   to allocate memory. - * - * - After resume we're called from within stop_machine, but the mfn - *   tree should alreay be completely allocated. - */ -void xen_build_mfn_list_list(void) -{ -	unsigned long pfn; - -	/* Pre-initialize p2m_top_mfn to be completely missing */ -	if (p2m_top_mfn == NULL) { -		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); -		p2m_mid_mfn_init(p2m_mid_missing_mfn); - -		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); -		p2m_top_mfn_p_init(p2m_top_mfn_p); - -		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); -		p2m_top_mfn_init(p2m_top_mfn); -	} else { -		/* Reinitialise, mfn's all change after migration */ -		p2m_mid_mfn_init(p2m_mid_missing_mfn); -	} - -	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { -		unsigned topidx = p2m_top_index(pfn); -		unsigned mididx = p2m_mid_index(pfn); -		unsigned long **mid; -		unsigned long *mid_mfn_p; - -		mid = p2m_top[topidx]; -		mid_mfn_p = p2m_top_mfn_p[topidx]; - -		/* Don't bother allocating any mfn mid levels if -		 * they're just missing, just update the stored mfn, -		 * since all could have changed over a migrate. -		 */ -		if (mid == p2m_mid_missing) { -			BUG_ON(mididx); -			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); -			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); -			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; -			continue; -		} - -		if (mid_mfn_p == p2m_mid_missing_mfn) { -			/* -			 * XXX boot-time only!  We should never find -			 * missing parts of the mfn tree after -			 * runtime.  extend_brk() will BUG if we call -			 * it too late. -			 */ -			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); -			p2m_mid_mfn_init(mid_mfn_p); - -			p2m_top_mfn_p[topidx] = mid_mfn_p; -		} - -		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); -		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); -	} -} - -void xen_setup_mfn_list_list(void) -{ -	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - -	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = -		virt_to_mfn(p2m_top_mfn); -	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; -} - -/* Set up p2m_top to point to the domain-builder provided p2m pages */ -void __init xen_build_dynamic_phys_to_machine(void) -{ -	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; -	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); -	unsigned long pfn; - -	xen_max_p2m_pfn = max_pfn; - -	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); -	p2m_init(p2m_missing); - -	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); -	p2m_mid_init(p2m_mid_missing); - -	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); -	p2m_top_init(p2m_top); - -	/* -	 * The domain builder gives us a pre-constructed p2m array in -	 * mfn_list for all the pages initially given to us, so we just -	 * need to graft that into our tree structure. 
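[Editor's note] None of the p2m code deleted in this hunk is lost: the whole machinery, from the tree description above down through set_phys_to_machine() below, moves into the new arch/x86/xen/p2m.c introduced elsewhere in this diff, where the lookup keeps the same three-level shape. As a compressed reminder of how a pfn decomposes into those levels, a sketch using the helpers exactly as removed here (p2m_lookup_sketch is a hypothetical name, not part of the patch):

	static unsigned long p2m_lookup_sketch(unsigned long pfn)
	{
		unsigned topidx, mididx, idx;

		if (pfn >= MAX_P2M_PFN)
			return INVALID_P2M_ENTRY;

		topidx = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); /* selects a p2m_mid page  */
		mididx = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; /* selects a leaf p2m page */
		idx    = pfn % P2M_PER_PAGE;                      /* entry within that page  */

		return p2m_top[topidx][mididx][idx];
	}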
-	 */ -	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { -		unsigned topidx = p2m_top_index(pfn); -		unsigned mididx = p2m_mid_index(pfn); - -		if (p2m_top[topidx] == p2m_mid_missing) { -			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); -			p2m_mid_init(mid); - -			p2m_top[topidx] = mid; -		} - -		p2m_top[topidx][mididx] = &mfn_list[pfn]; -	} -} - -unsigned long get_phys_to_machine(unsigned long pfn) -{ -	unsigned topidx, mididx, idx; - -	if (unlikely(pfn >= MAX_P2M_PFN)) -		return INVALID_P2M_ENTRY; - -	topidx = p2m_top_index(pfn); -	mididx = p2m_mid_index(pfn); -	idx = p2m_index(pfn); - -	return p2m_top[topidx][mididx][idx]; -} -EXPORT_SYMBOL_GPL(get_phys_to_machine); - -static void *alloc_p2m_page(void) -{ -	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); -} - -static void free_p2m_page(void *p) -{ -	free_page((unsigned long)p); -} - -/*  - * Fully allocate the p2m structure for a given pfn.  We need to check - * that both the top and mid levels are allocated, and make sure the - * parallel mfn tree is kept in sync.  We may race with other cpus, so - * the new pages are installed with cmpxchg; if we lose the race then - * simply free the page we allocated and use the one that's there. - */ -static bool alloc_p2m(unsigned long pfn) -{ -	unsigned topidx, mididx; -	unsigned long ***top_p, **mid; -	unsigned long *top_mfn_p, *mid_mfn; - -	topidx = p2m_top_index(pfn); -	mididx = p2m_mid_index(pfn); - -	top_p = &p2m_top[topidx]; -	mid = *top_p; - -	if (mid == p2m_mid_missing) { -		/* Mid level is missing, allocate a new one */ -		mid = alloc_p2m_page(); -		if (!mid) -			return false; - -		p2m_mid_init(mid); - -		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) -			free_p2m_page(mid); -	} - -	top_mfn_p = &p2m_top_mfn[topidx]; -	mid_mfn = p2m_top_mfn_p[topidx]; - -	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); - -	if (mid_mfn == p2m_mid_missing_mfn) { -		/* Separately check the mid mfn level */ -		unsigned long missing_mfn; -		unsigned long mid_mfn_mfn; - -		mid_mfn = alloc_p2m_page(); -		if (!mid_mfn) -			return false; - -		p2m_mid_mfn_init(mid_mfn); - -		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); -		mid_mfn_mfn = virt_to_mfn(mid_mfn); -		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) -			free_p2m_page(mid_mfn); -		else -			p2m_top_mfn_p[topidx] = mid_mfn; -	} - -	if (p2m_top[topidx][mididx] == p2m_missing) { -		/* p2m leaf page is missing */ -		unsigned long *p2m; - -		p2m = alloc_p2m_page(); -		if (!p2m) -			return false; - -		p2m_init(p2m); - -		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) -			free_p2m_page(p2m); -		else -			mid_mfn[mididx] = virt_to_mfn(p2m); -	} - -	return true; -} - -/* Try to install p2m mapping; fail if intermediate bits missing */ -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ -	unsigned topidx, mididx, idx; - -	if (unlikely(pfn >= MAX_P2M_PFN)) { -		BUG_ON(mfn != INVALID_P2M_ENTRY); -		return true; -	} - -	topidx = p2m_top_index(pfn); -	mididx = p2m_mid_index(pfn); -	idx = p2m_index(pfn); - -	if (p2m_top[topidx][mididx] == p2m_missing) -		return mfn == INVALID_P2M_ENTRY; - -	p2m_top[topidx][mididx][idx] = mfn; - -	return true; -} - -bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ -	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { -		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); -		return true; -	} - -	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  { -		if (!alloc_p2m(pfn)) -			return false; - -		if (!__set_phys_to_machine(pfn, mfn)) -			
return false; -	} - -	return true; -} -  unsigned long arbitrary_virt_to_mfn(void *vaddr)  {  	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); @@ -566,6 +151,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)  	offset = address & ~PAGE_MASK;  	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);  } +EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);  void make_lowmem_page_readonly(void *vaddr)  { @@ -607,21 +193,18 @@ static bool xen_page_pinned(void *ptr)  	return PagePinned(page);  } -static bool xen_iomap_pte(pte_t pte) -{ -	return pte_flags(pte) & _PAGE_IOMAP; -} -  void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)  {  	struct multicall_space mcs;  	struct mmu_update *u; +	trace_xen_mmu_set_domain_pte(ptep, pteval, domid); +  	mcs = xen_mc_entry(sizeof(*u));  	u = mcs.args;  	/* ptep might be kmapped when using 32-bit HIGHPTE */ -	u->ptr = arbitrary_virt_to_machine(ptep).maddr; +	u->ptr = virt_to_machine(ptep).maddr;  	u->val = pte_val_ma(pteval);  	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); @@ -630,11 +213,6 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)  }  EXPORT_SYMBOL_GPL(xen_set_domain_pte); -static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) -{ -	xen_set_domain_pte(ptep, pteval, DOMID_IO); -} -  static void xen_extend_mmu_update(const struct mmu_update *update)  {  	struct multicall_space mcs; @@ -643,27 +221,35 @@ static void xen_extend_mmu_update(const struct mmu_update *update)  	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));  	if (mcs.mc != NULL) { -		ADD_STATS(mmu_update_extended, 1); -		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); -  		mcs.mc->args[1]++; - -		if (mcs.mc->args[1] < MMU_UPDATE_HISTO) -			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); -		else -			ADD_STATS(mmu_update_histo[0], 1);  	} else { -		ADD_STATS(mmu_update, 1);  		mcs = __xen_mc_entry(sizeof(*u));  		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); -		ADD_STATS(mmu_update_histo[1], 1);  	}  	u = mcs.args;  	*u = *update;  } -void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +static void xen_extend_mmuext_op(const struct mmuext_op *op) +{ +	struct multicall_space mcs; +	struct mmuext_op *u; + +	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); + +	if (mcs.mc != NULL) { +		mcs.mc->args[1]++; +	} else { +		mcs = __xen_mc_entry(sizeof(*u)); +		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); +	} + +	u = mcs.args; +	*u = *op; +} + +static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)  {  	struct mmu_update u; @@ -676,16 +262,14 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)  	u.val = pmd_val_ma(val);  	xen_extend_mmu_update(&u); -	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); -  	xen_mc_issue(PARAVIRT_LAZY_MMU);  	preempt_enable();  } -void xen_set_pmd(pmd_t *ptr, pmd_t val) +static void xen_set_pmd(pmd_t *ptr, pmd_t val)  { -	ADD_STATS(pmd_update, 1); +	trace_xen_mmu_set_pmd(ptr, val);  	/* If page is not pinned, we can just update the entry  	   directly */ @@ -694,8 +278,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)  		return;  	} -	ADD_STATS(pmd_update_pinned, 1); -  	xen_set_pmd_hyper(ptr, val);  } @@ -708,41 +290,60 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)  	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));  } -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, -		    pte_t *ptep, pte_t pteval) +static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)  { -	if (xen_iomap_pte(pteval)) { -		xen_set_iomap_pte(ptep, 
pteval); -		goto out; -	} +	struct mmu_update u; -	ADD_STATS(set_pte_at, 1); -//	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); -	ADD_STATS(set_pte_at_current, mm == current->mm); -	ADD_STATS(set_pte_at_kernel, mm == &init_mm); +	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) +		return false; -	if (mm == current->mm || mm == &init_mm) { -		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { -			struct multicall_space mcs; -			mcs = xen_mc_entry(0); +	xen_mc_batch(); -			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); -			ADD_STATS(set_pte_at_batched, 1); -			xen_mc_issue(PARAVIRT_LAZY_MMU); -			goto out; -		} else -			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) -				goto out; +	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; +	u.val = pte_val_ma(pteval); +	xen_extend_mmu_update(&u); + +	xen_mc_issue(PARAVIRT_LAZY_MMU); + +	return true; +} + +static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) +{ +	if (!xen_batched_set_pte(ptep, pteval)) { +		/* +		 * Could call native_set_pte() here and trap and +		 * emulate the PTE write but with 32-bit guests this +		 * needs two traps (one for each of the two 32-bit +		 * words in the PTE) so do one hypercall directly +		 * instead. +		 */ +		struct mmu_update u; + +		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; +		u.val = pte_val_ma(pteval); +		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);  	} -	xen_set_pte(ptep, pteval); +} + +static void xen_set_pte(pte_t *ptep, pte_t pteval) +{ +	trace_xen_mmu_set_pte(ptep, pteval); +	__xen_set_pte(ptep, pteval); +} -out:	return; +static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, +		    pte_t *ptep, pte_t pteval) +{ +	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); +	__xen_set_pte(ptep, pteval);  }  pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,  				 unsigned long addr, pte_t *ptep)  {  	/* Just return the pte as-is.  We preserve the bits on commit */ +	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);  	return *ptep;  } @@ -751,15 +352,13 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,  {  	struct mmu_update u; +	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);  	xen_mc_batch(); -	u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; +	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;  	u.val = pte_val_ma(pte);  	xen_extend_mmu_update(&u); -	ADD_STATS(prot_commit, 1); -	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); -  	xen_mc_issue(PARAVIRT_LAZY_MMU);  } @@ -768,8 +367,13 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)  {  	if (val & _PAGE_PRESENT) {  		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; +		unsigned long pfn = mfn_to_pfn(mfn); +  		pteval_t flags = val & PTE_FLAGS_MASK; -		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; +		if (unlikely(pfn == ~0)) +			val = flags & ~_PAGE_PRESENT; +		else +			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;  	}  	return val; @@ -780,8 +384,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)  	if (val & _PAGE_PRESENT) {  		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;  		pteval_t flags = val & PTE_FLAGS_MASK; -		unsigned long mfn = pfn_to_mfn(pfn); +		unsigned long mfn; +		if (!xen_feature(XENFEAT_auto_translated_physmap)) +			mfn = get_phys_to_machine(pfn); +		else +			mfn = pfn;  		/*  		 * If there's no mfn for the pfn, then just create an  		 * empty non-present pte.  
Unfortunately this loses @@ -791,8 +399,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)  		if (unlikely(mfn == INVALID_P2M_ENTRY)) {  			mfn = 0;  			flags = 0; +		} else { +			/* +			 * Paramount to do this test _after_ the +			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & +			 * IDENTITY_FRAME_BIT resolves to true. +			 */ +			mfn &= ~FOREIGN_FRAME_BIT; +			if (mfn & IDENTITY_FRAME_BIT) { +				mfn &= ~IDENTITY_FRAME_BIT; +				flags |= _PAGE_IOMAP; +			}  		} -  		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;  	} @@ -813,16 +431,16 @@ static pteval_t iomap_pte(pteval_t val)  	return val;  } -pteval_t xen_pte_val(pte_t pte) +__visible pteval_t xen_pte_val(pte_t pte)  {  	pteval_t pteval = pte.pte; - +#if 0  	/* If this is a WC pte, convert back from Xen WC to Linux WC */  	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {  		WARN_ON(!pat_enabled);  		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;  	} - +#endif  	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))  		return pteval; @@ -830,7 +448,7 @@ pteval_t xen_pte_val(pte_t pte)  }  PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); -pgdval_t xen_pgd_val(pgd_t pgd) +__visible pgdval_t xen_pgd_val(pgd_t pgd)  {  	return pte_mfn_to_pfn(pgd.pgd);  } @@ -850,8 +468,8 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);   * 3        PCD PWT      UC       UC     UC   * 4    PAT              WB       WC     WB   * 5    PAT     PWT      WC       WP     WT - * 6    PAT PCD          UC-      UC     UC- - * 7    PAT PCD PWT      UC       UC     UC + * 6    PAT PCD          UC-      rsv    UC- + * 7    PAT PCD PWT      UC       rsv    UC   */  void xen_set_pat(u64 pat) @@ -861,10 +479,10 @@ void xen_set_pat(u64 pat)  	WARN_ON(pat != 0x0007010600070106ull);  } -pte_t xen_make_pte(pteval_t pte) +__visible pte_t xen_make_pte(pteval_t pte)  {  	phys_addr_t addr = (pte & PTE_PFN_MASK); - +#if 0  	/* If Linux is trying to set a WC pte, then map to the Xen WC.  	 * If _PAGE_PAT is set, then it probably means it is really  	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope @@ -877,7 +495,7 @@ pte_t xen_make_pte(pteval_t pte)  		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)  			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;  	} - +#endif  	/*  	 * Unprivileged domains are allowed to do IOMAPpings for  	 * PCI passthrough, but not map ISA space.  
The ISA @@ -896,20 +514,20 @@ pte_t xen_make_pte(pteval_t pte)  }  PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); -pgd_t xen_make_pgd(pgdval_t pgd) +__visible pgd_t xen_make_pgd(pgdval_t pgd)  {  	pgd = pte_pfn_to_mfn(pgd);  	return native_make_pgd(pgd);  }  PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); -pmdval_t xen_pmd_val(pmd_t pmd) +__visible pmdval_t xen_pmd_val(pmd_t pmd)  {  	return pte_mfn_to_pfn(pmd.pmd);  }  PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); -void xen_set_pud_hyper(pud_t *ptr, pud_t val) +static void xen_set_pud_hyper(pud_t *ptr, pud_t val)  {  	struct mmu_update u; @@ -922,16 +540,14 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)  	u.val = pud_val_ma(val);  	xen_extend_mmu_update(&u); -	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); -  	xen_mc_issue(PARAVIRT_LAZY_MMU);  	preempt_enable();  } -void xen_set_pud(pud_t *ptr, pud_t val) +static void xen_set_pud(pud_t *ptr, pud_t val)  { -	ADD_STATS(pud_update, 1); +	trace_xen_mmu_set_pud(ptr, val);  	/* If page is not pinned, we can just update the entry  	   directly */ @@ -940,56 +556,31 @@ void xen_set_pud(pud_t *ptr, pud_t val)  		return;  	} -	ADD_STATS(pud_update_pinned, 1); -  	xen_set_pud_hyper(ptr, val);  } -void xen_set_pte(pte_t *ptep, pte_t pte) -{ -	if (xen_iomap_pte(pte)) { -		xen_set_iomap_pte(ptep, pte); -		return; -	} - -	ADD_STATS(pte_update, 1); -//	ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); -	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - -#ifdef CONFIG_X86_PAE -	ptep->pte_high = pte.pte_high; -	smp_wmb(); -	ptep->pte_low = pte.pte_low; -#else -	*ptep = pte; -#endif -} -  #ifdef CONFIG_X86_PAE -void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)  { -	if (xen_iomap_pte(pte)) { -		xen_set_iomap_pte(ptep, pte); -		return; -	} - +	trace_xen_mmu_set_pte_atomic(ptep, pte);  	set_64bit((u64 *)ptep, native_pte_val(pte));  } -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)  { -	ptep->pte_low = 0; -	smp_wmb();		/* make sure low gets written first */ -	ptep->pte_high = 0; +	trace_xen_mmu_pte_clear(mm, addr, ptep); +	if (!xen_batched_set_pte(ptep, native_make_pte(0))) +		native_pte_clear(mm, addr, ptep);  } -void xen_pmd_clear(pmd_t *pmdp) +static void xen_pmd_clear(pmd_t *pmdp)  { +	trace_xen_mmu_pmd_clear(pmdp);  	set_pmd(pmdp, __pmd(0));  }  #endif	/* CONFIG_X86_PAE */ -pmd_t xen_make_pmd(pmdval_t pmd) +__visible pmd_t xen_make_pmd(pmdval_t pmd)  {  	pmd = pte_pfn_to_mfn(pmd);  	return native_make_pmd(pmd); @@ -997,13 +588,13 @@ pmd_t xen_make_pmd(pmdval_t pmd)  PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);  #if PAGETABLE_LEVELS == 4 -pudval_t xen_pud_val(pud_t pud) +__visible pudval_t xen_pud_val(pud_t pud)  {  	return pte_mfn_to_pfn(pud.pud);  }  PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); -pud_t xen_make_pud(pudval_t pud) +__visible pud_t xen_make_pud(pudval_t pud)  {  	pud = pte_pfn_to_mfn(pud); @@ -1011,7 +602,7 @@ pud_t xen_make_pud(pudval_t pud)  }  PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); -pgd_t *xen_get_user_pgd(pgd_t *pgd) +static pgd_t *xen_get_user_pgd(pgd_t *pgd)  {  	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);  	unsigned offset = pgd - pgd_page; @@ -1043,7 +634,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)   *  2. It is always pinned   *  3. 
It has no user pagetable attached to it   */ -void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)  {  	preempt_disable(); @@ -1056,11 +647,11 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)  	preempt_enable();  } -void xen_set_pgd(pgd_t *ptr, pgd_t val) +static void xen_set_pgd(pgd_t *ptr, pgd_t val)  {  	pgd_t *user_ptr = xen_get_user_pgd(ptr); -	ADD_STATS(pgd_update, 1); +	trace_xen_mmu_set_pgd(ptr, user_ptr, val);  	/* If page is not pinned, we can just update the entry  	   directly */ @@ -1073,9 +664,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)  		return;  	} -	ADD_STATS(pgd_update_pinned, 1); -	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); -  	/* If it's pinned, then we can at least batch the kernel and  	   user updates together. */  	xen_mc_batch(); @@ -1208,8 +796,8 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)  {  	spinlock_t *ptl = NULL; -#if USE_SPLIT_PTLOCKS -	ptl = __pte_lockptr(page); +#if USE_SPLIT_PTE_PTLOCKS +	ptl = ptlock_ptr(page);  	spin_lock_nest_lock(ptl, &mm->page_table_lock);  #endif @@ -1224,14 +812,12 @@ static void xen_pte_unlock(void *v)  static void xen_do_pin(unsigned level, unsigned long pfn)  { -	struct mmuext_op *op; -	struct multicall_space mcs; +	struct mmuext_op op; -	mcs = __xen_mc_entry(sizeof(*op)); -	op = mcs.args; -	op->cmd = level; -	op->arg1.mfn = pfn_to_mfn(pfn); -	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); +	op.cmd = level; +	op.arg1.mfn = pfn_to_mfn(pfn); + +	xen_extend_mmuext_op(&op);  }  static int xen_pin_page(struct mm_struct *mm, struct page *page, @@ -1299,6 +885,8 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,     read-only, and can be pinned. */  static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)  { +	trace_xen_mmu_pgd_pin(mm, pgd); +  	xen_mc_batch();  	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { @@ -1350,10 +938,9 @@ static void xen_pgd_pin(struct mm_struct *mm)   */  void xen_mm_pin_all(void)  { -	unsigned long flags;  	struct page *page; -	spin_lock_irqsave(&pgd_lock, flags); +	spin_lock(&pgd_lock);  	list_for_each_entry(page, &pgd_list, lru) {  		if (!PagePinned(page)) { @@ -1362,7 +949,7 @@ void xen_mm_pin_all(void)  		}  	} -	spin_unlock_irqrestore(&pgd_lock, flags); +	spin_unlock(&pgd_lock);  }  /* @@ -1370,7 +957,7 @@ void xen_mm_pin_all(void)   * that's before we have page structures to store the bits.  So do all   * the book-keeping now.   
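[Editor's note] The xen_do_pin() rewrite above is part of a broader pattern in this patch: instead of each helper carving out its own multicall slot, operations are appended to the current batch with xen_extend_mmuext_op()/xen_extend_mmu_update() and flushed in one hypercall. A minimal sketch of the pattern, using a hypothetical helper in this file (not part of the patch):

	static void example_pin_pte_page(unsigned long pfn)
	{
		struct mmuext_op op = {
			.cmd      = MMUEXT_PIN_L1_TABLE,
			.arg1.mfn = pfn_to_mfn(pfn),
		};

		xen_mc_batch();			/* start batching hypercalls      */
		xen_extend_mmuext_op(&op);	/* queue the op, no hypercall yet */
		xen_mc_issue(0);		/* flush the whole batch now      */
	}

Callers such as __xen_pgd_pin() open the batch once, queue every page's pin via xen_do_pin(), and issue a single multicall at the end.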
*/ -static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, +static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,  				  enum pt_level level)  {  	SetPagePinned(page); @@ -1425,6 +1012,8 @@ static int xen_unpin_page(struct mm_struct *mm, struct page *page,  /* Release a pagetables pages back as normal RW */  static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)  { +	trace_xen_mmu_pgd_unpin(mm, pgd); +  	xen_mc_batch();  	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); @@ -1463,10 +1052,9 @@ static void xen_pgd_unpin(struct mm_struct *mm)   */  void xen_mm_unpin_all(void)  { -	unsigned long flags;  	struct page *page; -	spin_lock_irqsave(&pgd_lock, flags); +	spin_lock(&pgd_lock);  	list_for_each_entry(page, &pgd_list, lru) {  		if (PageSavePinned(page)) { @@ -1476,17 +1064,17 @@ void xen_mm_unpin_all(void)  		}  	} -	spin_unlock_irqrestore(&pgd_lock, flags); +	spin_unlock(&pgd_lock);  } -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)  {  	spin_lock(&next->page_table_lock);  	xen_pgd_pin(next);  	spin_unlock(&next->page_table_lock);  } -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)  {  	spin_lock(&mm->page_table_lock);  	xen_pgd_pin(mm); @@ -1502,14 +1090,14 @@ static void drop_other_mm_ref(void *info)  	struct mm_struct *mm = info;  	struct mm_struct *active_mm; -	active_mm = percpu_read(cpu_tlbstate.active_mm); +	active_mm = this_cpu_read(cpu_tlbstate.active_mm); -	if (active_mm == mm) +	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)  		leave_mm(smp_processor_id());  	/* If this cpu still has a stale cr3 reference, then make sure  	   it has been flushed. */ -	if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) +	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))  		load_cr3(swapper_pg_dir);  } @@ -1573,7 +1161,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)   * pagetable because of lazy tlb flushing.  This means we need need to   * switch all CPUs off this pagetable before we can unpin it.   */ -void xen_exit_mmap(struct mm_struct *mm) +static void xen_exit_mmap(struct mm_struct *mm)  {  	get_cpu();		/* make sure we don't move around */  	xen_drop_mm_ref(mm); @@ -1588,38 +1176,134 @@ void xen_exit_mmap(struct mm_struct *mm)  	spin_unlock(&mm->page_table_lock);  } -static __init void xen_pagetable_setup_start(pgd_t *base) +static void xen_post_allocator_init(void); + +#ifdef CONFIG_X86_64 +static void __init xen_cleanhighmap(unsigned long vaddr, +				    unsigned long vaddr_end)  { +	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; +	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); + +	/* NOTE: The loop is more greedy than the cleanup_highmap variant. +	 * We include the PMD passed in on _both_ boundaries. */ +	for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); +			pmd++, vaddr += PMD_SIZE) { +		if (pmd_none(*pmd)) +			continue; +		if (vaddr < (unsigned long) _text || vaddr > kernel_end) +			set_pmd(pmd, __pmd(0)); +	} +	/* In case we did something silly, we should crash in this function +	 * instead of somewhere later and be confusing. 
*/ +	xen_mc_flush();  } +static void __init xen_pagetable_p2m_copy(void) +{ +	unsigned long size; +	unsigned long addr; +	unsigned long new_mfn_list; -static void xen_post_allocator_init(void); +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return; + +	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + +	new_mfn_list = xen_revector_p2m_tree(); +	/* No memory or already called. */ +	if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) +		return; + +	/* using __ka address and sticking INVALID_P2M_ENTRY! */ +	memset((void *)xen_start_info->mfn_list, 0xff, size); + +	/* We should be in __ka space. */ +	BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); +	addr = xen_start_info->mfn_list; +	/* We roundup to the PMD, which means that if anybody at this stage is +	 * using the __ka address of xen_start_info or xen_start_info->shared_info +	 * they are in going to crash. Fortunatly we have already revectored +	 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ +	size = roundup(size, PMD_SIZE); +	xen_cleanhighmap(addr, addr + size); + +	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); +	memblock_free(__pa(xen_start_info->mfn_list), size); +	/* And revector! Bye bye old array */ +	xen_start_info->mfn_list = new_mfn_list; + +	/* At this stage, cleanup_highmap has already cleaned __ka space +	 * from _brk_limit way up to the max_pfn_mapped (which is the end of +	 * the ramdisk). We continue on, erasing PMD entries that point to page +	 * tables - do note that they are accessible at this stage via __va. +	 * For good measure we also round up to the PMD - which means that if +	 * anybody is using __ka address to the initial boot-stack - and try +	 * to use it - they are going to crash. The xen_start_info has been +	 * taken care of already in xen_setup_kernel_pagetable. */ +	addr = xen_start_info->pt_base; +	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + +	xen_cleanhighmap(addr, addr + size); +	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); +#ifdef DEBUG +	/* This is superflous and is not neccessary, but you know what +	 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of +	 * anything at this stage. 
*/ +	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); +#endif +} +#endif -static __init void xen_pagetable_setup_done(pgd_t *base) +static void __init xen_pagetable_init(void)  { +	paging_init();  	xen_setup_shared_info(); +#ifdef CONFIG_X86_64 +	xen_pagetable_p2m_copy(); +#endif  	xen_post_allocator_init();  } -  static void xen_write_cr2(unsigned long cr2)  { -	percpu_read(xen_vcpu)->arch.cr2 = cr2; +	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;  }  static unsigned long xen_read_cr2(void)  { -	return percpu_read(xen_vcpu)->arch.cr2; +	return this_cpu_read(xen_vcpu)->arch.cr2;  }  unsigned long xen_read_cr2_direct(void)  { -	return percpu_read(xen_vcpu_info.arch.cr2); +	return this_cpu_read(xen_vcpu_info.arch.cr2);  } +void xen_flush_tlb_all(void) +{ +	struct mmuext_op *op; +	struct multicall_space mcs; + +	trace_xen_mmu_flush_tlb_all(0); + +	preempt_disable(); + +	mcs = xen_mc_entry(sizeof(*op)); + +	op = mcs.args; +	op->cmd = MMUEXT_TLB_FLUSH_ALL; +	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + +	xen_mc_issue(PARAVIRT_LAZY_MMU); + +	preempt_enable(); +}  static void xen_flush_tlb(void)  {  	struct mmuext_op *op;  	struct multicall_space mcs; +	trace_xen_mmu_flush_tlb(0); +  	preempt_disable();  	mcs = xen_mc_entry(sizeof(*op)); @@ -1638,6 +1322,8 @@ static void xen_flush_tlb_single(unsigned long addr)  	struct mmuext_op *op;  	struct multicall_space mcs; +	trace_xen_mmu_flush_tlb_single(addr); +  	preempt_disable();  	mcs = xen_mc_entry(sizeof(*op)); @@ -1652,14 +1338,21 @@ static void xen_flush_tlb_single(unsigned long addr)  }  static void xen_flush_tlb_others(const struct cpumask *cpus, -				 struct mm_struct *mm, unsigned long va) +				 struct mm_struct *mm, unsigned long start, +				 unsigned long end)  {  	struct {  		struct mmuext_op op; +#ifdef CONFIG_SMP +		DECLARE_BITMAP(mask, num_processors); +#else  		DECLARE_BITMAP(mask, NR_CPUS); +#endif  	} *args;  	struct multicall_space mcs; +	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); +  	if (cpumask_empty(cpus))  		return;		/* nothing to do */ @@ -1671,11 +1364,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,  	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);  	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); -	if (va == TLB_FLUSH_ALL) { -		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; -	} else { +	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; +	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {  		args->op.cmd = MMUEXT_INVLPG_MULTI; -		args->op.arg1.linear_addr = va; +		args->op.arg1.linear_addr = start;  	}  	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); @@ -1685,20 +1377,21 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,  static unsigned long xen_read_cr3(void)  { -	return percpu_read(xen_cr3); +	return this_cpu_read(xen_cr3);  }  static void set_current_cr3(void *v)  { -	percpu_write(xen_current_cr3, (unsigned long)v); +	this_cpu_write(xen_current_cr3, (unsigned long)v);  }  static void __xen_write_cr3(bool kernel, unsigned long cr3)  { -	struct mmuext_op *op; -	struct multicall_space mcs; +	struct mmuext_op op;  	unsigned long mfn; +	trace_xen_mmu_write_cr3(kernel, cr3); +  	if (cr3)  		mfn = pfn_to_mfn(PFN_DOWN(cr3));  	else @@ -1706,23 +1399,19 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)  	WARN_ON(mfn == 0 && kernel); -	mcs = __xen_mc_entry(sizeof(*op)); - -	op = mcs.args; -	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; -	op->arg1.mfn = mfn; +	op.cmd = kernel ? 
MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; +	op.arg1.mfn = mfn; -	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); +	xen_extend_mmuext_op(&op);  	if (kernel) { -		percpu_write(xen_cr3, cr3); +		this_cpu_write(xen_cr3, cr3);  		/* Update xen_current_cr3 once the batch has actually  		   been submitted. */  		xen_mc_callback(set_current_cr3, (void *)cr3);  	}  } -  static void xen_write_cr3(unsigned long cr3)  {  	BUG_ON(preemptible()); @@ -1731,7 +1420,7 @@ static void xen_write_cr3(unsigned long cr3)  	/* Update while interrupts are disabled, so its atomic with  	   respect to ipis */ -	percpu_write(xen_cr3, cr3); +	this_cpu_write(xen_cr3, cr3);  	__xen_write_cr3(true, cr3); @@ -1748,6 +1437,43 @@ static void xen_write_cr3(unsigned long cr3)  	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */  } +#ifdef CONFIG_X86_64 +/* + * At the start of the day - when Xen launches a guest, it has already + * built pagetables for the guest. We diligently look over them + * in xen_setup_kernel_pagetable and graft as appropiate them in the + * init_level4_pgt and its friends. Then when we are happy we load + * the new init_level4_pgt - and continue on. + * + * The generic code starts (start_kernel) and 'init_mem_mapping' sets + * up the rest of the pagetables. When it has completed it loads the cr3. + * N.B. that baremetal would start at 'start_kernel' (and the early + * #PF handler would create bootstrap pagetables) - so we are running + * with the same assumptions as what to do when write_cr3 is executed + * at this point. + * + * Since there are no user-page tables at all, we have two variants + * of xen_write_cr3 - the early bootup (this one), and the late one + * (xen_write_cr3). The reason we have to do that is that in 64-bit + * the Linux kernel and user-space are both in ring 3 while the + * hypervisor is in ring 0. + */ +static void __init xen_write_cr3_init(unsigned long cr3) +{ +	BUG_ON(preemptible()); + +	xen_mc_batch();  /* disables interrupts */ + +	/* Update while interrupts are disabled, so its atomic with +	   respect to ipis */ +	this_cpu_write(xen_cr3, cr3); + +	__xen_write_cr3(true, cr3); + +	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */ +} +#endif +  static int xen_pgd_alloc(struct mm_struct *mm)  {  	pgd_t *pgd = mm->pgd; @@ -1768,7 +1494,7 @@ static int xen_pgd_alloc(struct mm_struct *mm)  		page->private = (unsigned long)user_pgd;  		if (user_pgd != NULL) { -			user_pgd[pgd_index(VSYSCALL_START)] = +			user_pgd[pgd_index(VSYSCALL_ADDR)] =  				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);  			ret = 0;  		} @@ -1790,36 +1516,45 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)  #endif  } -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) -{ -	unsigned long pfn = pte_pfn(pte); -  #ifdef CONFIG_X86_32 +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) +{  	/* If there's an existing pte, then don't allow _PAGE_RW to be set */  	if (pte_val_ma(*ptep) & _PAGE_PRESENT)  		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &  			       pte_val_ma(pte)); -#endif - -	/* -	 * If the new pfn is within the range of the newly allocated -	 * kernel pagetable, and it isn't being mapped into an -	 * early_ioremap fixmap slot, make sure it is RO. 
-	 */ -	if (!is_early_ioremap_ptep(ptep) && -	    pfn >= e820_table_start && pfn < e820_table_end) -		pte = pte_wrprotect(pte);  	return pte;  } +#else /* CONFIG_X86_64 */ +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) +{ +	return pte; +} +#endif /* CONFIG_X86_64 */ -/* Init-time set_pte while constructing initial pagetables, which -   doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive.  At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)  { -	pte = mask_rw_pte(ptep, pte); +	if (pte_mfn(pte) != INVALID_P2M_ENTRY) +		pte = mask_rw_pte(ptep, pte); +	else +		pte = __pte_ma(0); -	xen_set_pte(ptep, pte); +	native_set_pte(ptep, pte);  }  static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) @@ -1833,7 +1568,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)  /* Early in boot, while setting up the initial pagetable, assume     everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)  {  #ifdef CONFIG_FLATMEM  	BUG_ON(mem_map);	/* should only be used early */ @@ -1843,7 +1578,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)  }  /* Used for pmd and pud */ -static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)  {  #ifdef CONFIG_FLATMEM  	BUG_ON(mem_map);	/* should only be used early */ @@ -1853,30 +1588,63 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)  /* Early release_pte assumes that all pts are pinned, since there's     only init_mm and anything attached to that is pinned. */ -static __init void xen_release_pte_init(unsigned long pfn) +static void __init xen_release_pte_init(unsigned long pfn)  {  	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);  	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));  } -static __init void xen_release_pmd_init(unsigned long pfn) +static void __init xen_release_pmd_init(unsigned long pfn)  {  	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));  } +static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ +	struct multicall_space mcs; +	struct mmuext_op *op; + +	mcs = __xen_mc_entry(sizeof(*op)); +	op = mcs.args; +	op->cmd = cmd; +	op->arg1.mfn = pfn_to_mfn(pfn); + +	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); +} + +static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) +{ +	struct multicall_space mcs; +	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); + +	mcs = __xen_mc_entry(0); +	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, +				pfn_pte(pfn, prot), 0); +} +  /* This needs to make sure the new pte page is pinned iff its being     attached to a pinned pagetable. 
*/ -static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) +static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, +				    unsigned level)  { -	struct page *page = pfn_to_page(pfn); +	bool pinned = PagePinned(virt_to_page(mm->pgd)); + +	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); + +	if (pinned) { +		struct page *page = pfn_to_page(pfn); -	if (PagePinned(virt_to_page(mm->pgd))) {  		SetPagePinned(page);  		if (!PageHighMem(page)) { -			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); -			if (level == PT_PTE && USE_SPLIT_PTLOCKS) -				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); +			xen_mc_batch(); + +			__set_pfn_prot(pfn, PAGE_KERNEL_RO); + +			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) +				__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + +			xen_mc_issue(PARAVIRT_LAZY_MMU);  		} else {  			/* make sure there are no stray mappings of  			   this page */ @@ -1896,15 +1664,23 @@ static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)  }  /* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(unsigned long pfn, unsigned level) +static inline void xen_release_ptpage(unsigned long pfn, unsigned level)  {  	struct page *page = pfn_to_page(pfn); +	bool pinned = PagePinned(page); -	if (PagePinned(page)) { +	trace_xen_mmu_release_ptpage(pfn, level, pinned); + +	if (pinned) {  		if (!PageHighMem(page)) { -			if (level == PT_PTE && USE_SPLIT_PTLOCKS) -				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); -			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +			xen_mc_batch(); + +			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) +				__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + +			__set_pfn_prot(pfn, PAGE_KERNEL); + +			xen_mc_issue(PARAVIRT_LAZY_MMU);  		}  		ClearPagePinned(page);  	} @@ -1976,16 +1752,24 @@ static void *m2v(phys_addr_t maddr)  }  /* Set the page permissions on an identity-mapped pages */ -static void set_page_prot(void *addr, pgprot_t prot) +static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)  {  	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;  	pte_t pte = pfn_pte(pfn, prot); -	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) +	/* For PVH no need to set R/O or R/W to pin them or unpin them. 
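(Editor's note: the xen_alloc_ptpage()/xen_release_ptpage() rewrite above also changes the hypercall pattern: the permission flip (__set_pfn_prot) and the pin/unpin (__pin_pagetable_pfn) are now queued inside one xen_mc_batch()/xen_mc_issue() pair, so each page-table page costs a single multicall instead of up to two separate synchronous hypercalls.)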
*/ +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return; + +	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))  		BUG();  } - -static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +static void set_page_prot(void *addr, pgprot_t prot) +{ +	return set_page_prot_flags(addr, prot, UVMF_NONE); +} +#ifdef CONFIG_X86_32 +static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)  {  	unsigned pmdidx, pteidx;  	unsigned ident_pte; @@ -2017,8 +1801,10 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)  		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {  			pte_t pte; +#ifdef CONFIG_X86_32  			if (pfn > max_pfn_mapped)  				max_pfn_mapped = pfn; +#endif  			if (!pte_none(pte_page[pteidx]))  				continue; @@ -2033,6 +1819,22 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)  	set_page_prot(pmd, PAGE_KERNEL_RO);  } +#endif +void __init xen_setup_machphys_mapping(void) +{ +	struct xen_machphys_mapping mapping; + +	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { +		machine_to_phys_mapping = (unsigned long *)mapping.v_start; +		machine_to_phys_nr = mapping.max_mfn + 1; +	} else { +		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; +	} +#ifdef CONFIG_X86_32 +	WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) +		< machine_to_phys_mapping); +#endif +}  #ifdef CONFIG_X86_64  static void convert_pfn_mfn(void *v) @@ -2045,9 +1847,22 @@ static void convert_pfn_mfn(void *v)  	for (i = 0; i < PTRS_PER_PTE; i++)  		pte[i] = xen_make_pte(pte[i].pte);  } - +static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, +				 unsigned long addr) +{ +	if (*pt_base == PFN_DOWN(__pa(addr))) { +		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); +		clear_page((void *)addr); +		(*pt_base)++; +	} +	if (*pt_end == PFN_DOWN(__pa(addr))) { +		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); +		clear_page((void *)addr); +		(*pt_end)--; +	} +}  /* - * Set up the inital kernel pagetable. + * Set up the initial kernel pagetable.   *   * We can construct this by grafting the Xen provided pagetable into   * head_64.S's preconstructed pagetables.  We copy the Xen L2's into @@ -2056,107 +1871,177 @@ static void convert_pfn_mfn(void *v)   * but that's enough to get __va working.  We need to fill in the rest   * of the physical mapping once some sort of allocator has been set   * up. + * NOTE: for PVH, the page tables are native.   */ -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, -					 unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)  {  	pud_t *l3;  	pmd_t *l2; +	unsigned long addr[3]; +	unsigned long pt_base, pt_end; +	unsigned i; + +	/* max_pfn_mapped is the last pfn mapped in the initial memory +	 * mappings. Considering that on Xen after the kernel mappings we +	 * have the mappings of some pages that don't exist in pfn space, we +	 * set max_pfn_mapped to the last real pfn mapped. 
*/ +	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + +	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); +	pt_end = pt_base + xen_start_info->nr_pt_frames;  	/* Zap identity mapping */  	init_level4_pgt[0] = __pgd(0); -	/* Pre-constructed entries are in pfn, so convert to mfn */ -	convert_pfn_mfn(init_level4_pgt); -	convert_pfn_mfn(level3_ident_pgt); -	convert_pfn_mfn(level3_kernel_pgt); - +	if (!xen_feature(XENFEAT_auto_translated_physmap)) { +		/* Pre-constructed entries are in pfn, so convert to mfn */ +		/* L4[272] -> level3_ident_pgt +		 * L4[511] -> level3_kernel_pgt */ +		convert_pfn_mfn(init_level4_pgt); + +		/* L3_i[0] -> level2_ident_pgt */ +		convert_pfn_mfn(level3_ident_pgt); +		/* L3_k[510] -> level2_kernel_pgt +		 * L3_i[511] -> level2_fixmap_pgt */ +		convert_pfn_mfn(level3_kernel_pgt); +	} +	/* We get [511][511] and have Xen's version of level2_kernel_pgt */  	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);  	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); -	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); -	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - +	addr[0] = (unsigned long)pgd; +	addr[1] = (unsigned long)l3; +	addr[2] = (unsigned long)l2; +	/* Graft it onto L4[272][0]. Note that we creating an aliasing problem: +	 * Both L4[272][0] and L4[511][511] have entries that point to the same +	 * L2 (PMD) tables. Meaning that if you modify it in __va space +	 * it will be also modified in the __ka space! (But if you just +	 * modify the PMD table to point to other PTE's or none, then you +	 * are OK - which is what cleanup_highmap does) */ +	copy_page(level2_ident_pgt, l2); +	/* Graft it onto L4[511][511] */ +	copy_page(level2_kernel_pgt, l2); + +	/* Get [511][510] and graft that in level2_fixmap_pgt */  	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);  	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); -	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - -	/* Set up identity map */ -	xen_map_identity_early(level2_ident_pgt, max_pfn); +	copy_page(level2_fixmap_pgt, l2); +	/* Note that we don't do anything with level1_fixmap_pgt which +	 * we don't need. */ +	if (!xen_feature(XENFEAT_auto_translated_physmap)) { +		/* Make pagetable pieces RO */ +		set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); +		set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); +		set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); +		set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); +		set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); +		set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); +		set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + +		/* Pin down new L4 */ +		pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, +				  PFN_DOWN(__pa_symbol(init_level4_pgt))); + +		/* Unpin Xen-provided one */ +		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); -	/* Make pagetable pieces RO */ -	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); -	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); -	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); -	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); -	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); -	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); +		/* +		 * At this stage there can be no user pgd, and no page +		 * structure to attach it to, so make sure we just set kernel +		 * pgd. 
+		 */ +		xen_mc_batch(); +		__xen_write_cr3(true, __pa(init_level4_pgt)); +		xen_mc_issue(PARAVIRT_LAZY_CPU); +	} else +		native_write_cr3(__pa(init_level4_pgt)); + +	/* We can't that easily rip out L3 and L2, as the Xen pagetables are +	 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ...  for +	 * the initial domain. For guests using the toolstack, they are in: +	 * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only +	 * rip out the [L4] (pgd), but for guests we shave off three pages. +	 */ +	for (i = 0; i < ARRAY_SIZE(addr); i++) +		check_pt_base(&pt_base, &pt_end, addr[i]); -	/* Pin down new L4 */ -	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, -			  PFN_DOWN(__pa_symbol(init_level4_pgt))); +	/* Our (by three pages) smaller Xen pagetable that we are using */ +	memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); +	/* Revector the xen_start_info */ +	xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); +} +#else	/* !CONFIG_X86_64 */ +static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); -	/* Unpin Xen-provided one */ -	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +static void __init xen_write_cr3_init(unsigned long cr3) +{ +	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); -	/* Switch over */ -	pgd = init_level4_pgt; +	BUG_ON(read_cr3() != __pa(initial_page_table)); +	BUG_ON(cr3 != __pa(swapper_pg_dir));  	/* -	 * At this stage there can be no user pgd, and no page -	 * structure to attach it to, so make sure we just set kernel -	 * pgd. +	 * We are switching to swapper_pg_dir for the first time (from +	 * initial_page_table) and therefore need to mark that page +	 * read-only and then pin it. +	 * +	 * Xen disallows sharing of kernel PMDs for PAE +	 * guests. Therefore we must copy the kernel PMD from +	 * initial_page_table into a new kernel PMD to be used in +	 * swapper_pg_dir.  	 
*/ -	xen_mc_batch(); -	__xen_write_cr3(true, __pa(pgd)); -	xen_mc_issue(PARAVIRT_LAZY_CPU); +	swapper_kernel_pmd = +		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); +	copy_page(swapper_kernel_pmd, initial_kernel_pmd); +	swapper_pg_dir[KERNEL_PGD_BOUNDARY] = +		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); +	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); + +	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); +	xen_write_cr3(cr3); +	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); -	memblock_x86_reserve_range(__pa(xen_start_info->pt_base), -		      __pa(xen_start_info->pt_base + -			   xen_start_info->nr_pt_frames * PAGE_SIZE), -		      "XEN PAGETABLES"); +	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, +			  PFN_DOWN(__pa(initial_page_table))); +	set_page_prot(initial_page_table, PAGE_KERNEL); +	set_page_prot(initial_kernel_pmd, PAGE_KERNEL); -	return pgd; +	pv_mmu_ops.write_cr3 = &xen_write_cr3;  } -#else	/* !CONFIG_X86_64 */ -static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, -					 unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)  {  	pmd_t *kernel_pmd; -	level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); +	initial_kernel_pmd = +		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);  	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +  				  xen_start_info->nr_pt_frames * PAGE_SIZE +  				  512*1024);  	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); -	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); +	copy_page(initial_kernel_pmd, kernel_pmd); -	xen_map_identity_early(level2_kernel_pgt, max_pfn); +	xen_map_identity_early(initial_kernel_pmd, max_pfn); -	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); -	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], -			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); +	copy_page(initial_page_table, pgd); +	initial_page_table[KERNEL_PGD_BOUNDARY] = +		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); -	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); -	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); +	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); +	set_page_prot(initial_page_table, PAGE_KERNEL_RO);  	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);  	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); -	xen_write_cr3(__pa(swapper_pg_dir)); - -	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); - -	memblock_x86_reserve_range(__pa(xen_start_info->pt_base), -		      __pa(xen_start_info->pt_base + -			   xen_start_info->nr_pt_frames * PAGE_SIZE), -		      "XEN PAGETABLES"); +	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, +			  PFN_DOWN(__pa(initial_page_table))); +	xen_write_cr3(__pa(initial_page_table)); -	return swapper_pg_dir; +	memblock_reserve(__pa(xen_start_info->pt_base), +			 xen_start_info->nr_pt_frames * PAGE_SIZE);  }  #endif	/* CONFIG_X86_64 */ @@ -2170,17 +2055,14 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)  	switch (idx) {  	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: -#ifdef CONFIG_X86_F00F_BUG -	case FIX_F00F_IDT: -#endif +	case FIX_RO_IDT:  #ifdef CONFIG_X86_32  	case FIX_WP_TEST: -	case FIX_VDSO:  # ifdef CONFIG_HIGHMEM  	case FIX_KMAP_BEGIN ... FIX_KMAP_END:  # endif  #else -	case VSYSCALL_LAST_PAGE ... 
VSYSCALL_FIRST_PAGE: +	case VSYSCALL_PAGE:  #endif  	case FIX_TEXT_POKE0:  	case FIX_TEXT_POKE1: @@ -2221,38 +2103,18 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)  #ifdef CONFIG_X86_64  	/* Replicate changes to map the vsyscall page into the user  	   pagetable vsyscall mapping. */ -	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { +	if (idx == VSYSCALL_PAGE) {  		unsigned long vaddr = __fix_to_virt(idx);  		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);  	}  #endif  } -__init void xen_ident_map_ISA(void) +static void __init xen_post_allocator_init(void)  { -	unsigned long pa; - -	/* -	 * If we're dom0, then linear map the ISA machine addresses into -	 * the kernel's address space. -	 */ -	if (!xen_initial_domain()) +	if (xen_feature(XENFEAT_auto_translated_physmap))  		return; -	xen_raw_printk("Xen: setup ISA identity maps\n"); - -	for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { -		pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); - -		if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) -			BUG(); -	} - -	xen_flush_tlb(); -} - -static __init void xen_post_allocator_init(void) -{  	pv_mmu_ops.set_pte = xen_set_pte;  	pv_mmu_ops.set_pmd = xen_set_pmd;  	pv_mmu_ops.set_pud = xen_set_pud; @@ -2272,6 +2134,7 @@ static __init void xen_post_allocator_init(void)  #endif  #ifdef CONFIG_X86_64 +	pv_mmu_ops.write_cr3 = &xen_write_cr3;  	SetPagePinned(virt_to_page(level3_user_vsyscall));  #endif  	xen_mark_init_mm_pinned(); @@ -2285,12 +2148,12 @@ static void xen_leave_lazy_mmu(void)  	preempt_enable();  } -static const struct pv_mmu_ops xen_mmu_ops __initdata = { +static const struct pv_mmu_ops xen_mmu_ops __initconst = {  	.read_cr2 = xen_read_cr2,  	.write_cr2 = xen_write_cr2,  	.read_cr3 = xen_read_cr3, -	.write_cr3 = xen_write_cr3, +	.write_cr3 = xen_write_cr3_init,  	.flush_tlb_user = xen_flush_tlb,  	.flush_tlb_kernel = xen_flush_tlb, @@ -2347,6 +2210,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {  	.lazy_mode = {  		.enter = paravirt_enter_lazy_mmu,  		.leave = xen_leave_lazy_mmu, +		.flush = paravirt_flush_lazy_mmu,  	},  	.set_fixmap = xen_set_fixmap, @@ -2354,11 +2218,17 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {  void __init xen_init_mmu_ops(void)  { -	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; -	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; -	pv_mmu_ops = xen_mmu_ops; +	x86_init.paging.pagetable_init = xen_pagetable_init; -	vmap_lazy_unmap = false; +	/* Optimization - we can use the HVM one but it has no idea which +	 * VCPUs are descheduled - which means that it will needlessly IPI +	 * them. Xen knows so let it do the job. 
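(Editor's note: in other words, for auto-translated (PVH) guests only the flush_tlb_others hook installed just below is overridden; all other pv_mmu_ops entries keep their native implementations, since hardware-managed page tables need none of the PV MMU hooks that the classic-PV branch installs via pv_mmu_ops = xen_mmu_ops.)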
+	 */ +	if (xen_feature(XENFEAT_auto_translated_physmap)) { +		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; +		return; +	} +	pv_mmu_ops = xen_mmu_ops;  	memset(dummy_mapping, 0xff, PAGE_SIZE);  } @@ -2383,7 +2253,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,  			in_frames[i] = virt_to_mfn(vaddr);  		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); -		set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); +		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);  		if (out_frames)  			out_frames[i] = virt_to_pfn(vaddr); @@ -2479,12 +2349,14 @@ static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,  	return success;  } -int xen_create_contiguous_region(unsigned long vstart, unsigned int order, -				 unsigned int address_bits) +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, +				 unsigned int address_bits, +				 dma_addr_t *dma_handle)  {  	unsigned long *in_frames = discontig_frames, out_frame;  	unsigned long  flags;  	int            success; +	unsigned long vstart = (unsigned long)phys_to_virt(pstart);  	/*  	 * Currently an auto-translated guest will not perform I/O, nor will @@ -2519,15 +2391,17 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order,  	spin_unlock_irqrestore(&xen_reservation_lock, flags); +	*dma_handle = virt_to_machine(vstart).maddr;  	return success ? 0 : -ENOMEM;  }  EXPORT_SYMBOL_GPL(xen_create_contiguous_region); -void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) +void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)  {  	unsigned long *out_frames = discontig_frames, in_frame;  	unsigned long  flags;  	int success; +	unsigned long vstart;  	if (xen_feature(XENFEAT_auto_translated_physmap))  		return; @@ -2535,6 +2409,7 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)  	if (unlikely(order > MAX_CONTIG_ORDER))  		return; +	vstart = (unsigned long)phys_to_virt(pstart);  	memset((void *) vstart, 0, PAGE_SIZE << order);  	spin_lock_irqsave(&xen_reservation_lock, flags); @@ -2560,6 +2435,43 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)  EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);  #ifdef CONFIG_XEN_PVHVM +#ifdef CONFIG_PROC_VMCORE +/* + * This function is used in two contexts: + * - the kdump kernel has to check whether a pfn of the crashed kernel + *   was a ballooned page. vmcore is using this function to decide + *   whether to access a pfn of the crashed kernel. + * - the kexec kernel has to check whether a pfn was ballooned by the + *   previous kernel. If the pfn is ballooned, handle it properly. + * Returns 0 if the pfn is not backed by a RAM page, the caller may + * handle the pfn special in this case. 
+ */ +static int xen_oldmem_pfn_is_ram(unsigned long pfn) +{ +	struct xen_hvm_get_mem_type a = { +		.domid = DOMID_SELF, +		.pfn = pfn, +	}; +	int ram; + +	if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) +		return -ENXIO; + +	switch (a.mem_type) { +		case HVMMEM_mmio_dm: +			ram = 0; +			break; +		case HVMMEM_ram_rw: +		case HVMMEM_ram_ro: +		default: +			ram = 1; +			break; +	} + +	return ram; +} +#endif +  static void xen_hvm_exit_mmap(struct mm_struct *mm)  {  	struct xen_hvm_pagetable_dying a; @@ -2590,6 +2502,98 @@ void __init xen_hvm_init_mmu_ops(void)  {  	if (is_pagetable_dying_supported())  		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; +#ifdef CONFIG_PROC_VMCORE +	register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); +#endif +} +#endif + +#ifdef CONFIG_XEN_PVH +/* + * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user + * space creating new guest on pvh dom0 and needing to map domU pages. + */ +static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, +			    unsigned int domid) +{ +	int rc, err = 0; +	xen_pfn_t gpfn = lpfn; +	xen_ulong_t idx = fgfn; + +	struct xen_add_to_physmap_range xatp = { +		.domid = DOMID_SELF, +		.foreign_domid = domid, +		.size = 1, +		.space = XENMAPSPACE_gmfn_foreign, +	}; +	set_xen_guest_handle(xatp.idxs, &idx); +	set_xen_guest_handle(xatp.gpfns, &gpfn); +	set_xen_guest_handle(xatp.errs, &err); + +	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); +	if (rc < 0) +		return rc; +	return err; +} + +static int xlate_remove_from_p2m(unsigned long spfn, int count) +{ +	struct xen_remove_from_physmap xrp; +	int i, rc; + +	for (i = 0; i < count; i++) { +		xrp.domid = DOMID_SELF; +		xrp.gpfn = spfn+i; +		rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); +		if (rc) +			break; +	} +	return rc; +} + +struct xlate_remap_data { +	unsigned long fgfn; /* foreign domain's gfn */ +	pgprot_t prot; +	domid_t  domid; +	int index; +	struct page **pages; +}; + +static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, +			    void *data) +{ +	int rc; +	struct xlate_remap_data *remap = data; +	unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); +	pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); + +	rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); +	if (rc) +		return rc; +	native_set_pte(ptep, pteval); + +	return 0; +} + +static int xlate_remap_gfn_range(struct vm_area_struct *vma, +				 unsigned long addr, unsigned long mfn, +				 int nr, pgprot_t prot, unsigned domid, +				 struct page **pages) +{ +	int err; +	struct xlate_remap_data pvhdata; + +	BUG_ON(!pages); + +	pvhdata.fgfn = mfn; +	pvhdata.prot = prot; +	pvhdata.domid = domid; +	pvhdata.index = 0; +	pvhdata.pages = pages; +	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, +				  xlate_map_pte_fn, &pvhdata); +	flush_tlb_all(); +	return err;  }  #endif @@ -2605,9 +2609,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,  				 unsigned long addr, void *data)  {  	struct remap_data *rmd = data; -	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); +	pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); -	rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; +	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;  	rmd->mmu_update->val = pte_val_ma(pte);  	rmd->mmu_update++; @@ -2616,8 +2620,10 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,  int xen_remap_domain_mfn_range(struct vm_area_struct *vma,  			       unsigned long addr, -			       unsigned long mfn, int nr, -			     
  pgprot_t prot, unsigned domid) +			       xen_pfn_t mfn, int nr, +			       pgprot_t prot, unsigned domid, +			       struct page **pages) +  {  	struct remap_data rmd;  	struct mmu_update mmu_update[REMAP_BATCH_SIZE]; @@ -2625,9 +2631,17 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,  	unsigned long range;  	int err = 0; -	prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); +	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); -	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; +	if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_XEN_PVH +		/* We need to update the local page tables and the xen HAP */ +		return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, +					     domid, pages); +#else +		return -EINVAL; +#endif +        }  	rmd.mfn = mfn;  	rmd.prot = prot; @@ -2642,8 +2656,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,  		if (err)  			goto out; -		err = -EFAULT; -		if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) +		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); +		if (err < 0)  			goto out;  		nr -= batch; @@ -2653,71 +2667,38 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,  	err = 0;  out: -	flush_tlb_all(); +	xen_flush_tlb_all();  	return err;  }  EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); -#ifdef CONFIG_XEN_DEBUG_FS - -static struct dentry *d_mmu_debug; - -static int __init xen_mmu_debugfs(void) +/* Returns: 0 success */ +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, +			       int numpgs, struct page **pages)  { -	struct dentry *d_xen = xen_init_debugfs(); - -	if (d_xen == NULL) -		return -ENOMEM; +	if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) +		return 0; -	d_mmu_debug = debugfs_create_dir("mmu", d_xen); - -	debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); - -	debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); -	debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, -			   &mmu_stats.pgd_update_pinned); -	debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, -			   &mmu_stats.pgd_update_pinned); - -	debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); -	debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, -			   &mmu_stats.pud_update_pinned); -	debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, -			   &mmu_stats.pud_update_pinned); - -	debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); -	debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, -			   &mmu_stats.pmd_update_pinned); -	debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, -			   &mmu_stats.pmd_update_pinned); - -	debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); -//	debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, -//			   &mmu_stats.pte_update_pinned); -	debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, -			   &mmu_stats.pte_update_pinned); - -	debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); -	debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, -			   &mmu_stats.mmu_update_extended); -	xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, -				     mmu_stats.mmu_update_histo, 20); - -	debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); -	debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, -			   &mmu_stats.set_pte_at_batched); -	debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, -			   
&mmu_stats.set_pte_at_current); -	debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, -			   &mmu_stats.set_pte_at_kernel); - -	debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); -	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, -			   &mmu_stats.prot_commit_batched); +#ifdef CONFIG_XEN_PVH +	while (numpgs--) { +		/* +		 * The mmu has already cleaned up the process mmu +		 * resources at this point (lookup_address will return +		 * NULL). +		 */ +		unsigned long pfn = page_to_pfn(pages[numpgs]); +		xlate_remove_from_p2m(pfn, 1); +	} +	/* +	 * We don't need to flush tlbs because as part of +	 * xlate_remove_from_p2m, the hypervisor will do tlb flushes +	 * after removing the p2m entries from the EPT/NPT +	 */  	return 0; +#else +	return -EINVAL; +#endif  } -fs_initcall(xen_mmu_debugfs); - -#endif	/* CONFIG_XEN_DEBUG_FS */ +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 537bb9aab77..73809bb951b 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -15,43 +15,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);  void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); -void xen_exit_mmap(struct mm_struct *mm); - -pteval_t xen_pte_val(pte_t); -pmdval_t xen_pmd_val(pmd_t); -pgdval_t xen_pgd_val(pgd_t); - -pte_t xen_make_pte(pteval_t); -pmd_t xen_make_pmd(pmdval_t); -pgd_t xen_make_pgd(pgdval_t); - -void xen_set_pte(pte_t *ptep, pte_t pteval); -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, -		    pte_t *ptep, pte_t pteval); - -#ifdef CONFIG_X86_PAE -void xen_set_pte_atomic(pte_t *ptep, pte_t pte); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); -#endif	/* CONFIG_X86_PAE */ - -void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); -void xen_set_pud(pud_t *ptr, pud_t val); -void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); -void xen_set_pud_hyper(pud_t *ptr, pud_t val); - -#if PAGETABLE_LEVELS == 4 -pudval_t xen_pud_val(pud_t pud); -pud_t xen_make_pud(pudval_t pudval); -void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); -void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); -#endif - -pgd_t *xen_get_user_pgd(pgd_t *pgd); -  pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);  void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,  				  pte_t *ptep, pte_t pte); diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 8bff7e7c290..0d82003e76a 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -30,12 +30,13 @@  #define MC_BATCH	32 -#define MC_DEBUG	1 +#define MC_DEBUG	0  #define MC_ARGS		(MC_BATCH * 16)  struct mc_buffer { +	unsigned mcidx, argidx, cbidx;  	struct multicall_entry entries[MC_BATCH];  #if MC_DEBUG  	struct multicall_entry debug[MC_BATCH]; @@ -46,85 +47,15 @@ struct mc_buffer {  		void (*fn)(void *);  		void *data;  	} callbacks[MC_BATCH]; -	unsigned mcidx, argidx, cbidx;  };  static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);  DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); -/* flush reasons 0- slots, 1- args, 2- callbacks */ -enum flush_reasons -{ -	FL_SLOTS, -	FL_ARGS, -	FL_CALLBACKS, - -	FL_N_REASONS -}; - -#ifdef CONFIG_XEN_DEBUG_FS -#define NHYPERCALLS	40		/* not really */ - -static struct { -	unsigned histo[MC_BATCH+1]; - -	unsigned issued; -	
unsigned arg_total; -	unsigned hypercalls; -	unsigned histo_hypercalls[NHYPERCALLS]; - -	unsigned flush[FL_N_REASONS]; -} mc_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ -	if (unlikely(zero_stats)) { -		memset(&mc_stats, 0, sizeof(mc_stats)); -		zero_stats = 0; -	} -} - -static void mc_add_stats(const struct mc_buffer *mc) -{ -	int i; - -	check_zero(); - -	mc_stats.issued++; -	mc_stats.hypercalls += mc->mcidx; -	mc_stats.arg_total += mc->argidx; - -	mc_stats.histo[mc->mcidx]++; -	for(i = 0; i < mc->mcidx; i++) { -		unsigned op = mc->entries[i].op; -		if (op < NHYPERCALLS) -			mc_stats.histo_hypercalls[op]++; -	} -} - -static void mc_stats_flush(enum flush_reasons idx) -{ -	check_zero(); - -	mc_stats.flush[idx]++; -} - -#else  /* !CONFIG_XEN_DEBUG_FS */ - -static inline void mc_add_stats(const struct mc_buffer *mc) -{ -} - -static inline void mc_stats_flush(enum flush_reasons idx) -{ -} -#endif	/* CONFIG_XEN_DEBUG_FS */ -  void xen_mc_flush(void)  {  	struct mc_buffer *b = &__get_cpu_var(mc_buffer); +	struct multicall_entry *mc;  	int ret = 0;  	unsigned long flags;  	int i; @@ -135,9 +66,26 @@ void xen_mc_flush(void)  	   something in the middle */  	local_irq_save(flags); -	mc_add_stats(b); +	trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); + +	switch (b->mcidx) { +	case 0: +		/* no-op */ +		BUG_ON(b->argidx != 0); +		break; + +	case 1: +		/* Singleton multicall - bypass multicall machinery +		   and just do the call directly. */ +		mc = &b->entries[0]; + +		mc->result = privcmd_call(mc->op, +					  mc->args[0], mc->args[1], mc->args[2],  +					  mc->args[3], mc->args[4]); +		ret = mc->result < 0; +		break; -	if (b->mcidx) { +	default:  #if MC_DEBUG  		memcpy(b->debug, b->entries,  		       b->mcidx * sizeof(struct multicall_entry)); @@ -164,11 +112,10 @@ void xen_mc_flush(void)  			}  		}  #endif +	} -		b->mcidx = 0; -		b->argidx = 0; -	} else -		BUG_ON(b->argidx != 0); +	b->mcidx = 0; +	b->argidx = 0;  	for (i = 0; i < b->cbidx; i++) {  		struct callback *cb = &b->callbacks[i]; @@ -188,25 +135,28 @@ struct multicall_space __xen_mc_entry(size_t args)  	struct multicall_space ret;  	unsigned argidx = roundup(b->argidx, sizeof(u64)); +	trace_xen_mc_entry_alloc(args); +  	BUG_ON(preemptible()); -	BUG_ON(b->argidx > MC_ARGS); +	BUG_ON(b->argidx >= MC_ARGS); -	if (b->mcidx == MC_BATCH || -	    (argidx + args) > MC_ARGS) { -		mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); +	if (unlikely(b->mcidx == MC_BATCH || +		     (argidx + args) >= MC_ARGS)) { +		trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ? 
+					  XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);  		xen_mc_flush();  		argidx = roundup(b->argidx, sizeof(u64));  	}  	ret.mc = &b->entries[b->mcidx]; -#ifdef MC_DEBUG +#if MC_DEBUG  	b->caller[b->mcidx] = __builtin_return_address(0);  #endif  	b->mcidx++;  	ret.args = &b->args[argidx];  	b->argidx = argidx + args; -	BUG_ON(b->argidx > MC_ARGS); +	BUG_ON(b->argidx >= MC_ARGS);  	return ret;  } @@ -216,22 +166,27 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)  	struct multicall_space ret = { NULL, NULL };  	BUG_ON(preemptible()); -	BUG_ON(b->argidx > MC_ARGS); +	BUG_ON(b->argidx >= MC_ARGS); -	if (b->mcidx == 0) -		return ret; - -	if (b->entries[b->mcidx - 1].op != op) -		return ret; +	if (unlikely(b->mcidx == 0 || +		     b->entries[b->mcidx - 1].op != op)) { +		trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP); +		goto out; +	} -	if ((b->argidx + size) > MC_ARGS) -		return ret; +	if (unlikely((b->argidx + size) >= MC_ARGS)) { +		trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE); +		goto out; +	}  	ret.mc = &b->entries[b->mcidx - 1];  	ret.args = &b->args[b->argidx];  	b->argidx += size; -	BUG_ON(b->argidx > MC_ARGS); +	BUG_ON(b->argidx >= MC_ARGS); + +	trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK); +out:  	return ret;  } @@ -241,43 +196,13 @@ void xen_mc_callback(void (*fn)(void *), void *data)  	struct callback *cb;  	if (b->cbidx == MC_BATCH) { -		mc_stats_flush(FL_CALLBACKS); +		trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);  		xen_mc_flush();  	} +	trace_xen_mc_callback(fn, data); +  	cb = &b->callbacks[b->cbidx++];  	cb->fn = fn;  	cb->data = data;  } - -#ifdef CONFIG_XEN_DEBUG_FS - -static struct dentry *d_mc_debug; - -static int __init xen_mc_debugfs(void) -{ -	struct dentry *d_xen = xen_init_debugfs(); - -	if (d_xen == NULL) -		return -ENOMEM; - -	d_mc_debug = debugfs_create_dir("multicalls", d_xen); - -	debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats); - -	debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued); -	debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls); -	debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total); - -	xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug, -				     mc_stats.histo, MC_BATCH); -	xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug, -				     mc_stats.histo_hypercalls, NHYPERCALLS); -	xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug, -				     mc_stats.flush, FL_N_REASONS); - -	return 0; -} -fs_initcall(xen_mc_debugfs); - -#endif	/* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 9e565da5d1f..9c2e74f9096 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -1,6 +1,8 @@  #ifndef _XEN_MULTICALLS_H  #define _XEN_MULTICALLS_H +#include <trace/events/xen.h> +  #include "xen-ops.h"  /* Multicalls */ @@ -20,9 +22,11 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);  static inline void xen_mc_batch(void)  {  	unsigned long flags; +  	/* need to disable interrupts until this entry is complete */  	local_irq_save(flags); -	__get_cpu_var(xen_mc_irq_flags) = flags; +	trace_xen_mc_batch(paravirt_get_lazy_mode()); +	__this_cpu_write(xen_mc_irq_flags, flags);  }  static inline struct multicall_space xen_mc_entry(size_t args) @@ -37,11 +41,13 @@ void xen_mc_flush(void);  /* Issue a multicall if we're not in a lazy mode */  static inline void xen_mc_issue(unsigned mode)  { +	trace_xen_mc_issue(mode); +  	if ((paravirt_get_lazy_mode() & mode) == 0)  		
xen_mc_flush();

 	/* restore flags saved in xen_mc_batch */
-	local_irq_restore(percpu_read(xen_mc_irq_flags));
+	local_irq_restore(this_cpu_read(xen_mc_irq_flags));
 }

 /* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 00000000000..9bb3d82ffec
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,1340 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ *                               Xen
+ *                                |
+ *     p2m_top              p2m_top_mfn
+ *       /  \                   /   \
+ * p2m_mid p2m_mid	p2m_mid_mfn p2m_mid_mfn
+ *    / \      / \         /           /
+ *  p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as an mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ *
+ * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
+ *
+ * However, not all entries are filled with MFNs. For any leaf, middle or
+ * top entry that is void we assume it is "missing". So (for example)
+ *  pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
+ *
+ * We also have the possibility of setting 1-1 mappings on certain regions, so
+ * that:
+ *  pfn_to_mfn(0xc0000)=0xc0000
+ *
+ * The benefit of this is that for non-RAM regions (think PCI BARs, or
+ * ACPI spaces) we can create mappings easily, because we get the PFN
+ * value to match the MFN.
+ *
+ * For this to work efficiently we have one new page p2m_identity and
+ * allocate (via reserve_brk) any other pages we need to cover the sides
+ * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
+ * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
+ * no other fancy value).
+ *
+ * On lookup we spot that the entry points to p2m_identity and return the
+ * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
+ * If the entry points to an allocated page, we just proceed as before and
+ * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
+ * the appropriate functions (pfn_to_mfn).
+ *
+ * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
+ * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
+ * non-identity pfn. To protect ourselves against that we elect to set (and
+ * get) the IDENTITY_FRAME_BIT on all identity mapped PFNs.
+ *
+ * The simplistic diagram below is used to explain the more subtle piece of
+ * code. There is also a diagram of the P2M at the end that can help.
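+ *
+ * For illustration, before walking through the E820 example below: assuming
+ * 64-bit (so P2M_TOP_PER_PAGE, P2M_MID_PER_PAGE and P2M_PER_PAGE are all 512),
+ * a lookup is conceptually just (a sketch of the idea, see
+ * get_phys_to_machine() further down for the real thing):
+ *
+ *  mid  = p2m_top[pfn / (512 * 512)];
+ *  leaf = mid[(pfn / 512) % 512];
+ *  mfn  = (leaf == p2m_identity) ? IDENTITY_FRAME(pfn) : leaf[pfn % 512];
+ *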
+ * Imagine your E820 looking as so:
+ *
+ *                    1GB                                           2GB    4GB
+ * /-------------------+---------\/----\         /----------\    /---+-----\
+ * | System RAM        | Sys RAM ||ACPI|         | reserved |    | Sys RAM |
+ * \-------------------+---------/\----/         \----------/    \---+-----/
+ *                               ^- 1029MB                       ^- 2001MB
+ *
+ * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
+ *  2048MB = 524288 (0x80000)]
+ *
+ * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
+ * is actually not present (we would have to kick the balloon driver to put it
+ * in).
+ *
+ * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
+ * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
+ * of the PFN and the end PFN (263424 and 512256 respectively). The first step
+ * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
+ * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
+ * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
+ * required to split any existing p2m_mid_missing middle pages.
+ *
+ * With the E820 example above, 263424 is not 1GB aligned so we allocate a
+ * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
+ * Each entry in the allocated page is "missing" (points to p2m_missing).
+ *
+ * The next stage is to determine if we need to do a more granular boundary
+ * check on the 4MB (or 2MB depending on architecture) of the start and end
+ * pfns. We check if the start pfn and end pfn violate that boundary check,
+ * and if so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
+ * granularity of setting which PFNs are missing and which ones are identity.
+ * In our example 263424 and 512256 both fail the check so we reserve_brk two
+ * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
+ * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
+ *
+ * At this point we would at minimum reserve_brk one page, but could be up to
+ * three. Each call to set_phys_range_identity has at maximum a three page
+ * cost. If we were to query the P2M at this stage, all those entries from
+ * start PFN through end PFN (so 1029MB -> 2001MB) would return
+ * INVALID_P2M_ENTRY ("missing").
+ *
+ * The next step is to walk from the start pfn to the end pfn setting
+ * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
+ * If we find that the middle entry is pointing to p2m_missing we can swap it
+ * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
+ * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
+ * At this point we do not need to worry about boundary alignment (so no need
+ * to reserve_brk a middle page, figure out which PFNs are "missing" and which
+ * ones are identity), as that has been done earlier. If we find that the
+ * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
+ * that page (which covers 512 PFNs) and set the appropriate PFN with
+ * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
+ * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
+ * IDENTITY_FRAME_BIT set.
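+ *
+ * For illustration, assuming 64-bit again (512 entries per level), the two
+ * boundary PFNs above index into the tree as follows (a worked example, not
+ * part of the code):
+ *
+ *  pfn 263424 (0x40500): topidx = 263424 / (512 * 512) = 1
+ *                        mididx = (263424 / 512) % 512 = 2
+ *                        idx    = 263424 % 512         = 256
+ *  pfn 512256 (0x7D100): topidx = 1, mididx = 488, idx = 256
+ *
+ * which is why the text above ends up at p2m[1][2][256->511] and
+ * p2m[1][488][0->256].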
+ * + * All other regions that are void (or not filled) either point to p2m_missing + * (considered missing) or have the default value of INVALID_P2M_ENTRY (also + * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] + * contain the INVALID_P2M_ENTRY value and are considered "missing." + * + * Finally, the region beyond the end of of the E820 (4 GB in this example) + * is set to be identity (in case there are MMIO regions placed here). + * + * This is what the p2m ends up looking (for the E820 above) with this + * fabulous drawing: + * + *    p2m         /--------------\ + *  /-----\       | &mfn_list[0],|                           /-----------------\ + *  |  0  |------>| &mfn_list[1],|    /---------------\      | ~0, ~0, ..      | + *  |-----|       |  ..., ~0, ~0 |    | ~0, ~0, [x]---+----->| IDENTITY [@256] | + *  |  1  |---\   \--------------/    | [p2m_identity]+\     | IDENTITY [@257] | + *  |-----|    \                      | [p2m_identity]+\\    | ....            | + *  |  2  |--\  \-------------------->|  ...          | \\   \----------------/ + *  |-----|   \                       \---------------/  \\ + *  |  3  |-\  \                                          \\  p2m_identity [1] + *  |-----|  \  \-------------------->/---------------\   /-----------------\ + *  | ..  |\  |                       | [p2m_identity]+-->| ~0, ~0, ~0, ... | + *  \-----/ | |                       | [p2m_identity]+-->| ..., ~0         | + *          | |                       | ....          |   \-----------------/ + *          | |                       +-[x], ~0, ~0.. +\ + *          | |                       \---------------/ \ + *          | |                                          \-> /---------------\ + *          | V  p2m_mid_missing       p2m_missing           | IDENTITY[@0]  | + *          | /-----------------\     /------------\         | IDENTITY[@256]| + *          | | [p2m_missing]   +---->| ~0, ~0, ...|         | ~0, ~0, ....  | + *          | | [p2m_missing]   +---->| ..., ~0    |         \---------------/ + *          | | ...             |     \------------/ + *          | \-----------------/ + *          | + *          |     p2m_mid_identity + *          |   /-----------------\ + *          \-->| [p2m_identity]  +---->[1] + *              | [p2m_identity]  +---->[1] + *              | ...             | + *              \-----------------/ + * + * where ~0 is INVALID_P2M_ENTRY. 
IDENTITY is (PFN | IDENTITY_BIT) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/sched.h> +#include <linux/seq_file.h> + +#include <asm/cache.h> +#include <asm/setup.h> + +#include <asm/xen/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <xen/balloon.h> +#include <xen/grant_table.h> + +#include "multicalls.h" +#include "xen-ops.h" + +static void __init m2p_override_init(void); + +unsigned long xen_max_p2m_pfn __read_mostly; + +#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); + +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); + +static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); + +/* We might hit two boundary violations at the start and end, at max each + * boundary violation will require three middle nodes. */ +RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3); + +/* When we populate back during bootup, the amount of pages can vary. The + * max we have is seen is 395979, but that does not mean it can't be more. + * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle + * it can re-use Xen provided mfn_list array, so we only need to allocate at + * most three P2M top nodes. 
*/ +RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); + +static inline unsigned p2m_top_index(unsigned long pfn) +{ +	BUG_ON(pfn >= MAX_P2M_PFN); +	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ +	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; +} + +static inline unsigned p2m_index(unsigned long pfn) +{ +	return pfn % P2M_PER_PAGE; +} + +static void p2m_top_init(unsigned long ***top) +{ +	unsigned i; + +	for (i = 0; i < P2M_TOP_PER_PAGE; i++) +		top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ +	unsigned i; + +	for (i = 0; i < P2M_TOP_PER_PAGE; i++) +		top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_top_mfn_p_init(unsigned long **top) +{ +	unsigned i; + +	for (i = 0; i < P2M_TOP_PER_PAGE; i++) +		top[i] = p2m_mid_missing_mfn; +} + +static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) +{ +	unsigned i; + +	for (i = 0; i < P2M_MID_PER_PAGE; i++) +		mid[i] = leaf; +} + +static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) +{ +	unsigned i; + +	for (i = 0; i < P2M_MID_PER_PAGE; i++) +		mid[i] = virt_to_mfn(leaf); +} + +static void p2m_init(unsigned long *p2m) +{ +	unsigned i; + +	for (i = 0; i < P2M_MID_PER_PAGE; i++) +		p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + *   to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + *   tree should alreay be completely allocated. + */ +void __ref xen_build_mfn_list_list(void) +{ +	unsigned long pfn; + +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return; + +	/* Pre-initialize p2m_top_mfn to be completely missing */ +	if (p2m_top_mfn == NULL) { +		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); +		p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); +		p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); +		p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); + +		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); +		p2m_top_mfn_p_init(p2m_top_mfn_p); + +		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); +		p2m_top_mfn_init(p2m_top_mfn); +	} else { +		/* Reinitialise, mfn's all change after migration */ +		p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); +		p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); +	} + +	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { +		unsigned topidx = p2m_top_index(pfn); +		unsigned mididx = p2m_mid_index(pfn); +		unsigned long **mid; +		unsigned long *mid_mfn_p; + +		mid = p2m_top[topidx]; +		mid_mfn_p = p2m_top_mfn_p[topidx]; + +		/* Don't bother allocating any mfn mid levels if +		 * they're just missing, just update the stored mfn, +		 * since all could have changed over a migrate. +		 */ +		if (mid == p2m_mid_missing) { +			BUG_ON(mididx); +			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); +			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); +			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; +			continue; +		} + +		if (mid_mfn_p == p2m_mid_missing_mfn) { +			/* +			 * XXX boot-time only!  We should never find +			 * missing parts of the mfn tree after +			 * runtime.  extend_brk() will BUG if we call +			 * it too late. 
+			 */ +			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); +			p2m_mid_mfn_init(mid_mfn_p, p2m_missing); + +			p2m_top_mfn_p[topidx] = mid_mfn_p; +		} + +		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); +		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); +	} +} + +void xen_setup_mfn_list_list(void) +{ +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return; + +	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + +	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = +		virt_to_mfn(p2m_top_mfn); +	HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; +} + +/* Set up p2m_top to point to the domain-builder provided p2m pages */ +void __init xen_build_dynamic_phys_to_machine(void) +{ +	unsigned long *mfn_list; +	unsigned long max_pfn; +	unsigned long pfn; + +	 if (xen_feature(XENFEAT_auto_translated_physmap)) +		return; + +	mfn_list = (unsigned long *)xen_start_info->mfn_list; +	max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); +	xen_max_p2m_pfn = max_pfn; + +	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); +	p2m_init(p2m_missing); +	p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); +	p2m_init(p2m_identity); + +	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); +	p2m_mid_init(p2m_mid_missing, p2m_missing); +	p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); +	p2m_mid_init(p2m_mid_identity, p2m_identity); + +	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); +	p2m_top_init(p2m_top); + +	/* +	 * The domain builder gives us a pre-constructed p2m array in +	 * mfn_list for all the pages initially given to us, so we just +	 * need to graft that into our tree structure. +	 */ +	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { +		unsigned topidx = p2m_top_index(pfn); +		unsigned mididx = p2m_mid_index(pfn); + +		if (p2m_top[topidx] == p2m_mid_missing) { +			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); +			p2m_mid_init(mid, p2m_missing); + +			p2m_top[topidx] = mid; +		} + +		/* +		 * As long as the mfn_list has enough entries to completely +		 * fill a p2m page, pointing into the array is ok. But if +		 * not the entries beyond the last pfn will be undefined. +		 */ +		if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { +			unsigned long p2midx; + +			p2midx = max_pfn % P2M_PER_PAGE; +			for ( ; p2midx < P2M_PER_PAGE; p2midx++) +				mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY; +		} +		p2m_top[topidx][mididx] = &mfn_list[pfn]; +	} + +	m2p_override_init(); +} +#ifdef CONFIG_X86_64 +#include <linux/bootmem.h> +unsigned long __init xen_revector_p2m_tree(void) +{ +	unsigned long va_start; +	unsigned long va_end; +	unsigned long pfn; +	unsigned long pfn_free = 0; +	unsigned long *mfn_list = NULL; +	unsigned long size; + +	va_start = xen_start_info->mfn_list; +	/*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), +	 * so make sure it is rounded up to that */ +	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); +	va_end = va_start + size; + +	/* If we were revectored already, don't do it again. 
*/ +	if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) +		return 0; + +	mfn_list = alloc_bootmem_align(size, PAGE_SIZE); +	if (!mfn_list) { +		pr_warn("Could not allocate space for a new P2M tree!\n"); +		return xen_start_info->mfn_list; +	} +	/* Fill it out with INVALID_P2M_ENTRY value */ +	memset(mfn_list, 0xFF, size); + +	for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { +		unsigned topidx = p2m_top_index(pfn); +		unsigned mididx; +		unsigned long *mid_p; + +		if (!p2m_top[topidx]) +			continue; + +		if (p2m_top[topidx] == p2m_mid_missing) +			continue; + +		mididx = p2m_mid_index(pfn); +		mid_p = p2m_top[topidx][mididx]; +		if (!mid_p) +			continue; +		if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) +			continue; + +		if ((unsigned long)mid_p == INVALID_P2M_ENTRY) +			continue; + +		/* The old va. Rebase it on mfn_list */ +		if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { +			unsigned long *new; + +			if (pfn_free  > (size / sizeof(unsigned long))) { +				WARN(1, "Only allocated for %ld pages, but we want %ld!\n", +				     size / sizeof(unsigned long), pfn_free); +				return 0; +			} +			new = &mfn_list[pfn_free]; + +			copy_page(new, mid_p); +			p2m_top[topidx][mididx] = &mfn_list[pfn_free]; +			p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); + +			pfn_free += P2M_PER_PAGE; + +		} +		/* This should be the leafs allocated for identity from _brk. */ +	} +	return (unsigned long)mfn_list; + +} +#else +unsigned long __init xen_revector_p2m_tree(void) +{ +	return 0; +} +#endif +unsigned long get_phys_to_machine(unsigned long pfn) +{ +	unsigned topidx, mididx, idx; + +	if (unlikely(pfn >= MAX_P2M_PFN)) +		return IDENTITY_FRAME(pfn); + +	topidx = p2m_top_index(pfn); +	mididx = p2m_mid_index(pfn); +	idx = p2m_index(pfn); + +	/* +	 * The INVALID_P2M_ENTRY is filled in both p2m_*identity +	 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY +	 * would be wrong. +	 */ +	if (p2m_top[topidx][mididx] == p2m_identity) +		return IDENTITY_FRAME(pfn); + +	return p2m_top[topidx][mididx][idx]; +} +EXPORT_SYMBOL_GPL(get_phys_to_machine); + +static void *alloc_p2m_page(void) +{ +	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} + +static void free_p2m_page(void *p) +{ +	free_page((unsigned long)p); +} + +/* + * Fully allocate the p2m structure for a given pfn.  We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync.  We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. 
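+ *
+ * For illustration, every level follows the same lose-the-race pattern;
+ * a minimal sketch of the idea (the code below is the real thing):
+ *
+ *	mid = alloc_p2m_page();
+ *	p2m_mid_init(mid, p2m_missing);
+ *	if (cmpxchg(&p2m_top[topidx], p2m_mid_missing, mid) != p2m_mid_missing)
+ *		free_p2m_page(mid);	/* lost the race; use the winner's page */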
+ */ +static bool alloc_p2m(unsigned long pfn) +{ +	unsigned topidx, mididx; +	unsigned long ***top_p, **mid; +	unsigned long *top_mfn_p, *mid_mfn; + +	topidx = p2m_top_index(pfn); +	mididx = p2m_mid_index(pfn); + +	top_p = &p2m_top[topidx]; +	mid = *top_p; + +	if (mid == p2m_mid_missing) { +		/* Mid level is missing, allocate a new one */ +		mid = alloc_p2m_page(); +		if (!mid) +			return false; + +		p2m_mid_init(mid, p2m_missing); + +		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) +			free_p2m_page(mid); +	} + +	top_mfn_p = &p2m_top_mfn[topidx]; +	mid_mfn = p2m_top_mfn_p[topidx]; + +	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + +	if (mid_mfn == p2m_mid_missing_mfn) { +		/* Separately check the mid mfn level */ +		unsigned long missing_mfn; +		unsigned long mid_mfn_mfn; + +		mid_mfn = alloc_p2m_page(); +		if (!mid_mfn) +			return false; + +		p2m_mid_mfn_init(mid_mfn, p2m_missing); + +		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); +		mid_mfn_mfn = virt_to_mfn(mid_mfn); +		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) +			free_p2m_page(mid_mfn); +		else +			p2m_top_mfn_p[topidx] = mid_mfn; +	} + +	if (p2m_top[topidx][mididx] == p2m_identity || +	    p2m_top[topidx][mididx] == p2m_missing) { +		/* p2m leaf page is missing */ +		unsigned long *p2m; +		unsigned long *p2m_orig = p2m_top[topidx][mididx]; + +		p2m = alloc_p2m_page(); +		if (!p2m) +			return false; + +		p2m_init(p2m); + +		if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) +			free_p2m_page(p2m); +		else +			mid_mfn[mididx] = virt_to_mfn(p2m); +	} + +	return true; +} + +static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) +{ +	unsigned topidx, mididx, idx; +	unsigned long *p2m; +	unsigned long *mid_mfn_p; + +	topidx = p2m_top_index(pfn); +	mididx = p2m_mid_index(pfn); +	idx = p2m_index(pfn); + +	/* Pfff.. No boundary cross-over, lets get out. */ +	if (!idx && check_boundary) +		return false; + +	WARN(p2m_top[topidx][mididx] == p2m_identity, +		"P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", +		topidx, mididx); + +	/* +	 * Could be done by xen_build_dynamic_phys_to_machine.. +	 */ +	if (p2m_top[topidx][mididx] != p2m_missing) +		return false; + +	/* Boundary cross-over for the edges: */ +	p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + +	p2m_init(p2m); + +	p2m_top[topidx][mididx] = p2m; + +	/* For save/restore we need to MFN of the P2M saved */ + +	mid_mfn_p = p2m_top_mfn_p[topidx]; +	WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), +		"P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", +		topidx, mididx); +	mid_mfn_p[mididx] = virt_to_mfn(p2m); + +	return true; +} + +static bool __init early_alloc_p2m_middle(unsigned long pfn) +{ +	unsigned topidx = p2m_top_index(pfn); +	unsigned long *mid_mfn_p; +	unsigned long **mid; + +	mid = p2m_top[topidx]; +	mid_mfn_p = p2m_top_mfn_p[topidx]; +	if (mid == p2m_mid_missing) { +		mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + +		p2m_mid_init(mid, p2m_missing); + +		p2m_top[topidx] = mid; + +		BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); +	} +	/* And the save/restore P2M tables.. */ +	if (mid_mfn_p == p2m_mid_missing_mfn) { +		mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); +		p2m_mid_mfn_init(mid_mfn_p, p2m_missing); + +		p2m_top_mfn_p[topidx] = mid_mfn_p; +		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); +		/* Note: we don't set mid_mfn_p[midix] here, +		 * look in early_alloc_p2m() */ +	} +	return true; +} + +/* + * Skim over the P2M tree looking at pages that are either filled with + * INVALID_P2M_ENTRY or with 1:1 PFNs. 
If found, re-use that page and + * replace the P2M leaf with a p2m_missing or p2m_identity. + * Stick the old page in the new P2M tree location. + */ +bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +{ +	unsigned topidx; +	unsigned mididx; +	unsigned ident_pfns; +	unsigned inv_pfns; +	unsigned long *p2m; +	unsigned long *mid_mfn_p; +	unsigned idx; +	unsigned long pfn; + +	/* We only look when this entails a P2M middle layer */ +	if (p2m_index(set_pfn)) +		return false; + +	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { +		topidx = p2m_top_index(pfn); + +		if (!p2m_top[topidx]) +			continue; + +		if (p2m_top[topidx] == p2m_mid_missing) +			continue; + +		mididx = p2m_mid_index(pfn); +		p2m = p2m_top[topidx][mididx]; +		if (!p2m) +			continue; + +		if ((p2m == p2m_missing) || (p2m == p2m_identity)) +			continue; + +		if ((unsigned long)p2m == INVALID_P2M_ENTRY) +			continue; + +		ident_pfns = 0; +		inv_pfns = 0; +		for (idx = 0; idx < P2M_PER_PAGE; idx++) { +			/* IDENTITY_PFNs are 1:1 */ +			if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) +				ident_pfns++; +			else if (p2m[idx] == INVALID_P2M_ENTRY) +				inv_pfns++; +			else +				break; +		} +		if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) +			goto found; +	} +	return false; +found: +	/* Found one, replace old with p2m_identity or p2m_missing */ +	p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); +	/* And the other for save/restore.. */ +	mid_mfn_p = p2m_top_mfn_p[topidx]; +	/* NOTE: Even if it is a p2m_identity it should still be point to +	 * a page filled with INVALID_P2M_ENTRY entries. */ +	mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); + +	/* Reset where we want to stick the old page in. */ +	topidx = p2m_top_index(set_pfn); +	mididx = p2m_mid_index(set_pfn); + +	/* This shouldn't happen */ +	if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) +		early_alloc_p2m_middle(set_pfn); + +	if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) +		return false; + +	p2m_init(p2m); +	p2m_top[topidx][mididx] = p2m; +	mid_mfn_p = p2m_top_mfn_p[topidx]; +	mid_mfn_p[mididx] = virt_to_mfn(p2m); + +	return true; +} +bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ +	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  { +		if (!early_alloc_p2m_middle(pfn)) +			return false; + +		if (early_can_reuse_p2m_middle(pfn, mfn)) +			return __set_phys_to_machine(pfn, mfn); + +		if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) +			return false; + +		if (!__set_phys_to_machine(pfn, mfn)) +			return false; +	} + +	return true; +} + +static void __init early_split_p2m(unsigned long pfn) +{ +	unsigned long mididx, idx; + +	mididx = p2m_mid_index(pfn); +	idx = p2m_index(pfn); + +	/* +	 * Allocate new middle and leaf pages if this pfn lies in the +	 * middle of one. 
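+	 *
+	 * For illustration, with the E820 example from the comment at the
+	 * top of this file (64-bit, 512 entries per level): pfn 263424 has
+	 * mididx = 2 and idx = 256, both non-zero, so splitting the start
+	 * of the range allocates a middle page and a leaf page (if they are
+	 * still missing) before the 1:1 entries are written.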
+	 */ +	if (mididx || idx) +		early_alloc_p2m_middle(pfn); +	if (idx) +		early_alloc_p2m(pfn, false); +} + +unsigned long __init set_phys_range_identity(unsigned long pfn_s, +				      unsigned long pfn_e) +{ +	unsigned long pfn; + +	if (unlikely(pfn_s >= MAX_P2M_PFN)) +		return 0; + +	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) +		return pfn_e - pfn_s; + +	if (pfn_s > pfn_e) +		return 0; + +	if (pfn_e > MAX_P2M_PFN) +		pfn_e = MAX_P2M_PFN; + +	early_split_p2m(pfn_s); +	early_split_p2m(pfn_e); + +	for (pfn = pfn_s; pfn < pfn_e;) { +		unsigned topidx = p2m_top_index(pfn); +		unsigned mididx = p2m_mid_index(pfn); + +		if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) +			break; +		pfn++; + +		/* +		 * If the PFN was set to a middle or leaf identity +		 * page the remainder must also be identity, so skip +		 * ahead to the next middle or leaf entry. +		 */ +		if (p2m_top[topidx] == p2m_mid_identity) +			pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE); +		else if (p2m_top[topidx][mididx] == p2m_identity) +			pfn = ALIGN(pfn, P2M_PER_PAGE); +	} + +	if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), +		"Identity mapping failed. We are %ld short of 1-1 mappings!\n", +		(pfn_e - pfn_s) - (pfn - pfn_s))) +		printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); + +	return pfn - pfn_s; +} + +/* Try to install p2m mapping; fail if intermediate bits missing */ +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ +	unsigned topidx, mididx, idx; + +	/* don't track P2M changes in autotranslate guests */ +	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) +		return true; + +	if (unlikely(pfn >= MAX_P2M_PFN)) { +		BUG_ON(mfn != INVALID_P2M_ENTRY); +		return true; +	} + +	topidx = p2m_top_index(pfn); +	mididx = p2m_mid_index(pfn); +	idx = p2m_index(pfn); + +	/* For sparse holes were the p2m leaf has real PFN along with +	 * PCI holes, stick in the PFN as the MFN value. +	 * +	 * set_phys_range_identity() will have allocated new middle +	 * and leaf pages as required so an existing p2m_mid_missing +	 * or p2m_missing mean that whole range will be identity so +	 * these can be switched to p2m_mid_identity or p2m_identity. +	 */ +	if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { +		if (p2m_top[topidx] == p2m_mid_identity) +			return true; + +		if (p2m_top[topidx] == p2m_mid_missing) { +			WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing, +					p2m_mid_identity) != p2m_mid_missing); +			return true; +		} + +		if (p2m_top[topidx][mididx] == p2m_identity) +			return true; + +		/* Swap over from MISSING to IDENTITY if needed. 
*/ +		if (p2m_top[topidx][mididx] == p2m_missing) { +			WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, +				p2m_identity) != p2m_missing); +			return true; +		} +	} + +	if (p2m_top[topidx][mididx] == p2m_missing) +		return mfn == INVALID_P2M_ENTRY; + +	p2m_top[topidx][mididx][idx] = mfn; + +	return true; +} + +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ +	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  { +		if (!alloc_p2m(pfn)) +			return false; + +		if (!__set_phys_to_machine(pfn, mfn)) +			return false; +	} + +	return true; +} + +#define M2P_OVERRIDE_HASH_SHIFT	10 +#define M2P_OVERRIDE_HASH	(1 << M2P_OVERRIDE_HASH_SHIFT) + +static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); +static DEFINE_SPINLOCK(m2p_override_lock); + +static void __init m2p_override_init(void) +{ +	unsigned i; + +	m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, +				   sizeof(unsigned long)); + +	for (i = 0; i < M2P_OVERRIDE_HASH; i++) +		INIT_LIST_HEAD(&m2p_overrides[i]); +} + +static unsigned long mfn_hash(unsigned long mfn) +{ +	return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); +} + +int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, +			    struct gnttab_map_grant_ref *kmap_ops, +			    struct page **pages, unsigned int count) +{ +	int i, ret = 0; +	bool lazy = false; +	pte_t *pte; + +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return 0; + +	if (kmap_ops && +	    !in_interrupt() && +	    paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { +		arch_enter_lazy_mmu_mode(); +		lazy = true; +	} + +	for (i = 0; i < count; i++) { +		unsigned long mfn, pfn; + +		/* Do not add to override if the map failed. */ +		if (map_ops[i].status) +			continue; + +		if (map_ops[i].flags & GNTMAP_contains_pte) { +			pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + +				(map_ops[i].host_addr & ~PAGE_MASK)); +			mfn = pte_mfn(*pte); +		} else { +			mfn = PFN_DOWN(map_ops[i].dev_bus_addr); +		} +		pfn = page_to_pfn(pages[i]); + +		WARN_ON(PagePrivate(pages[i])); +		SetPagePrivate(pages[i]); +		set_page_private(pages[i], mfn); +		pages[i]->index = pfn_to_mfn(pfn); + +		if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { +			ret = -ENOMEM; +			goto out; +		} + +		if (kmap_ops) { +			ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); +			if (ret) +				goto out; +		} +	} + +out: +	if (lazy) +		arch_leave_lazy_mmu_mode(); + +	return ret; +} +EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); + +/* Add an MFN override for a particular page */ +int m2p_add_override(unsigned long mfn, struct page *page, +		struct gnttab_map_grant_ref *kmap_op) +{ +	unsigned long flags; +	unsigned long pfn; +	unsigned long uninitialized_var(address); +	unsigned level; +	pte_t *ptep = NULL; + +	pfn = page_to_pfn(page); +	if (!PageHighMem(page)) { +		address = (unsigned long)__va(pfn << PAGE_SHIFT); +		ptep = lookup_address(address, &level); +		if (WARN(ptep == NULL || level != PG_LEVEL_4K, +					"m2p_add_override: pfn %lx not mapped", pfn)) +			return -EINVAL; +	} + +	if (kmap_op != NULL) { +		if (!PageHighMem(page)) { +			struct multicall_space mcs = +				xen_mc_entry(sizeof(*kmap_op)); + +			MULTI_grant_table_op(mcs.mc, +					GNTTABOP_map_grant_ref, kmap_op, 1); + +			xen_mc_issue(PARAVIRT_LAZY_MMU); +		} +	} +	spin_lock_irqsave(&m2p_override_lock, flags); +	list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]); +	spin_unlock_irqrestore(&m2p_override_lock, flags); + +	/* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in +	 * this 
domain. Set the FOREIGN_FRAME_BIT in the p2m for the other +	 * pfn so that the following mfn_to_pfn(mfn) calls will return the +	 * pfn from the m2p_override (the backend pfn) instead. +	 * We need to do this because the pages shared by the frontend +	 * (xen-blkfront) can be already locked (lock_page, called by +	 * do_read_cache_page); when the userspace backend tries to use them +	 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so +	 * do_blockdev_direct_IO is going to try to lock the same pages +	 * again resulting in a deadlock. +	 * As a side effect get_user_pages_fast might not be safe on the +	 * frontend pages while they are being shared with the backend, +	 * because mfn_to_pfn (that ends up being called by GUPF) will +	 * return the backend pfn rather than the frontend pfn. */ +	pfn = mfn_to_pfn_no_overrides(mfn); +	if (get_phys_to_machine(pfn) == mfn) +		set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + +	return 0; +} +EXPORT_SYMBOL_GPL(m2p_add_override); + +int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, +			      struct gnttab_map_grant_ref *kmap_ops, +			      struct page **pages, unsigned int count) +{ +	int i, ret = 0; +	bool lazy = false; + +	if (xen_feature(XENFEAT_auto_translated_physmap)) +		return 0; + +	if (kmap_ops && +	    !in_interrupt() && +	    paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { +		arch_enter_lazy_mmu_mode(); +		lazy = true; +	} + +	for (i = 0; i < count; i++) { +		unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); +		unsigned long pfn = page_to_pfn(pages[i]); + +		if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { +			ret = -EINVAL; +			goto out; +		} + +		set_page_private(pages[i], INVALID_P2M_ENTRY); +		WARN_ON(!PagePrivate(pages[i])); +		ClearPagePrivate(pages[i]); +		set_phys_to_machine(pfn, pages[i]->index); + +		if (kmap_ops) +			ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); +		if (ret) +			goto out; +	} + +out: +	if (lazy) +		arch_leave_lazy_mmu_mode(); +	return ret; +} +EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); + +int m2p_remove_override(struct page *page, +			struct gnttab_map_grant_ref *kmap_op, +			unsigned long mfn) +{ +	unsigned long flags; +	unsigned long pfn; +	unsigned long uninitialized_var(address); +	unsigned level; +	pte_t *ptep = NULL; + +	pfn = page_to_pfn(page); + +	if (!PageHighMem(page)) { +		address = (unsigned long)__va(pfn << PAGE_SHIFT); +		ptep = lookup_address(address, &level); + +		if (WARN(ptep == NULL || level != PG_LEVEL_4K, +					"m2p_remove_override: pfn %lx not mapped", pfn)) +			return -EINVAL; +	} + +	spin_lock_irqsave(&m2p_override_lock, flags); +	list_del(&page->lru); +	spin_unlock_irqrestore(&m2p_override_lock, flags); + +	if (kmap_op != NULL) { +		if (!PageHighMem(page)) { +			struct multicall_space mcs; +			struct gnttab_unmap_and_replace *unmap_op; +			struct page *scratch_page = get_balloon_scratch_page(); +			unsigned long scratch_page_address = (unsigned long) +				__va(page_to_pfn(scratch_page) << PAGE_SHIFT); + +			/* +			 * It might be that we queued all the m2p grant table +			 * hypercalls in a multicall, then m2p_remove_override +			 * get called before the multicall has actually been +			 * issued. In this case handle is going to -1 because +			 * it hasn't been modified yet. +			 */ +			if (kmap_op->handle == -1) +				xen_mc_flush(); +			/* +			 * Now if kmap_op->handle is negative it means that the +			 * hypercall actually returned an error. 
+			 */ +			if (kmap_op->handle == GNTST_general_error) { +				printk(KERN_WARNING "m2p_remove_override: " +						"pfn %lx mfn %lx, failed to modify kernel mappings", +						pfn, mfn); +				put_balloon_scratch_page(); +				return -1; +			} + +			xen_mc_batch(); + +			mcs = __xen_mc_entry( +					sizeof(struct gnttab_unmap_and_replace)); +			unmap_op = mcs.args; +			unmap_op->host_addr = kmap_op->host_addr; +			unmap_op->new_addr = scratch_page_address; +			unmap_op->handle = kmap_op->handle; + +			MULTI_grant_table_op(mcs.mc, +					GNTTABOP_unmap_and_replace, unmap_op, 1); + +			mcs = __xen_mc_entry(0); +			MULTI_update_va_mapping(mcs.mc, scratch_page_address, +					pfn_pte(page_to_pfn(scratch_page), +					PAGE_KERNEL_RO), 0); + +			xen_mc_issue(PARAVIRT_LAZY_MMU); + +			kmap_op->host_addr = 0; +			put_balloon_scratch_page(); +		} +	} + +	/* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present +	 * somewhere in this domain, even before being added to the +	 * m2p_override (see comment above in m2p_add_override). +	 * If there are no other entries in the m2p_override corresponding +	 * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for +	 * the original pfn (the one shared by the frontend): the backend +	 * cannot do any IO on this page anymore because it has been +	 * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of +	 * the original pfn causes mfn_to_pfn(mfn) to return the frontend +	 * pfn again. */ +	mfn &= ~FOREIGN_FRAME_BIT; +	pfn = mfn_to_pfn_no_overrides(mfn); +	if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && +			m2p_find_override(mfn) == NULL) +		set_phys_to_machine(pfn, mfn); + +	return 0; +} +EXPORT_SYMBOL_GPL(m2p_remove_override); + +struct page *m2p_find_override(unsigned long mfn) +{ +	unsigned long flags; +	struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; +	struct page *p, *ret; + +	ret = NULL; + +	spin_lock_irqsave(&m2p_override_lock, flags); + +	list_for_each_entry(p, bucket, lru) { +		if (page_private(p) == mfn) { +			ret = p; +			break; +		} +	} + +	spin_unlock_irqrestore(&m2p_override_lock, flags); + +	return ret; +} + +unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) +{ +	struct page *p = m2p_find_override(mfn); +	unsigned long ret = pfn; + +	if (p) +		ret = page_to_pfn(p); + +	return ret; +} +EXPORT_SYMBOL_GPL(m2p_find_override_pfn); + +#ifdef CONFIG_XEN_DEBUG_FS +#include <linux/debugfs.h> +#include "debugfs.h" +static int p2m_dump_show(struct seq_file *m, void *v) +{ +	static const char * const level_name[] = { "top", "middle", +						"entry", "abnormal", "error"}; +#define TYPE_IDENTITY 0 +#define TYPE_MISSING 1 +#define TYPE_PFN 2 +#define TYPE_UNKNOWN 3 +	static const char * const type_name[] = { +				[TYPE_IDENTITY] = "identity", +				[TYPE_MISSING] = "missing", +				[TYPE_PFN] = "pfn", +				[TYPE_UNKNOWN] = "abnormal"}; +	unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; +	unsigned int uninitialized_var(prev_level); +	unsigned int uninitialized_var(prev_type); + +	if (!p2m_top) +		return 0; + +	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { +		unsigned topidx = p2m_top_index(pfn); +		unsigned mididx = p2m_mid_index(pfn); +		unsigned idx = p2m_index(pfn); +		unsigned lvl, type; + +		lvl = 4; +		type = TYPE_UNKNOWN; +		if (p2m_top[topidx] == p2m_mid_missing) { +			lvl = 0; type = TYPE_MISSING; +		} else if (p2m_top[topidx] == NULL) { +			lvl = 0; type = TYPE_UNKNOWN; +		} else if (p2m_top[topidx][mididx] == NULL) { +			lvl = 1; type = TYPE_UNKNOWN; +		} else if 
(p2m_top[topidx][mididx] == p2m_identity) { +			lvl = 1; type = TYPE_IDENTITY; +		} else if (p2m_top[topidx][mididx] == p2m_missing) { +			lvl = 1; type = TYPE_MISSING; +		} else if (p2m_top[topidx][mididx][idx] == 0) { +			lvl = 2; type = TYPE_UNKNOWN; +		} else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { +			lvl = 2; type = TYPE_IDENTITY; +		} else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { +			lvl = 2; type = TYPE_MISSING; +		} else if (p2m_top[topidx][mididx][idx] == pfn) { +			lvl = 2; type = TYPE_PFN; +		} else if (p2m_top[topidx][mididx][idx] != pfn) { +			lvl = 2; type = TYPE_PFN; +		} +		if (pfn == 0) { +			prev_level = lvl; +			prev_type = type; +		} +		if (pfn == MAX_DOMAIN_PAGES-1) { +			lvl = 3; +			type = TYPE_UNKNOWN; +		} +		if (prev_type != type) { +			seq_printf(m, " [0x%lx->0x%lx] %s\n", +				prev_pfn_type, pfn, type_name[prev_type]); +			prev_pfn_type = pfn; +			prev_type = type; +		} +		if (prev_level != lvl) { +			seq_printf(m, " [0x%lx->0x%lx] level %s\n", +				prev_pfn_level, pfn, level_name[prev_level]); +			prev_pfn_level = pfn; +			prev_level = lvl; +		} +	} +	return 0; +#undef TYPE_IDENTITY +#undef TYPE_MISSING +#undef TYPE_PFN +#undef TYPE_UNKNOWN +} + +static int p2m_dump_open(struct inode *inode, struct file *filp) +{ +	return single_open(filp, p2m_dump_show, NULL); +} + +static const struct file_operations p2m_dump_fops = { +	.open		= p2m_dump_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +static struct dentry *d_mmu_debug; + +static int __init xen_p2m_debugfs(void) +{ +	struct dentry *d_xen = xen_init_debugfs(); + +	if (d_xen == NULL) +		return -ENOMEM; + +	d_mmu_debug = debugfs_create_dir("mmu", d_xen); + +	debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); +	return 0; +} +fs_initcall(xen_p2m_debugfs); +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index bfd0632fe65..0e98e5d241d 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -8,12 +8,20 @@  #include <xen/xen.h>  #include <asm/iommu_table.h> + +#include <asm/xen/swiotlb-xen.h> +#ifdef CONFIG_X86_64 +#include <asm/iommu.h> +#include <asm/dma.h> +#endif +#include <linux/export.h> +  int xen_swiotlb __read_mostly;  static struct dma_map_ops xen_swiotlb_dma_ops = {  	.mapping_error = xen_swiotlb_dma_mapping_error, -	.alloc_coherent = xen_swiotlb_alloc_coherent, -	.free_coherent = xen_swiotlb_free_coherent, +	.alloc = xen_swiotlb_alloc_coherent, +	.free = xen_swiotlb_free_coherent,  	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,  	.sync_single_for_device = xen_swiotlb_sync_single_for_device,  	.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, @@ -34,34 +42,68 @@ static struct dma_map_ops xen_swiotlb_dma_ops = {  int __init pci_xen_swiotlb_detect(void)  { +	if (!xen_pv_domain()) +		return 0; +  	/* If running as PV guest, either iommu=soft, or swiotlb=force will  	 * activate this IOMMU. If running as PV privileged, activate it -	 * irregardlesss. +	 * irregardless.  	 */ -	if ((xen_initial_domain() || swiotlb || swiotlb_force) && -	    (xen_pv_domain())) +	if ((xen_initial_domain() || swiotlb || swiotlb_force))  		xen_swiotlb = 1;  	/* If we are running under Xen, we MUST disable the native SWIOTLB.  	 * Don't worry about swiotlb_force flag activating the native, as  	 * the 'swiotlb' flag is the only one turning it on. 
*/ -	if (xen_pv_domain()) -		swiotlb = 0; +	swiotlb = 0; +#ifdef CONFIG_X86_64 +	/* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0 +	 * (so no iommu=X command line over-writes). +	 * Considering that PV guests do not want the *native SWIOTLB* but +	 * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here. +	 */ +	if (max_pfn > MAX_DMA32_PFN) +		no_iommu = 1; +#endif  	return xen_swiotlb;  }  void __init pci_xen_swiotlb_init(void)  {  	if (xen_swiotlb) { -		xen_swiotlb_init(1); +		xen_swiotlb_init(1, true /* early */);  		dma_ops = &xen_swiotlb_dma_ops; +#ifdef CONFIG_PCI  		/* Make sure ACS will be enabled */  		pci_request_acs(); +#endif  	}  } + +int pci_xen_swiotlb_init_late(void) +{ +	int rc; + +	if (xen_swiotlb) +		return 0; + +	rc = xen_swiotlb_init(1, false /* late */); +	if (rc) +		return rc; + +	dma_ops = &xen_swiotlb_dma_ops; +#ifdef CONFIG_PCI +	/* Make sure ACS will be enabled */ +	pci_request_acs(); +#endif + +	return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); +  IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, -		  0, +		  NULL,  		  pci_xen_swiotlb_init, -		  0); +		  NULL); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 0f456386cce..a8261716d58 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -24,18 +24,18 @@  #include <linux/module.h>  #include <xen/platform_pci.h> +#include "xen-ops.h"  #define XEN_PLATFORM_ERR_MAGIC -1  #define XEN_PLATFORM_ERR_PROTOCOL -2  #define XEN_PLATFORM_ERR_BLACKLIST -3 -/* store the value of xen_emul_unplug after the unplug is done */ -int xen_platform_pci_unplug; -EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);  #ifdef CONFIG_XEN_PVHVM +/* store the value of xen_emul_unplug after the unplug is done */ +static int xen_platform_pci_unplug;  static int xen_emul_unplug; -static int __init check_platform_magic(void) +static int check_platform_magic(void)  {  	short magic;  	char protocol; @@ -68,7 +68,81 @@ static int __init check_platform_magic(void)  	return 0;  } -void __init xen_unplug_emulated_devices(void) +bool xen_has_pv_devices() +{ +	if (!xen_domain()) +		return false; + +	/* PV domains always have them. */ +	if (xen_pv_domain()) +		return true; + +	/* And user has xen_platform_pci=0 set in guest config as +	 * driver did not modify the value. */ +	if (xen_platform_pci_unplug == 0) +		return false; + +	if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER) +		return false; + +	if (xen_platform_pci_unplug & XEN_UNPLUG_ALL) +		return true; + +	/* This is an odd one - we are going to run legacy +	 * and PV drivers at the same time. */ +	if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) +		return true; + +	/* And the caller has to follow with xen_pv_{disk,nic}_devices +	 * to be certain which driver can load. */ +	return false; +} +EXPORT_SYMBOL_GPL(xen_has_pv_devices); + +static bool __xen_has_pv_device(int state) +{ +	/* HVM domains might or might not */ +	if (xen_hvm_domain() && (xen_platform_pci_unplug & state)) +		return true; + +	return xen_has_pv_devices(); +} + +bool xen_has_pv_nic_devices(void) +{ +	return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL); +} +EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices); + +bool xen_has_pv_disk_devices(void) +{ +	return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS | +				   XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL); +} +EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices); + +/* + * This one is odd - it determines whether you want to run PV _and_ + * legacy (IDE) drivers together. 
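
Aside (hypothetical usage sketch, not part of the patch): the xen_has_pv_*() helpers exported above are intended for frontend drivers deciding whether to bind. A frontend init might gate itself like this; the driver name is made up, and the assumption is that the declarations live in <xen/platform_pci.h>:

#include <linux/module.h>
#include <linux/errno.h>
#include <xen/platform_pci.h>	/* assumed location of the declarations */

static int __init example_blkfront_init(void)
{
	/* PV guest, or HVM guest that unplugged its emulated disks:
	 * register the PV frontend. Otherwise stay out of the way of
	 * the emulated (IDE/AHCI) path. */
	if (!xen_has_pv_disk_devices())
		return -ENODEV;

	/* ... xenbus_register_frontend(...) would go here ... */
	return 0;
}
module_init(example_blkfront_init);
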
This combination is only possible + * under HVM. + */ +bool xen_has_pv_and_legacy_disk_devices(void) +{ +	if (!xen_domain()) +		return false; + +	/* N.B. This is only ever used in HVM mode */ +	if (xen_pv_domain()) +		return false; + +	if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) +		return true; + +	return false; +} +EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices); + +void xen_unplug_emulated_devices(void)  {  	int r; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 769c4b01fa3..2e555163c2f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -9,12 +9,15 @@  #include <linux/mm.h>  #include <linux/pm.h>  #include <linux/memblock.h> +#include <linux/cpuidle.h> +#include <linux/cpufreq.h>  #include <asm/elf.h>  #include <asm/vdso.h>  #include <asm/e820.h>  #include <asm/setup.h>  #include <asm/acpi.h> +#include <asm/numa.h>  #include <asm/xen/hypervisor.h>  #include <asm/xen/hypercall.h> @@ -23,21 +26,25 @@  #include <xen/interface/callback.h>  #include <xen/interface/memory.h>  #include <xen/interface/physdev.h> -#include <xen/interface/memory.h>  #include <xen/features.h> -  #include "xen-ops.h"  #include "vdso.h"  /* These are code, but not functions.  Defined in entry.S */  extern const char xen_hypervisor_callback[];  extern const char xen_failsafe_callback[]; +#ifdef CONFIG_X86_64 +extern asmlinkage void nmi(void); +#endif  extern void xen_sysenter_target(void);  extern void xen_syscall_target(void);  extern void xen_syscall32_target(void);  /* Amount of extra memory space we add to the e820 ranges */ -phys_addr_t xen_extra_mem_start, xen_extra_mem_size; +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; + +/* Number of pages released from the initial allocation. */ +unsigned long xen_released_pages;  /*    * The maximum amount of extra memory compared to the base size.  The @@ -51,94 +58,281 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;   */  #define EXTRA_MEM_RATIO		(10) -static __init void xen_add_extra_mem(unsigned long pages) +static void __init xen_add_extra_mem(u64 start, u64 size)  { -	u64 size = (u64)pages * PAGE_SIZE; -	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; +	unsigned long pfn; +	int i; -	if (!pages) -		return; +	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { +		/* Add new region. */ +		if (xen_extra_mem[i].size == 0) { +			xen_extra_mem[i].start = start; +			xen_extra_mem[i].size  = size; +			break; +		} +		/* Append to existing region. 
*/ +		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { +			xen_extra_mem[i].size += size; +			break; +		} +	} +	if (i == XEN_EXTRA_MEM_MAX_REGIONS) +		printk(KERN_WARNING "Warning: not enough extra memory regions\n"); -	e820_add_region(extra_start, size, E820_RAM); -	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +	memblock_reserve(start, size); -	memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); +	xen_max_p2m_pfn = PFN_DOWN(start + size); +	for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { +		unsigned long mfn = pfn_to_mfn(pfn); -	xen_extra_mem_size += size; +		if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) +			continue; +		WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", +			  pfn, mfn); -	xen_max_p2m_pfn = PFN_DOWN(extra_start + size); +		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY); +	}  } -static unsigned long __init xen_release_chunk(phys_addr_t start_addr, -					      phys_addr_t end_addr) +static unsigned long __init xen_do_chunk(unsigned long start, +					 unsigned long end, bool release)  {  	struct xen_memory_reservation reservation = {  		.address_bits = 0,  		.extent_order = 0,  		.domid        = DOMID_SELF  	}; -	unsigned long start, end;  	unsigned long len = 0;  	unsigned long pfn;  	int ret; -	start = PFN_UP(start_addr); -	end = PFN_DOWN(end_addr); - -	if (end <= start) -		return 0; - -	printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", -	       start, end); -	for(pfn = start; pfn < end; pfn++) { +	for (pfn = start; pfn < end; pfn++) { +		unsigned long frame;  		unsigned long mfn = pfn_to_mfn(pfn); -		/* Make sure pfn exists to start with */ -		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) -			continue; - -		set_xen_guest_handle(reservation.extent_start, &mfn); +		if (release) { +			/* Make sure pfn exists to start with */ +			if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) +				continue; +			frame = mfn; +		} else { +			if (mfn != INVALID_P2M_ENTRY) +				continue; +			frame = pfn; +		} +		set_xen_guest_handle(reservation.extent_start, &frame);  		reservation.nr_extents = 1; -		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, +		ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,  					   &reservation); -		WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", -		     start, end, ret); +		WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", +		     release ? "release" : "populate", pfn, ret); +  		if (ret == 1) { -			set_phys_to_machine(pfn, INVALID_P2M_ENTRY); +			if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { +				if (release) +					break; +				set_xen_guest_handle(reservation.extent_start, &frame); +				reservation.nr_extents = 1; +				ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, +							   &reservation); +				break; +			}  			len++; -		} +		} else +			break;  	} -	printk(KERN_CONT "%ld pages freed\n", len); +	if (len) +		printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", +		       release ? "Freeing" : "Populating", +		       start, end, len, +		       release ? 
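
Aside (standalone model, not part of the patch): xen_add_extra_mem() above keeps a small fixed array of extra-memory regions, appending a new range to the region it abuts or claiming the first empty slot. A simplified userspace version of that bookkeeping:

#include <stdio.h>

#define MAX_REGIONS 4	/* stands in for XEN_EXTRA_MEM_MAX_REGIONS */

struct region { unsigned long long start, size; };

static void add_region(struct region *r, unsigned long long start,
		       unsigned long long size)
{
	int i;

	for (i = 0; i < MAX_REGIONS; i++) {
		if (r[i].size == 0) {			/* empty slot: new region */
			r[i].start = start;
			r[i].size  = size;
			return;
		}
		if (r[i].start + r[i].size == start) {	/* contiguous: append */
			r[i].size += size;
			return;
		}
	}
	printf("warning: not enough extra memory regions\n");
}

int main(void)
{
	struct region r[MAX_REGIONS] = { { 0, 0 } };

	add_region(r, 0x100000000ULL, 0x10000000ULL);	/* starts region 0 */
	add_region(r, 0x110000000ULL, 0x10000000ULL);	/* appended to region 0 */
	printf("region 0: start=%#llx size=%#llx\n", r[0].start, r[0].size);
	return 0;
}
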
"freed" : "added");  	return len;  } -static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, -						     const struct e820map *e820) +static unsigned long __init xen_release_chunk(unsigned long start, +					      unsigned long end) +{ +	return xen_do_chunk(start, end, true); +} + +static unsigned long __init xen_populate_chunk( +	const struct e820entry *list, size_t map_size, +	unsigned long max_pfn, unsigned long *last_pfn, +	unsigned long credits_left) +{ +	const struct e820entry *entry; +	unsigned int i; +	unsigned long done = 0; +	unsigned long dest_pfn; + +	for (i = 0, entry = list; i < map_size; i++, entry++) { +		unsigned long s_pfn; +		unsigned long e_pfn; +		unsigned long pfns; +		long capacity; + +		if (credits_left <= 0) +			break; + +		if (entry->type != E820_RAM) +			continue; + +		e_pfn = PFN_DOWN(entry->addr + entry->size); + +		/* We only care about E820 after the xen_start_info->nr_pages */ +		if (e_pfn <= max_pfn) +			continue; + +		s_pfn = PFN_UP(entry->addr); +		/* If the E820 falls within the nr_pages, we want to start +		 * at the nr_pages PFN. +		 * If that would mean going past the E820 entry, skip it +		 */ +		if (s_pfn <= max_pfn) { +			capacity = e_pfn - max_pfn; +			dest_pfn = max_pfn; +		} else { +			capacity = e_pfn - s_pfn; +			dest_pfn = s_pfn; +		} + +		if (credits_left < capacity) +			capacity = credits_left; + +		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); +		done += pfns; +		*last_pfn = (dest_pfn + pfns); +		if (pfns < capacity) +			break; +		credits_left -= pfns; +	} +	return done; +} + +static void __init xen_set_identity_and_release_chunk( +	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, +	unsigned long *released, unsigned long *identity)  { -	phys_addr_t max_addr = PFN_PHYS(max_pfn); -	phys_addr_t last_end = ISA_END_ADDRESS; +	unsigned long pfn; + +	/* +	 * If the PFNs are currently mapped, clear the mappings +	 * (except for the ISA region which must be 1:1 mapped) to +	 * release the refcounts (in Xen) on the original frames. +	 */ +	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { +		pte_t pte = __pte_ma(0); + +		if (pfn < PFN_UP(ISA_END_ADDRESS)) +			pte = mfn_pte(pfn, PAGE_KERNEL_IO); + +		(void)HYPERVISOR_update_va_mapping( +			(unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); +	} + +	if (start_pfn < nr_pages) +		*released += xen_release_chunk( +			start_pfn, min(end_pfn, nr_pages)); + +	*identity += set_phys_range_identity(start_pfn, end_pfn); +} + +static unsigned long __init xen_set_identity_and_release( +	const struct e820entry *list, size_t map_size, unsigned long nr_pages) +{ +	phys_addr_t start = 0;  	unsigned long released = 0; +	unsigned long identity = 0; +	const struct e820entry *entry;  	int i; -	/* Free any unused memory above the low 1Mbyte. */ -	for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { -		phys_addr_t end = e820->map[i].addr; -		end = min(max_addr, end); +	/* +	 * Combine non-RAM regions and gaps until a RAM region (or the +	 * end of the map) is reached, then set the 1:1 map and +	 * release the pages (if available) in those non-RAM regions. +	 * +	 * The combined non-RAM regions are rounded to a whole number +	 * of pages so any partial pages are accessible via the 1:1 +	 * mapping.  This is needed for some BIOSes that put (for +	 * example) the DMI tables in a reserved region that begins on +	 * a non-page boundary. 
+	 */ +	for (i = 0, entry = list; i < map_size; i++, entry++) { +		phys_addr_t end = entry->addr + entry->size; +		if (entry->type == E820_RAM || i == map_size - 1) { +			unsigned long start_pfn = PFN_DOWN(start); +			unsigned long end_pfn = PFN_UP(end); + +			if (entry->type == E820_RAM) +				end_pfn = PFN_UP(entry->addr); -		if (last_end < end) -			released += xen_release_chunk(last_end, end); -		last_end = max(last_end, e820->map[i].addr + e820->map[i].size); +			if (start_pfn < end_pfn) +				xen_set_identity_and_release_chunk( +					start_pfn, end_pfn, nr_pages, +					&released, &identity); + +			start = end; +		}  	} -	if (last_end < max_addr) -		released += xen_release_chunk(last_end, max_addr); +	if (released) +		printk(KERN_INFO "Released %lu pages of unused memory\n", released); +	if (identity) +		printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); -	printk(KERN_INFO "released %ld pages of unused memory\n", released);  	return released;  } +static unsigned long __init xen_get_max_pages(void) +{ +	unsigned long max_pages = MAX_DOMAIN_PAGES; +	domid_t domid = DOMID_SELF; +	int ret; + +	/* +	 * For the initial domain we use the maximum reservation as +	 * the maximum page. +	 * +	 * For guest domains the current maximum reservation reflects +	 * the current maximum rather than the static maximum. In this +	 * case the e820 map provided to us will cover the static +	 * maximum region. +	 */ +	if (xen_initial_domain()) { +		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); +		if (ret > 0) +			max_pages = ret; +	} + +	return min(max_pages, MAX_DOMAIN_PAGES); +} + +static void xen_align_and_add_e820_region(u64 start, u64 size, int type) +{ +	u64 end = start + size; + +	/* Align RAM regions to page boundaries. */ +	if (type == E820_RAM) { +		start = PAGE_ALIGN(start); +		end &= ~((u64)PAGE_SIZE - 1); +	} + +	e820_add_region(start, end - start, type); +} + +void xen_ignore_unusable(struct e820entry *list, size_t map_size) +{ +	struct e820entry *entry; +	unsigned int i; + +	for (i = 0, entry = list; i < map_size; i++, entry++) { +		if (entry->type == E820_UNUSABLE) +			entry->type = E820_RAM; +	} +} +  /**   * machine_specific_memory_setup - Hook for machine specific memory setup.   **/ @@ -150,8 +344,10 @@ char * __init xen_memory_setup(void)  	unsigned long long mem_end;  	int rc;  	struct xen_memory_map memmap; +	unsigned long max_pages; +	unsigned long last_pfn = 0;  	unsigned long extra_pages = 0; -	unsigned long extra_limit; +	unsigned long populated;  	int i;  	int op; @@ -177,40 +373,96 @@ char * __init xen_memory_setup(void)  	}  	BUG_ON(rc); -	e820.nr_map = 0; -	xen_extra_mem_start = mem_end; -	for (i = 0; i < memmap.nr_entries; i++) { -		unsigned long long end = map[i].addr + map[i].size; +	/* +	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE +	 * regions, so if we're using the machine memory map leave the +	 * region as RAM as it is in the pseudo-physical map. +	 * +	 * UNUSABLE regions in domUs are not handled and will need +	 * a patch in the future. +	 */ +	if (xen_initial_domain()) +		xen_ignore_unusable(map, memmap.nr_entries); -		if (map[i].type == E820_RAM) { -			if (map[i].addr < mem_end && end > mem_end) { -				/* Truncate region to max_mem. */ -				u64 delta = end - mem_end; +	/* Make sure the Xen-supplied memory map is well-ordered. 
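
Aside (worked example, not part of the patch): xen_align_and_add_e820_region() above rounds the other way for RAM regions - start up, end down - so only whole pages get reported as usable RAM. With hypothetical addresses:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	/* A RAM region that neither starts nor ends on a page boundary. */
	uint64_t start = 0x100234, size = 0x20000;
	uint64_t end   = start + size;

	uint64_t ram_start = PAGE_ALIGN(start);		/* rounded up:   0x101000 */
	uint64_t ram_end   = end & PAGE_MASK;		/* rounded down: 0x120000 */

	printf("RAM %#llx-%#llx trimmed to %#llx-%#llx\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)ram_start, (unsigned long long)ram_end);
	return 0;
}
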
*/ +	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); -				map[i].size -= delta; -				extra_pages += PFN_DOWN(delta); +	max_pages = xen_get_max_pages(); +	if (max_pages > max_pfn) +		extra_pages += max_pages - max_pfn; -				end = mem_end; -			} +	/* +	 * Set P2M for all non-RAM pages and E820 gaps to be identity +	 * type PFNs.  Any RAM pages that would be made inaccesible by +	 * this are first released. +	 */ +	xen_released_pages = xen_set_identity_and_release( +		map, memmap.nr_entries, max_pfn); + +	/* +	 * Populate back the non-RAM pages and E820 gaps that had been +	 * released. */ +	populated = xen_populate_chunk(map, memmap.nr_entries, +			max_pfn, &last_pfn, xen_released_pages); + +	xen_released_pages -= populated; +	extra_pages += xen_released_pages; + +	if (last_pfn > max_pfn) { +		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); +		mem_end = PFN_PHYS(max_pfn); +	} +	/* +	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO +	 * factor the base size.  On non-highmem systems, the base +	 * size is the full initial memory allocation; on highmem it +	 * is limited to the max size of lowmem, so that it doesn't +	 * get completely filled. +	 * +	 * In principle there could be a problem in lowmem systems if +	 * the initial memory is also very large with respect to +	 * lowmem, but we won't try to deal with that here. +	 */ +	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), +			  extra_pages); +	i = 0; +	while (i < memmap.nr_entries) { +		u64 addr = map[i].addr; +		u64 size = map[i].size; +		u32 type = map[i].type; + +		if (type == E820_RAM) { +			if (addr < mem_end) { +				size = min(size, mem_end - addr); +			} else if (extra_pages) { +				size = min(size, (u64)extra_pages * PAGE_SIZE); +				extra_pages -= size / PAGE_SIZE; +				xen_add_extra_mem(addr, size); +			} else +				type = E820_UNUSABLE;  		} -		if (end > xen_extra_mem_start) -			xen_extra_mem_start = end; +		xen_align_and_add_e820_region(addr, size, type); -		/* If region is non-RAM or below mem_end, add what remains */ -		if ((map[i].type != E820_RAM || map[i].addr < mem_end) && -		    map[i].size > 0) -			e820_add_region(map[i].addr, map[i].size, map[i].type); +		map[i].addr += size; +		map[i].size -= size; +		if (map[i].size == 0) +			i++;  	}  	/* +	 * Set the rest as identity mapped, in case PCI BARs are +	 * located here. +	 * +	 * PFNs above MAX_P2M_PFN are considered identity mapped as +	 * well. +	 */ +	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + +	/*  	 * In domU, the ISA region is normal, usable memory, but we  	 * reserve ISA memory anyway because too many things poke  	 * about in there. -	 * -	 * In Dom0, the host E820 information can leave gaps in the -	 * ISA range, which would cause us to release those pages.  To -	 * avoid this, we unconditionally reserve them here.  	 */  	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,  			E820_RESERVED); @@ -220,52 +472,60 @@ char * __init xen_memory_setup(void)  	 *  - mfn_list  	 *  - xen_start_info  	 * See comment above "struct start_info" in <xen/interface/xen.h> +	 * We tried to make the the memblock_reserve more selective so +	 * that it would be clear what region is reserved. Sadly we ran +	 * in the problem wherein on a 64-bit hypervisor with a 32-bit +	 * initial domain, the pt_base has the cr3 value which is not +	 * neccessarily where the pagetable starts! 
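
Aside (numeric sketch with made-up figures, not part of the patch): the clamp above caps ballooning headroom at EXTRA_MEM_RATIO (10x) times the smaller of the initial allocation and lowmem:

#include <stdio.h>

#define EXTRA_MEM_RATIO 10UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical 64-bit domU: 1 GiB initial allocation = 262144 pages,
	 * no lowmem limit, and 3000000 pages of release/balloon candidates. */
	unsigned long max_pfn = 262144, maxmem_pfn = ~0UL;
	unsigned long extra_pages = 3000000;

	extra_pages = min_ul(EXTRA_MEM_RATIO * min_ul(max_pfn, maxmem_pfn),
			     extra_pages);
	printf("clamped extra_pages = %lu\n", extra_pages);	/* 2621440 = 10 x 262144 */
	return 0;
}
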
As Jan put it: " +	 * Actually, the adjustment turns out to be correct: The page +	 * tables for a 32-on-64 dom0 get allocated in the order "first L1", +	 * "first L2", "first L3", so the offset to the page table base is +	 * indeed 2. When reading xen/include/public/xen.h's comment +	 * very strictly, this is not a violation (since there nothing is said +	 * that the first thing in the page table space is pointed to by +	 * pt_base; I admit that this seems to be implied though, namely +	 * do I think that it is implied that the page table space is the +	 * range [pt_base, pt_base + nt_pt_frames), whereas that +	 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), +	 * which - without a priori knowledge - the kernel would have +	 * difficulty to figure out)." - so lets just fall back to the +	 * easy way and reserve the whole region.  	 */ -	memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), -		      __pa(xen_start_info->pt_base), -			"XEN START INFO"); +	memblock_reserve(__pa(xen_start_info->mfn_list), +			 xen_start_info->pt_base - xen_start_info->mfn_list);  	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); -	extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); +	return "Xen"; +} -	/* -	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO -	 * factor the base size.  On non-highmem systems, the base -	 * size is the full initial memory allocation; on highmem it -	 * is limited to the max size of lowmem, so that it doesn't -	 * get completely filled. -	 * -	 * In principle there could be a problem in lowmem systems if -	 * the initial memory is also very large with respect to -	 * lowmem, but we won't try to deal with that here. -	 */ -	extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), -			  max_pfn + extra_pages); +/* + * Machine specific memory setup for auto-translated guests. + */ +char * __init xen_auto_xlated_memory_setup(void) +{ +	static struct e820entry map[E820MAX] __initdata; -	if (extra_limit >= max_pfn) -		extra_pages = extra_limit - max_pfn; -	else -		extra_pages = 0; +	struct xen_memory_map memmap; +	int i; +	int rc; -	if (!xen_initial_domain()) -		xen_add_extra_mem(extra_pages); +	memmap.nr_entries = E820MAX; +	set_xen_guest_handle(memmap.buffer, map); -	return "Xen"; -} +	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); +	if (rc < 0) +		panic("No memory map (%d)\n", rc); -static void xen_idle(void) -{ -	local_irq_disable(); - -	if (need_resched()) -		local_irq_enable(); -	else { -		current_thread_info()->status &= ~TS_POLLING; -		smp_mb__after_clear_bit(); -		safe_halt(); -		current_thread_info()->status |= TS_POLLING; -	} +	sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); + +	for (i = 0; i < memmap.nr_entries; i++) +		e820_add_region(map[i].addr, map[i].size, map[i].type); + +	memblock_reserve(__pa(xen_start_info->mfn_list), +			 xen_start_info->pt_base - xen_start_info->mfn_list); + +	return "Xen";  }  /* @@ -276,15 +536,22 @@ static void xen_idle(void)  static void __init fiddle_vdso(void)  {  #ifdef CONFIG_X86_32 +	/* +	 * This could be called before selected_vdso32 is initialized, so +	 * just fiddle with both possible images.  vdso_image_32_syscall +	 * can't be selected, since it only exists on 64-bit systems. 
+	 */  	u32 *mask; -	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK); +	mask = vdso_image_32_int80.data + +		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;  	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; -	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK); +	mask = vdso_image_32_sysenter.data + +		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;  	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;  #endif  } -static __cpuinit int register_callback(unsigned type, const void *func) +static int register_callback(unsigned type, const void *func)  {  	struct callback_register callback = {  		.type = type, @@ -295,7 +562,7 @@ static __cpuinit int register_callback(unsigned type, const void *func)  	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);  } -void __cpuinit xen_enable_sysenter(void) +void xen_enable_sysenter(void)  {  	int ret;  	unsigned sysenter_feature; @@ -314,7 +581,7 @@ void __cpuinit xen_enable_sysenter(void)  		setup_clear_cpu_cap(sysenter_feature);  } -void __cpuinit xen_enable_syscall(void) +void xen_enable_syscall(void)  {  #ifdef CONFIG_X86_64  	int ret; @@ -335,19 +602,13 @@ void __cpuinit xen_enable_syscall(void)  #endif /* CONFIG_X86_64 */  } -void __init xen_arch_setup(void) +void __init xen_pvmmu_arch_setup(void)  { -	struct physdev_set_iopl set_iopl; -	int rc; - -	xen_panic_handler_init(); -  	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);  	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); -	if (!xen_feature(XENFEAT_auto_translated_physmap)) -		HYPERVISOR_vm_assist(VMASST_CMD_enable, -				     VMASST_TYPE_pae_extended_cr3); +	HYPERVISOR_vm_assist(VMASST_CMD_enable, +			     VMASST_TYPE_pae_extended_cr3);  	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||  	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) @@ -355,11 +616,14 @@ void __init xen_arch_setup(void)  	xen_enable_sysenter();  	xen_enable_syscall(); +} -	set_iopl.iopl = 1; -	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -	if (rc != 0) -		printk(KERN_INFO "physdev_op failed %d\n", rc); +/* This function is not called for HVM domains */ +void __init xen_arch_setup(void) +{ +	xen_panic_handler_init(); +	if (!xen_feature(XENFEAT_auto_translated_physmap)) +		xen_pvmmu_arch_setup();  #ifdef CONFIG_ACPI  	if (!(xen_start_info->flags & SIF_INITDOMAIN)) { @@ -372,7 +636,12 @@ void __init xen_arch_setup(void)  	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?  	       
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); -	pm_idle = xen_idle; - +	/* Set up idle, making sure it calls safe_halt() pvop */ +	disable_cpuidle(); +	disable_cpufreq(); +	WARN_ON(xen_set_default_idle());  	fiddle_vdso(); +#ifdef CONFIG_NUMA +	numa_off = 1; +#endif  } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 72a4c795904..7005974c3ff 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -16,6 +16,8 @@  #include <linux/err.h>  #include <linux/slab.h>  #include <linux/smp.h> +#include <linux/irq_work.h> +#include <linux/tick.h>  #include <asm/paravirt.h>  #include <asm/desc.h> @@ -32,42 +34,50 @@  #include <xen/page.h>  #include <xen/events.h> +#include <xen/hvc-console.h>  #include "xen-ops.h"  #include "mmu.h"  cpumask_var_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, xen_resched_irq); -static DEFINE_PER_CPU(int, xen_callfunc_irq); -static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); -static DEFINE_PER_CPU(int, xen_debug_irq) = -1; +struct xen_common_irq { +	int irq; +	char *name; +}; +static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };  static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);  static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);  /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back.   */  static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)  {  	inc_irq_stat(irq_resched_count); +	scheduler_ipi();  	return IRQ_HANDLED;  } -static __cpuinit void cpu_bringup(void) +static void cpu_bringup(void)  { -	int cpu = smp_processor_id(); +	int cpu;  	cpu_init();  	touch_softlockup_watchdog();  	preempt_disable(); -	xen_enable_sysenter(); -	xen_enable_syscall(); - +	/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ +	if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { +		xen_enable_sysenter(); +		xen_enable_syscall(); +	}  	cpu = smp_processor_id();  	smp_store_cpu_info(cpu);  	cpu_data(cpu).x86_max_cores = 1; @@ -75,8 +85,12 @@ static __cpuinit void cpu_bringup(void)  	xen_setup_cpu_clockevents(); +	notify_cpu_starting(cpu); +  	set_cpu_online(cpu, true); -	percpu_write(cpu_state, CPU_ONLINE); + +	this_cpu_write(cpu_state, CPU_ONLINE); +  	wmb();  	/* We can take interrupts now: we're officially "up". 
*/ @@ -85,71 +99,128 @@ static __cpuinit void cpu_bringup(void)  	wmb();			/* make sure everything is out */  } -static __cpuinit void cpu_bringup_and_idle(void) +/* Note: cpu parameter is only relevant for PVH */ +static void cpu_bringup_and_idle(int cpu)  { +#ifdef CONFIG_X86_64 +	if (xen_feature(XENFEAT_auto_translated_physmap) && +	    xen_feature(XENFEAT_supervisor_mode_kernel)) +		xen_pvh_secondary_vcpu_init(cpu); +#endif  	cpu_bringup(); -	cpu_idle(); +	cpu_startup_entry(CPUHP_ONLINE);  } +static void xen_smp_intr_free(unsigned int cpu) +{ +	if (per_cpu(xen_resched_irq, cpu).irq >= 0) { +		unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); +		per_cpu(xen_resched_irq, cpu).irq = -1; +		kfree(per_cpu(xen_resched_irq, cpu).name); +		per_cpu(xen_resched_irq, cpu).name = NULL; +	} +	if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) { +		unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL); +		per_cpu(xen_callfunc_irq, cpu).irq = -1; +		kfree(per_cpu(xen_callfunc_irq, cpu).name); +		per_cpu(xen_callfunc_irq, cpu).name = NULL; +	} +	if (per_cpu(xen_debug_irq, cpu).irq >= 0) { +		unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL); +		per_cpu(xen_debug_irq, cpu).irq = -1; +		kfree(per_cpu(xen_debug_irq, cpu).name); +		per_cpu(xen_debug_irq, cpu).name = NULL; +	} +	if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) { +		unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq, +				       NULL); +		per_cpu(xen_callfuncsingle_irq, cpu).irq = -1; +		kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); +		per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; +	} +	if (xen_hvm_domain()) +		return; + +	if (per_cpu(xen_irq_work, cpu).irq >= 0) { +		unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); +		per_cpu(xen_irq_work, cpu).irq = -1; +		kfree(per_cpu(xen_irq_work, cpu).name); +		per_cpu(xen_irq_work, cpu).name = NULL; +	} +};  static int xen_smp_intr_init(unsigned int cpu)  {  	int rc; -	const char *resched_name, *callfunc_name, *debug_name; +	char *resched_name, *callfunc_name, *debug_name;  	resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);  	rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,  				    cpu,  				    xen_reschedule_interrupt, -				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, +				    IRQF_PERCPU|IRQF_NOBALANCING,  				    resched_name,  				    NULL);  	if (rc < 0)  		goto fail; -	per_cpu(xen_resched_irq, cpu) = rc; +	per_cpu(xen_resched_irq, cpu).irq = rc; +	per_cpu(xen_resched_irq, cpu).name = resched_name;  	callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);  	rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,  				    cpu,  				    xen_call_function_interrupt, -				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, +				    IRQF_PERCPU|IRQF_NOBALANCING,  				    callfunc_name,  				    NULL);  	if (rc < 0)  		goto fail; -	per_cpu(xen_callfunc_irq, cpu) = rc; +	per_cpu(xen_callfunc_irq, cpu).irq = rc; +	per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;  	debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);  	rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, -				     IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING, +				     IRQF_PERCPU | IRQF_NOBALANCING,  				     debug_name, NULL);  	if (rc < 0)  		goto fail; -	per_cpu(xen_debug_irq, cpu) = rc; +	per_cpu(xen_debug_irq, cpu).irq = rc; +	per_cpu(xen_debug_irq, cpu).name = debug_name;  	callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);  	rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,  				    cpu,  				    
xen_call_function_single_interrupt, -				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, +				    IRQF_PERCPU|IRQF_NOBALANCING,  				    callfunc_name,  				    NULL);  	if (rc < 0)  		goto fail; -	per_cpu(xen_callfuncsingle_irq, cpu) = rc; +	per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; +	per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; + +	/* +	 * The IRQ worker on PVHVM goes through the native path and uses the +	 * IPI mechanism. +	 */ +	if (xen_hvm_domain()) +		return 0; + +	callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); +	rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, +				    cpu, +				    xen_irq_work_interrupt, +				    IRQF_PERCPU|IRQF_NOBALANCING, +				    callfunc_name, +				    NULL); +	if (rc < 0) +		goto fail; +	per_cpu(xen_irq_work, cpu).irq = rc; +	per_cpu(xen_irq_work, cpu).name = callfunc_name;  	return 0;   fail: -	if (per_cpu(xen_resched_irq, cpu) >= 0) -		unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); -	if (per_cpu(xen_callfunc_irq, cpu) >= 0) -		unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); -	if (per_cpu(xen_debug_irq, cpu) >= 0) -		unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); -	if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) -		unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), -				       NULL); - +	xen_smp_intr_free(cpu);  	return rc;  } @@ -172,6 +243,7 @@ static void __init xen_fill_possible_map(void)  static void __init xen_filter_cpu_maps(void)  {  	int i, rc; +	unsigned int subtract = 0;  	if (!xen_initial_domain())  		return; @@ -186,8 +258,22 @@ static void __init xen_filter_cpu_maps(void)  		} else {  			set_cpu_possible(i, false);  			set_cpu_present(i, false); +			subtract++;  		}  	} +#ifdef CONFIG_HOTPLUG_CPU +	/* This is akin to using 'nr_cpus' on the Linux command line. +	 * Which is OK as when we use 'dom0_max_vcpus=X' we can only +	 * have up to X, while nr_cpu_ids is greater than X. This +	 * normally is not a problem, except when CPU hotplugging +	 * is involved and then there might be more than X CPUs +	 * in the guest - which will not work as there is no +	 * hypercall to expand the max number of VCPUs an already +	 * running guest has. So cap it up to X. */ +	if (subtract) +		nr_cpu_ids = nr_cpu_ids - subtract; +#endif +  }  static void __init xen_smp_prepare_boot_cpu(void) @@ -195,22 +281,57 @@ static void __init xen_smp_prepare_boot_cpu(void)  	BUG_ON(smp_processor_id() != 0);  	native_smp_prepare_boot_cpu(); -	/* We've switched to the "real" per-cpu gdt, so make sure the -	   old memory can be recycled */ -	make_lowmem_page_readwrite(xen_initial_gdt); +	if (xen_pv_domain()) { +		if (!xen_feature(XENFEAT_writable_page_tables)) +			/* We've switched to the "real" per-cpu gdt, so make +			 * sure the old memory can be recycled. */ +			make_lowmem_page_readwrite(xen_initial_gdt); + +#ifdef CONFIG_X86_32 +		/* +		 * Xen starts us with XEN_FLAT_RING1_DS, but linux code +		 * expects __USER_DS +		 */ +		loadsegment(ds, __USER_DS); +		loadsegment(es, __USER_DS); +#endif -	xen_filter_cpu_maps(); -	xen_setup_vcpu_info_placement(); +		xen_filter_cpu_maps(); +		xen_setup_vcpu_info_placement(); +	} +	/* +	 * The alternative logic (which patches the unlock/lock) runs before +	 * the smp bootup up code is activated. Hence we need to set this up +	 * the core kernel is being patched. Otherwise we will have only +	 * modules patched but not core code. 
+	 */ +	xen_init_spinlocks();  }  static void __init xen_smp_prepare_cpus(unsigned int max_cpus)  {  	unsigned cpu; +	unsigned int i; +	if (skip_ioapic_setup) { +		char *m = (max_cpus == 0) ? +			"The nosmp parameter is incompatible with Xen; " \ +			"use Xen dom0_max_vcpus=1 parameter" : +			"The noapic parameter is incompatible with Xen"; + +		xen_raw_printk(m); +		panic(m); +	}  	xen_init_lock_cpu(0); -	smp_store_cpu_info(0); +	smp_store_boot_cpu_info();  	cpu_data(0).x86_max_cores = 1; + +	for_each_possible_cpu(i) { +		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); +		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); +		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); +	}  	set_cpu_sibling_map(0);  	if (xen_smp_intr_init(0)) @@ -228,21 +349,11 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)  		set_cpu_possible(cpu, false);  	} -	for_each_possible_cpu (cpu) { -		struct task_struct *idle; - -		if (cpu == 0) -			continue; - -		idle = fork_idle(cpu); -		if (IS_ERR(idle)) -			panic("failed fork for CPU %d", cpu); - +	for_each_possible_cpu(cpu)  		set_cpu_present(cpu, true); -	}  } -static __cpuinit int +static int  cpu_initialize_context(unsigned int cpu, struct task_struct *idle)  {  	struct vcpu_guest_context *ctxt; @@ -258,50 +369,62 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)  	gdt = get_cpu_gdt_table(cpu); -	ctxt->flags = VGCF_IN_KERNEL; -	ctxt->user_regs.ds = __USER_DS; -	ctxt->user_regs.es = __USER_DS; -	ctxt->user_regs.ss = __KERNEL_DS;  #ifdef CONFIG_X86_32 +	/* Note: PVH is not yet supported on x86_32. */  	ctxt->user_regs.fs = __KERNEL_PERCPU;  	ctxt->user_regs.gs = __KERNEL_STACK_CANARY; -#else -	ctxt->gs_base_kernel = per_cpu_offset(cpu);  #endif  	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; -	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */  	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); -	xen_copy_trap_info(ctxt->trap_ctxt); +	if (!xen_feature(XENFEAT_auto_translated_physmap)) { +		ctxt->flags = VGCF_IN_KERNEL; +		ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ +		ctxt->user_regs.ds = __USER_DS; +		ctxt->user_regs.es = __USER_DS; +		ctxt->user_regs.ss = __KERNEL_DS; -	ctxt->ldt_ents = 0; +		xen_copy_trap_info(ctxt->trap_ctxt); -	BUG_ON((unsigned long)gdt & ~PAGE_MASK); +		ctxt->ldt_ents = 0; -	gdt_mfn = arbitrary_virt_to_mfn(gdt); -	make_lowmem_page_readonly(gdt); -	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); +		BUG_ON((unsigned long)gdt & ~PAGE_MASK); -	ctxt->gdt_frames[0] = gdt_mfn; -	ctxt->gdt_ents      = GDT_ENTRIES; +		gdt_mfn = arbitrary_virt_to_mfn(gdt); +		make_lowmem_page_readonly(gdt); +		make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); -	ctxt->user_regs.cs = __KERNEL_CS; -	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); +		ctxt->gdt_frames[0] = gdt_mfn; +		ctxt->gdt_ents      = GDT_ENTRIES; -	ctxt->kernel_ss = __KERNEL_DS; -	ctxt->kernel_sp = idle->thread.sp0; +		ctxt->kernel_ss = __KERNEL_DS; +		ctxt->kernel_sp = idle->thread.sp0;  #ifdef CONFIG_X86_32 -	ctxt->event_callback_cs     = __KERNEL_CS; -	ctxt->failsafe_callback_cs  = __KERNEL_CS; +		ctxt->event_callback_cs     = __KERNEL_CS; +		ctxt->failsafe_callback_cs  = __KERNEL_CS; +#else +		ctxt->gs_base_kernel = per_cpu_offset(cpu);  #endif -	ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback; -	ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; - -	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); +		ctxt->event_callback_eip    = +					(unsigned 
long)xen_hypervisor_callback; +		ctxt->failsafe_callback_eip = +					(unsigned long)xen_failsafe_callback; +		ctxt->user_regs.cs = __KERNEL_CS; +		per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); +#ifdef CONFIG_X86_32 +	} +#else +	} else +		/* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with +		 * %rdi having the cpu number - which means are passing in +		 * as the first parameter the cpu. Subtle! +		 */ +		ctxt->user_regs.rdi = cpu; +#endif +	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);  	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); -  	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))  		BUG(); @@ -309,9 +432,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)  	return 0;  } -static int __cpuinit xen_cpu_up(unsigned int cpu) +static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)  { -	struct task_struct *idle = idle_task(cpu);  	int rc;  	per_cpu(current_task, cpu) = idle; @@ -319,10 +441,11 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)  	irq_ctx_init(cpu);  #else  	clear_tsk_thread_flag(idle, TIF_FORK); +#endif  	per_cpu(kernel_stack, cpu) =  		(unsigned long)task_stack_page(idle) -  		KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif +  	xen_setup_runstate_info(cpu);  	xen_setup_timer(cpu);  	xen_init_lock_cpu(cpu); @@ -337,7 +460,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)  		return rc;  	if (num_online_cpus() == 1) -		alternatives_smp_switch(1); +		/* Just in case we booted with a single CPU. */ +		alternatives_enable_smp();  	rc = xen_smp_intr_init(cpu);  	if (rc) @@ -373,26 +497,27 @@ static int xen_cpu_disable(void)  static void xen_cpu_die(unsigned int cpu)  { -	while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { +	while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {  		current->state = TASK_UNINTERRUPTIBLE;  		schedule_timeout(HZ/10);  	} -	unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); -	unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); -	unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); -	unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); +	xen_smp_intr_free(cpu);  	xen_uninit_lock_cpu(cpu);  	xen_teardown_timer(cpu); - -	if (num_online_cpus() == 1) -		alternatives_smp_switch(0);  } -static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ +static void xen_play_dead(void) /* used only with HOTPLUG_CPU */  {  	play_dead_common();  	HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);  	cpu_bringup(); +	/* +	 * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) +	 * clears certain data that the cpu_idle loop (which called us +	 * and that we return from) expects. The only way to get that +	 * data back is to call: +	 */ +	tick_nohz_idle_enter();  }  #else /* !CONFIG_HOTPLUG_CPU */ @@ -436,8 +561,8 @@ static void xen_smp_send_reschedule(int cpu)  	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);  } -static void xen_send_IPI_mask(const struct cpumask *mask, -			      enum ipi_vector vector) +static void __xen_send_IPI_mask(const struct cpumask *mask, +			      int vector)  {  	unsigned cpu; @@ -449,7 +574,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)  {  	int cpu; -	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); +	__xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);  	/* Make sure other vcpus get a chance to run if they need to. 
*/  	for_each_cpu(cpu, mask) { @@ -462,10 +587,90 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)  static void xen_smp_send_call_function_single_ipi(int cpu)  { -	xen_send_IPI_mask(cpumask_of(cpu), +	__xen_send_IPI_mask(cpumask_of(cpu),  			  XEN_CALL_FUNCTION_SINGLE_VECTOR);  } +static inline int xen_map_vector(int vector) +{ +	int xen_vector; + +	switch (vector) { +	case RESCHEDULE_VECTOR: +		xen_vector = XEN_RESCHEDULE_VECTOR; +		break; +	case CALL_FUNCTION_VECTOR: +		xen_vector = XEN_CALL_FUNCTION_VECTOR; +		break; +	case CALL_FUNCTION_SINGLE_VECTOR: +		xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR; +		break; +	case IRQ_WORK_VECTOR: +		xen_vector = XEN_IRQ_WORK_VECTOR; +		break; +#ifdef CONFIG_X86_64 +	case NMI_VECTOR: +	case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */ +		xen_vector = XEN_NMI_VECTOR; +		break; +#endif +	default: +		xen_vector = -1; +		printk(KERN_ERR "xen: vector 0x%x is not implemented\n", +			vector); +	} + +	return xen_vector; +} + +void xen_send_IPI_mask(const struct cpumask *mask, +			      int vector) +{ +	int xen_vector = xen_map_vector(vector); + +	if (xen_vector >= 0) +		__xen_send_IPI_mask(mask, xen_vector); +} + +void xen_send_IPI_all(int vector) +{ +	int xen_vector = xen_map_vector(vector); + +	if (xen_vector >= 0) +		__xen_send_IPI_mask(cpu_online_mask, xen_vector); +} + +void xen_send_IPI_self(int vector) +{ +	int xen_vector = xen_map_vector(vector); + +	if (xen_vector >= 0) +		xen_send_IPI_one(smp_processor_id(), xen_vector); +} + +void xen_send_IPI_mask_allbutself(const struct cpumask *mask, +				int vector) +{ +	unsigned cpu; +	unsigned int this_cpu = smp_processor_id(); +	int xen_vector = xen_map_vector(vector); + +	if (!(num_online_cpus() > 1) || (xen_vector < 0)) +		return; + +	for_each_cpu_and(cpu, mask, cpu_online_mask) { +		if (this_cpu == cpu) +			continue; + +		xen_send_IPI_one(cpu, xen_vector); +	} +} + +void xen_send_IPI_allbutself(int vector) +{ +	xen_send_IPI_mask_allbutself(cpu_online_mask, vector); +} +  static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)  {  	irq_enter(); @@ -486,7 +691,17 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)  	return IRQ_HANDLED;  } -static const struct smp_ops xen_smp_ops __initdata = { +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) +{ +	irq_enter(); +	irq_work_run(); +	inc_irq_stat(apic_irq_work_irqs); +	irq_exit(); + +	return IRQ_HANDLED; +} + +static const struct smp_ops xen_smp_ops __initconst = {  	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,  	.smp_prepare_cpus = xen_smp_prepare_cpus,  	.smp_cpus_done = xen_smp_cpus_done, @@ -507,5 +722,55 @@ void __init xen_smp_init(void)  {  	smp_ops = xen_smp_ops;  	xen_fill_possible_map(); -	xen_init_spinlocks(); +} + +static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) +{ +	native_smp_prepare_cpus(max_cpus); +	WARN_ON(xen_smp_intr_init(0)); + +	xen_init_lock_cpu(0); +} + +static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) +{ +	int rc; +	/* +	 * xen_smp_intr_init() needs to run before native_cpu_up() +	 * so that IPI vectors are set up on the booting CPU before +	 * it is marked online in native_cpu_up(). +	*/ +	rc = xen_smp_intr_init(cpu); +	WARN_ON(rc); +	if (!rc) +		rc =  native_cpu_up(cpu, tidle); + +	/* +	 * We must initialize the slowpath CPU kicker _after_ the native +	 * path has executed. 
If we initialized it before none of the +	 * unlocker IPI kicks would reach the booting CPU as the booting +	 * CPU had not set itself 'online' in cpu_online_mask. That mask +	 * is checked when IPIs are sent (on HVM at least). +	 */ +	xen_init_lock_cpu(cpu); +	return rc; +} + +static void xen_hvm_cpu_die(unsigned int cpu) +{ +	xen_cpu_die(cpu); +	native_cpu_die(cpu); +} + +void __init xen_hvm_smp_init(void) +{ +	if (!xen_have_vector_callback) +		return; +	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; +	smp_ops.smp_send_reschedule = xen_smp_send_reschedule; +	smp_ops.cpu_up = xen_hvm_cpu_up; +	smp_ops.cpu_die = xen_hvm_cpu_die; +	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; +	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; +	smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;  } diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h new file mode 100644 index 00000000000..c7c2d89efd7 --- /dev/null +++ b/arch/x86/xen/smp.h @@ -0,0 +1,11 @@ +#ifndef _XEN_SMP_H + +extern void xen_send_IPI_mask(const struct cpumask *mask, +			      int vector); +extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, +				int vector); +extern void xen_send_IPI_allbutself(int vector); +extern void xen_send_IPI_all(int vector); +extern void xen_send_IPI_self(int vector); + +#endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23e061b9327..0ba5f3b967f 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -7,6 +7,7 @@  #include <linux/debugfs.h>  #include <linux/log2.h>  #include <linux/gfp.h> +#include <linux/slab.h>  #include <asm/paravirt.h> @@ -16,45 +17,44 @@  #include "xen-ops.h"  #include "debugfs.h" -#ifdef CONFIG_XEN_DEBUG_FS -static struct xen_spinlock_stats -{ -	u64 taken; -	u32 taken_slow; -	u32 taken_slow_nested; -	u32 taken_slow_pickup; -	u32 taken_slow_spurious; -	u32 taken_slow_irqenable; +enum xen_contention_stat { +	TAKEN_SLOW, +	TAKEN_SLOW_PICKUP, +	TAKEN_SLOW_SPURIOUS, +	RELEASED_SLOW, +	RELEASED_SLOW_KICKED, +	NR_CONTENTION_STATS +}; -	u64 released; -	u32 released_slow; -	u32 released_slow_kicked; +#ifdef CONFIG_XEN_DEBUG_FS  #define HISTO_BUCKETS	30 -	u32 histo_spin_total[HISTO_BUCKETS+1]; -	u32 histo_spin_spinning[HISTO_BUCKETS+1]; +static struct xen_spinlock_stats +{ +	u32 contention_stats[NR_CONTENTION_STATS];  	u32 histo_spin_blocked[HISTO_BUCKETS+1]; - -	u64 time_total; -	u64 time_spinning;  	u64 time_blocked;  } spinlock_stats;  static u8 zero_stats; -static unsigned lock_timeout = 1 << 10; -#define TIMEOUT lock_timeout -  static inline void check_zero(void)  { -	if (unlikely(zero_stats)) { -		memset(&spinlock_stats, 0, sizeof(spinlock_stats)); -		zero_stats = 0; +	u8 ret; +	u8 old = ACCESS_ONCE(zero_stats); +	if (unlikely(old)) { +		ret = cmpxchg(&zero_stats, old, 0); +		/* This ensures only one fellow resets the stat */ +		if (ret == old) +			memset(&spinlock_stats, 0, sizeof(spinlock_stats));  	}  } -#define ADD_STATS(elem, val)			\ -	do { check_zero(); spinlock_stats.elem += (val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ +	check_zero(); +	spinlock_stats.contention_stats[var] += val; +}  static inline u64 spin_time_start(void)  { @@ -73,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array)  		array[HISTO_BUCKETS]++;  } -static inline void spin_time_accum_spinning(u64 start) -{ -	u32 delta = xen_clocksource_read() - start; - -	__spin_time_accum(delta, spinlock_stats.histo_spin_spinning); -	spinlock_stats.time_spinning += delta; 
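
Aside (userspace model, not part of the patch): check_zero() above uses cmpxchg() so that when several CPUs see the debugfs zero_stats knob set at the same time, exactly one of them performs the memset(). A compilable sketch of the pattern, with a GCC/Clang builtin standing in for the kernel's cmpxchg():

#include <stdio.h>
#include <string.h>

static unsigned char zero_stats;
static unsigned long stats[4];

static void check_zero(void)
{
	unsigned char old = zero_stats;

	if (old) {
		/* Only the caller that wins the compare-and-swap resets. */
		if (__sync_val_compare_and_swap(&zero_stats, old, 0) == old)
			memset(stats, 0, sizeof(stats));
	}
}

int main(void)
{
	stats[0] = 42;
	zero_stats = 1;		/* as if written through the debugfs knob */
	check_zero();
	printf("stats[0]=%lu zero_stats=%u\n", stats[0], zero_stats);
	return 0;
}
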
-} - -static inline void spin_time_accum_total(u64 start) -{ -	u32 delta = xen_clocksource_read() - start; - -	__spin_time_accum(delta, spinlock_stats.histo_spin_total); -	spinlock_stats.time_total += delta; -} -  static inline void spin_time_accum_blocked(u64 start)  {  	u32 delta = xen_clocksource_read() - start; @@ -97,270 +81,167 @@ static inline void spin_time_accum_blocked(u64 start)  	spinlock_stats.time_blocked += delta;  }  #else  /* !CONFIG_XEN_DEBUG_FS */ -#define TIMEOUT			(1 << 10) -#define ADD_STATS(elem, val)	do { (void)(val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ +}  static inline u64 spin_time_start(void)  {  	return 0;  } -static inline void spin_time_accum_total(u64 start) -{ -} -static inline void spin_time_accum_spinning(u64 start) -{ -}  static inline void spin_time_accum_blocked(u64 start)  {  }  #endif  /* CONFIG_XEN_DEBUG_FS */ -struct xen_spinlock { -	unsigned char lock;		/* 0 -> free; 1 -> locked */ -	unsigned short spinners;	/* count of waiting cpus */ +struct xen_lock_waiting { +	struct arch_spinlock *lock; +	__ticket_t want;  }; -static int xen_spin_is_locked(struct arch_spinlock *lock) -{ -	struct xen_spinlock *xl = (struct xen_spinlock *)lock; - -	return xl->lock != 0; -} - -static int xen_spin_is_contended(struct arch_spinlock *lock) -{ -	struct xen_spinlock *xl = (struct xen_spinlock *)lock; - -	/* Not strictly true; this is only the count of contended -	   lock-takers entering the slow path. */ -	return xl->spinners != 0; -} - -static int xen_spin_trylock(struct arch_spinlock *lock) -{ -	struct xen_spinlock *xl = (struct xen_spinlock *)lock; -	u8 old = 1; - -	asm("xchgb %b0,%1" -	    : "+q" (old), "+m" (xl->lock) : : "memory"); - -	return old == 0; -} -  static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); +static DEFINE_PER_CPU(char *, irq_name); +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); +static cpumask_t waiting_cpus; -/* - * Mark a cpu as interested in a lock.  Returns the CPU's previous - * lock of interest, in case we got preempted by an interrupt. - */ -static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) +static bool xen_pvspin = true; +__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)  { -	struct xen_spinlock *prev; - -	prev = __get_cpu_var(lock_spinners); -	__get_cpu_var(lock_spinners) = xl; - -	wmb();			/* set lock of interest before count */ - -	asm(LOCK_PREFIX " incw %0" -	    : "+m" (xl->spinners) : : "memory"); - -	return prev; -} - -/* - * Mark a cpu as no longer interested in a lock.  Restores previous - * lock of interest (NULL for none). 
- */ -static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) -{ -	asm(LOCK_PREFIX " decw %0" -	    : "+m" (xl->spinners) : : "memory"); -	wmb();			/* decrement count before restoring lock */ -	__get_cpu_var(lock_spinners) = prev; -} - -static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) -{ -	struct xen_spinlock *xl = (struct xen_spinlock *)lock; -	struct xen_spinlock *prev; -	int irq = __get_cpu_var(lock_kicker_irq); -	int ret; +	int irq = __this_cpu_read(lock_kicker_irq); +	struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); +	int cpu = smp_processor_id();  	u64 start; +	unsigned long flags;  	/* If kicker interrupts not initialized yet, just spin */  	if (irq == -1) -		return 0; +		return;  	start = spin_time_start(); -	/* announce we're spinning */ -	prev = spinning_lock(xl); - -	ADD_STATS(taken_slow, 1); -	ADD_STATS(taken_slow_nested, prev != NULL); +	/* +	 * Make sure an interrupt handler can't upset things in a +	 * partially setup state. +	 */ +	local_irq_save(flags); +	/* +	 * We don't really care if we're overwriting some other +	 * (lock,want) pair, as that would mean that we're currently +	 * in an interrupt context, and the outer context had +	 * interrupts enabled.  That has already kicked the VCPU out +	 * of xen_poll_irq(), so it will just return spuriously and +	 * retry with newly setup (lock,want). +	 * +	 * The ordering protocol on this is that the "lock" pointer +	 * may only be set non-NULL if the "want" ticket is correct. +	 * If we're updating "want", we must first clear "lock". +	 */ +	w->lock = NULL; +	smp_wmb(); +	w->want = want; +	smp_wmb(); +	w->lock = lock; -	do { -		unsigned long flags; +	/* This uses set_bit, which atomic and therefore a barrier */ +	cpumask_set_cpu(cpu, &waiting_cpus); +	add_stats(TAKEN_SLOW, 1); -		/* clear pending */ -		xen_clear_irq_pending(irq); +	/* clear pending */ +	xen_clear_irq_pending(irq); -		/* check again make sure it didn't become free while -		   we weren't looking  */ -		ret = xen_spin_trylock(lock); -		if (ret) { -			ADD_STATS(taken_slow_pickup, 1); +	/* Only check lock once pending cleared */ +	barrier(); -			/* -			 * If we interrupted another spinlock while it -			 * was blocking, make sure it doesn't block -			 * without rechecking the lock. -			 */ -			if (prev != NULL) -				xen_set_irq_pending(irq); -			goto out; -		} +	/* +	 * Mark entry to slowpath before doing the pickup test to make +	 * sure we don't deadlock with an unlocker. +	 */ +	__ticket_enter_slowpath(lock); -		flags = arch_local_save_flags(); -		if (irq_enable) { -			ADD_STATS(taken_slow_irqenable, 1); -			raw_local_irq_enable(); -		} +	/* +	 * check again make sure it didn't become free while +	 * we weren't looking +	 */ +	if (ACCESS_ONCE(lock->tickets.head) == want) { +		add_stats(TAKEN_SLOW_PICKUP, 1); +		goto out; +	} -		/* -		 * Block until irq becomes pending.  If we're -		 * interrupted at this point (after the trylock but -		 * before entering the block), then the nested lock -		 * handler guarantees that the irq will be left -		 * pending if there's any chance the lock became free; -		 * xen_poll_irq() returns immediately if the irq is -		 * pending. -		 */ -		xen_poll_irq(irq); +	/* Allow interrupts while blocked */ +	local_irq_restore(flags); -		raw_local_irq_restore(flags); +	/* +	 * If an interrupt happens here, it will leave the wakeup irq +	 * pending, which will cause xen_poll_irq() to return +	 * immediately. 
+	 */ -		ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); -	} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ +	/* Block until irq becomes pending (or perhaps a spurious wakeup) */ +	xen_poll_irq(irq); +	add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); -	kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); +	local_irq_save(flags); +	kstat_incr_irq_this_cpu(irq);  out: -	unspinning_lock(xl, prev); -	spin_time_accum_blocked(start); - -	return ret; -} - -static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable) -{ -	struct xen_spinlock *xl = (struct xen_spinlock *)lock; -	unsigned timeout; -	u8 oldval; -	u64 start_spin; - -	ADD_STATS(taken, 1); - -	start_spin = spin_time_start(); - -	do { -		u64 start_spin_fast = spin_time_start(); +	cpumask_clear_cpu(cpu, &waiting_cpus); +	w->lock = NULL; -		timeout = TIMEOUT; +	local_irq_restore(flags); -		asm("1: xchgb %1,%0\n" -		    "   testb %1,%1\n" -		    "   jz 3f\n" -		    "2: rep;nop\n" -		    "   cmpb $0,%0\n" -		    "   je 1b\n" -		    "   dec %2\n" -		    "   jnz 2b\n" -		    "3:\n" -		    : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) -		    : "1" (1) -		    : "memory"); - -		spin_time_accum_spinning(start_spin_fast); - -	} while (unlikely(oldval != 0 && -			  (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - -	spin_time_accum_total(start_spin); -} - -static void xen_spin_lock(struct arch_spinlock *lock) -{ -	__xen_spin_lock(lock, false); -} - -static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) -{ -	__xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); +	spin_time_accum_blocked(start);  } +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)  {  	int cpu; -	ADD_STATS(released_slow, 1); +	add_stats(RELEASED_SLOW, 1); -	for_each_online_cpu(cpu) { -		/* XXX should mix up next cpu selection */ -		if (per_cpu(lock_spinners, cpu) == xl) { -			ADD_STATS(released_slow_kicked, 1); +	for_each_cpu(cpu, &waiting_cpus) { +		const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); + +		/* Make sure we read lock before want */ +		if (ACCESS_ONCE(w->lock) == lock && +		    ACCESS_ONCE(w->want) == next) { +			add_stats(RELEASED_SLOW_KICKED, 1);  			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);  			break;  		}  	}  } -static void xen_spin_unlock(struct arch_spinlock *lock) -{ -	struct xen_spinlock *xl = (struct xen_spinlock *)lock; - -	ADD_STATS(released, 1); - -	smp_wmb();		/* make sure no writes get moved after unlock */ -	xl->lock = 0;		/* release lock */ - -	/* -	 * Make sure unlock happens before checking for waiting -	 * spinners.  We need a strong barrier to enforce the -	 * write-read ordering to different memory locations, as the -	 * CPU makes no implied guarantees about their ordering. 
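
Aside (userspace model, not part of the patch): the replacement slowpath parks each waiter on the ticket it wants, and xen_unlock_kick() above wakes the CPU whose recorded ticket matches the value that just became current. A minimal model of the head/want test that decides whether a waiter may proceed:

#include <stdio.h>
#include <stdint.h>

struct ticket_lock {
	uint16_t head;	/* ticket currently being served */
	uint16_t tail;	/* next ticket to hand out */
};

static int lock_is_mine(const struct ticket_lock *lock, uint16_t want)
{
	return lock->head == want;
}

int main(void)
{
	struct ticket_lock lock = { .head = 3, .tail = 6 };
	uint16_t want = 5;	/* the ticket this "CPU" drew */

	/* Holders 3 and 4 must release first, so a Xen guest parks in
	 * xen_poll_irq() instead of burning cycles spinning. */
	if (lock_is_mine(&lock, want))
		printf("ticket %u holds the lock\n", (unsigned)want);
	else
		printf("ticket %u must wait, head is %u\n",
		       (unsigned)want, (unsigned)lock.head);
	return 0;
}
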
-	 */ -	mb(); - -	if (unlikely(xl->spinners)) -		xen_spin_unlock_slow(xl); -} -  static irqreturn_t dummy_handler(int irq, void *dev_id)  {  	BUG();  	return IRQ_HANDLED;  } -void __cpuinit xen_init_lock_cpu(int cpu) +void xen_init_lock_cpu(int cpu)  {  	int irq; -	const char *name; +	char *name; + +	if (!xen_pvspin) +		return; + +	WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", +	     cpu, per_cpu(lock_kicker_irq, cpu));  	name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);  	irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,  				     cpu,  				     dummy_handler, -				     IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, +				     IRQF_PERCPU|IRQF_NOBALANCING,  				     name,  				     NULL);  	if (irq >= 0) {  		disable_irq(irq); /* make sure it's never delivered */  		per_cpu(lock_kicker_irq, cpu) = irq; +		per_cpu(irq_name, cpu) = name;  	}  	printk("cpu %d spinlock event irq %d\n", cpu, irq); @@ -368,19 +249,62 @@ void __cpuinit xen_init_lock_cpu(int cpu)  void xen_uninit_lock_cpu(int cpu)  { +	if (!xen_pvspin) +		return; +  	unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); +	per_cpu(lock_kicker_irq, cpu) = -1; +	kfree(per_cpu(irq_name, cpu)); +	per_cpu(irq_name, cpu) = NULL;  } + +/* + * Our init of PV spinlocks is split in two init functions due to us + * using paravirt patching and jump labels patching and having to do + * all of this before SMP code is invoked. + * + * The paravirt patching needs to be done _before_ the alternative asm code + * is started, otherwise we would not patch the core kernel code. + */  void __init xen_init_spinlocks(void)  { -	pv_lock_ops.spin_is_locked = xen_spin_is_locked; -	pv_lock_ops.spin_is_contended = xen_spin_is_contended; -	pv_lock_ops.spin_lock = xen_spin_lock; -	pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; -	pv_lock_ops.spin_trylock = xen_spin_trylock; -	pv_lock_ops.spin_unlock = xen_spin_unlock; + +	if (!xen_pvspin) { +		printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); +		return; +	} +	printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); +	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); +	pv_lock_ops.unlock_kick = xen_unlock_kick;  } +/* + * While the jump_label init code needs to happend _after_ the jump labels are + * enabled and before SMP is started. Hence we use pre-SMP initcall level + * init. We cannot do it in xen_init_spinlocks as that is done before + * jump labels are activated. 
+ */ +static __init int xen_init_spinlocks_jump(void) +{ +	if (!xen_pvspin) +		return 0; + +	if (!xen_domain()) +		return 0; + +	static_key_slow_inc(¶virt_ticketlocks_enabled); +	return 0; +} +early_initcall(xen_init_spinlocks_jump); + +static __init int xen_parse_nopvspin(char *arg) +{ +	xen_pvspin = false; +	return 0; +} +early_param("xen_nopvspin", xen_parse_nopvspin); +  #ifdef CONFIG_XEN_DEBUG_FS  static struct dentry *d_spin_debug; @@ -392,43 +316,30 @@ static int __init xen_spinlock_debugfs(void)  	if (d_xen == NULL)  		return -ENOMEM; +	if (!xen_pvspin) +		return 0; +  	d_spin_debug = debugfs_create_dir("spinlocks", d_xen);  	debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); -	debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); - -	debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);  	debugfs_create_u32("taken_slow", 0444, d_spin_debug, -			   &spinlock_stats.taken_slow); -	debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, -			   &spinlock_stats.taken_slow_nested); +			   &spinlock_stats.contention_stats[TAKEN_SLOW]);  	debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, -			   &spinlock_stats.taken_slow_pickup); +			   &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);  	debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, -			   &spinlock_stats.taken_slow_spurious); -	debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, -			   &spinlock_stats.taken_slow_irqenable); +			   &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); -	debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);  	debugfs_create_u32("released_slow", 0444, d_spin_debug, -			   &spinlock_stats.released_slow); +			   &spinlock_stats.contention_stats[RELEASED_SLOW]);  	debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, -			   &spinlock_stats.released_slow_kicked); +			   &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); -	debugfs_create_u64("time_spinning", 0444, d_spin_debug, -			   &spinlock_stats.time_spinning);  	debugfs_create_u64("time_blocked", 0444, d_spin_debug,  			   &spinlock_stats.time_blocked); -	debugfs_create_u64("time_total", 0444, d_spin_debug, -			   &spinlock_stats.time_total); - -	xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, -				     spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); -	xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, -				     spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); -	xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, -				     spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + +	debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, +				spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);  	return 0;  } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d789d56877..c4df9dbd63b 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -12,8 +12,10 @@  #include "xen-ops.h"  #include "mmu.h" -void xen_pre_suspend(void) +static void xen_pv_pre_suspend(void)  { +	xen_mm_pin_all(); +  	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);  	xen_start_info->console.domU.mfn =  		mfn_to_pfn(xen_start_info->console.domU.mfn); @@ -26,19 +28,22 @@ void xen_pre_suspend(void)  		BUG();  } -void xen_hvm_post_suspend(int suspend_cancelled) +static void xen_hvm_post_suspend(int suspend_cancelled)  { +#ifdef CONFIG_XEN_PVHVM  	int cpu;  	xen_hvm_init_shared_info();  	xen_callback_vector(); +	xen_unplug_emulated_devices();  	if 
(xen_feature(XENFEAT_hvm_safe_pvclock)) {  		for_each_online_cpu(cpu) {  			xen_setup_runstate_info(cpu);  		}  	} +#endif  } -void xen_post_suspend(int suspend_cancelled) +static void xen_pv_post_suspend(int suspend_cancelled)  {  	xen_build_mfn_list_list(); @@ -57,6 +62,21 @@ void xen_post_suspend(int suspend_cancelled)  		xen_vcpu_restore();  	} +	xen_mm_unpin_all(); +} + +void xen_arch_pre_suspend(void) +{ +    if (xen_pv_domain()) +        xen_pv_pre_suspend(); +} + +void xen_arch_post_suspend(int cancelled) +{ +    if (xen_pv_domain()) +        xen_pv_post_suspend(cancelled); +    else +        xen_hvm_post_suspend(cancelled);  }  static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b2bb5aa3b05..7b78f88c170 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,8 @@  #include <linux/kernel_stat.h>  #include <linux/math64.h>  #include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/pvclock_gtod.h>  #include <asm/pvclock.h>  #include <asm/xen/hypervisor.h> @@ -26,8 +28,6 @@  #include "xen-ops.h" -#define XEN_SHIFT 22 -  /* Xen may fire a timer up to this many ns early */  #define TIMER_SLOP	100000  #define NS_PER_TICK	(1000000000LL / HZ) @@ -38,9 +38,8 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);  /* snapshots of runstate info */  static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); -/* unused ns of stolen and blocked time */ +/* unused ns of stolen time */  static DEFINE_PER_CPU(u64, xen_residual_stolen); -static DEFINE_PER_CPU(u64, xen_residual_blocked);  /* return an consistent snapshot of 64-bit time/counter value */  static u64 get64(const u64 *p) @@ -117,7 +116,7 @@ static void do_stolen_accounting(void)  {  	struct vcpu_runstate_info state;  	struct vcpu_runstate_info *snap; -	s64 blocked, runnable, offline, stolen; +	s64 runnable, offline, stolen;  	cputime_t ticks;  	get_runstate_snapshot(&state); @@ -127,7 +126,6 @@ static void do_stolen_accounting(void)  	snap = &__get_cpu_var(xen_runstate_snapshot);  	/* work out how much time the VCPU has not been runn*ing*  */ -	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];  	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];  	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; @@ -135,25 +133,14 @@ static void do_stolen_accounting(void)  	/* Add the appropriate number of ticks of stolen time,  	   including any left-overs from last time. */ -	stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); +	stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);  	if (stolen < 0)  		stolen = 0;  	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); -	__get_cpu_var(xen_residual_stolen) = stolen; +	__this_cpu_write(xen_residual_stolen, stolen);  	account_steal_ticks(ticks); - -	/* Add the appropriate number of ticks of blocked time, -	   including any left-overs from last time. 
*/ -	blocked += __get_cpu_var(xen_residual_blocked); - -	if (blocked < 0) -		blocked = 0; - -	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); -	__get_cpu_var(xen_residual_blocked) = blocked; -	account_idle_ticks(ticks);  }  /* Get the TSC speed from Xen */ @@ -170,9 +157,10 @@ cycle_t xen_clocksource_read(void)          struct pvclock_vcpu_time_info *src;  	cycle_t ret; -	src = &get_cpu_var(xen_vcpu)->time; +	preempt_disable_notrace(); +	src = &__get_cpu_var(xen_vcpu)->time;  	ret = pvclock_clocksource_read(src); -	put_cpu_var(xen_vcpu); +	preempt_enable_notrace();  	return ret;  } @@ -192,27 +180,61 @@ static void xen_read_wallclock(struct timespec *ts)  	put_cpu_var(xen_vcpu);  } -static unsigned long xen_get_wallclock(void) +static void xen_get_wallclock(struct timespec *now)  { -	struct timespec ts; - -	xen_read_wallclock(&ts); -	return ts.tv_sec; +	xen_read_wallclock(now);  } -static int xen_set_wallclock(unsigned long now) +static int xen_set_wallclock(const struct timespec *now)  { -	/* do nothing for domU */  	return -1;  } +static int xen_pvclock_gtod_notify(struct notifier_block *nb, +				   unsigned long was_set, void *priv) +{ +	/* Protected by the calling core code serialization */ +	static struct timespec next_sync; + +	struct xen_platform_op op; +	struct timespec now; + +	now = __current_kernel_time(); + +	/* +	 * We only take the expensive HV call when the clock was set +	 * or when the 11 minutes RTC synchronization time elapsed. +	 */ +	if (!was_set && timespec_compare(&now, &next_sync) < 0) +		return NOTIFY_OK; + +	op.cmd = XENPF_settime; +	op.u.settime.secs = now.tv_sec; +	op.u.settime.nsecs = now.tv_nsec; +	op.u.settime.system_time = xen_clocksource_read(); + +	(void)HYPERVISOR_dom0_op(&op); + +	/* +	 * Move the next drift compensation time 11 minutes +	 * ahead. That's emulating the sync_cmos_clock() update for +	 * the hardware RTC. 
+	 */ +	next_sync = now; +	next_sync.tv_sec += 11 * 60; + +	return NOTIFY_OK; +} + +static struct notifier_block xen_pvclock_gtod_notifier = { +	.notifier_call = xen_pvclock_gtod_notify, +}; +  static struct clocksource xen_clocksource __read_mostly = {  	.name = "xen",  	.rating = 400,  	.read = xen_clocksource_get_cycles,  	.mask = ~0, -	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */ -	.shift = XEN_SHIFT,  	.flags = CLOCK_SOURCE_IS_CONTINUOUS,  }; @@ -366,11 +388,16 @@ static const struct clock_event_device xen_vcpuop_clockevent = {  static const struct clock_event_device *xen_clockevent =  	&xen_timerop_clockevent; -static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); + +struct xen_clock_event_device { +	struct clock_event_device evt; +	char *name; +}; +static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };  static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)  { -	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); +	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt;  	irqreturn_t ret;  	ret = IRQ_NONE; @@ -384,12 +411,31 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)  	return ret;  } +void xen_teardown_timer(int cpu) +{ +	struct clock_event_device *evt; +	BUG_ON(cpu == 0); +	evt = &per_cpu(xen_clock_events, cpu).evt; + +	if (evt->irq >= 0) { +		unbind_from_irqhandler(evt->irq, NULL); +		evt->irq = -1; +		kfree(per_cpu(xen_clock_events, cpu).name); +		per_cpu(xen_clock_events, cpu).name = NULL; +	} +} +  void xen_setup_timer(int cpu)  { -	const char *name; +	char *name;  	struct clock_event_device *evt;  	int irq; +	evt = &per_cpu(xen_clock_events, cpu).evt; +	WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu); +	if (evt->irq >= 0) +		xen_teardown_timer(cpu); +  	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);  	name = kasprintf(GFP_KERNEL, "timer%d", cpu); @@ -397,35 +443,32 @@ void xen_setup_timer(int cpu)  		name = "<timer kasprintf failed>";  	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, -				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, +				      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| +				      IRQF_FORCE_RESUME,  				      name, NULL); +	(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); -	evt = &per_cpu(xen_clock_events, cpu);  	memcpy(evt, xen_clockevent, sizeof(*evt));  	evt->cpumask = cpumask_of(cpu);  	evt->irq = irq; +	per_cpu(xen_clock_events, cpu).name = name;  } -void xen_teardown_timer(int cpu) -{ -	struct clock_event_device *evt; -	BUG_ON(cpu == 0); -	evt = &per_cpu(xen_clock_events, cpu); -	unbind_from_irqhandler(evt->irq, NULL); -}  void xen_setup_cpu_clockevents(void)  {  	BUG_ON(preemptible()); -	clockevents_register_device(&__get_cpu_var(xen_clock_events)); +	clockevents_register_device(&__get_cpu_var(xen_clock_events).evt);  }  void xen_timer_resume(void)  {  	int cpu; +	pvclock_resume(); +  	if (xen_clockevent != &xen_vcpuop_clockevent)  		return; @@ -435,16 +478,16 @@ void xen_timer_resume(void)  	}  } -static const struct pv_time_ops xen_time_ops __initdata = { +static const struct pv_time_ops xen_time_ops __initconst = {  	.sched_clock = xen_clocksource_read,  }; -static __init void xen_time_init(void) +static void __init xen_time_init(void)  {  	int cpu = smp_processor_id();  	struct timespec tp; -	clocksource_register(&xen_clocksource); +	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);  	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, 
NULL) == 0) {  		/* Successfully turned off 100Hz tick, so we have the @@ -462,9 +505,12 @@ static __init void xen_time_init(void)  	xen_setup_runstate_info(cpu);  	xen_setup_timer(cpu);  	xen_setup_cpu_clockevents(); + +	if (xen_initial_domain()) +		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);  } -__init void xen_init_time_ops(void) +void __init xen_init_time_ops(void)  {  	pv_time_ops = xen_time_ops; @@ -474,7 +520,9 @@ __init void xen_init_time_ops(void)  	x86_platform.calibrate_tsc = xen_tsc_khz;  	x86_platform.get_wallclock = xen_get_wallclock; -	x86_platform.set_wallclock = xen_set_wallclock; +	/* Dom0 uses the native method to set the hardware RTC. */ +	if (!xen_initial_domain()) +		x86_platform.set_wallclock = xen_set_wallclock;  }  #ifdef CONFIG_XEN_PVHVM @@ -482,11 +530,15 @@ static void xen_hvm_setup_cpu_clockevents(void)  {  	int cpu = smp_processor_id();  	xen_setup_runstate_info(cpu); -	xen_setup_timer(cpu); +	/* +	 * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence +	 * doing it xen_hvm_cpu_notify (which gets called by smp_init during +	 * early bootup and also during CPU hotplug events). +	 */  	xen_setup_cpu_clockevents();  } -__init void xen_hvm_init_time_ops(void) +void __init xen_hvm_init_time_ops(void)  {  	/* vector callback is needed otherwise we cannot receive interrupts  	 * on cpu > 0 and at this point we don't know how many cpus are diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c new file mode 100644 index 00000000000..520022d1a18 --- /dev/null +++ b/arch/x86/xen/trace.c @@ -0,0 +1,62 @@ +#include <linux/ftrace.h> +#include <xen/interface/xen.h> + +#define N(x)	[__HYPERVISOR_##x] = "("#x")" +static const char *xen_hypercall_names[] = { +	N(set_trap_table), +	N(mmu_update), +	N(set_gdt), +	N(stack_switch), +	N(set_callbacks), +	N(fpu_taskswitch), +	N(sched_op_compat), +	N(dom0_op), +	N(set_debugreg), +	N(get_debugreg), +	N(update_descriptor), +	N(memory_op), +	N(multicall), +	N(update_va_mapping), +	N(set_timer_op), +	N(event_channel_op_compat), +	N(xen_version), +	N(console_io), +	N(physdev_op_compat), +	N(grant_table_op), +	N(vm_assist), +	N(update_va_mapping_otherdomain), +	N(iret), +	N(vcpu_op), +	N(set_segment_base), +	N(mmuext_op), +	N(acm_op), +	N(nmi_op), +	N(sched_op), +	N(callback_op), +	N(xenoprof_op), +	N(event_channel_op), +	N(physdev_op), +	N(hvm_op), + +/* Architecture-specific hypercall definitions. */ +	N(arch_0), +	N(arch_1), +	N(arch_2), +	N(arch_3), +	N(arch_4), +	N(arch_5), +	N(arch_6), +	N(arch_7), +}; +#undef N + +static const char *xen_hypercall_name(unsigned op) +{ +	if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL) +		return xen_hypercall_names[op]; + +	return ""; +} + +#define CREATE_TRACE_POINTS +#include <trace/events/xen.h> diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c new file mode 100644 index 00000000000..6722e3733f0 --- /dev/null +++ b/arch/x86/xen/vga.c @@ -0,0 +1,74 @@ +#include <linux/screen_info.h> +#include <linux/init.h> + +#include <asm/bootparam.h> +#include <asm/setup.h> + +#include <xen/interface/xen.h> + +#include "xen-ops.h" + +void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) +{ +	struct screen_info *screen_info = &boot_params.screen_info; + +	/* This is drawn from a dump from vgacon:startup in +	 * standard Linux. 
*/ +	screen_info->orig_video_mode = 3; +	screen_info->orig_video_isVGA = 1; +	screen_info->orig_video_lines = 25; +	screen_info->orig_video_cols = 80; +	screen_info->orig_video_ega_bx = 3; +	screen_info->orig_video_points = 16; +	screen_info->orig_y = screen_info->orig_video_lines - 1; + +	switch (info->video_type) { +	case XEN_VGATYPE_TEXT_MODE_3: +		if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3) +		    + sizeof(info->u.text_mode_3)) +			break; +		screen_info->orig_video_lines = info->u.text_mode_3.rows; +		screen_info->orig_video_cols = info->u.text_mode_3.columns; +		screen_info->orig_x = info->u.text_mode_3.cursor_x; +		screen_info->orig_y = info->u.text_mode_3.cursor_y; +		screen_info->orig_video_points = +			info->u.text_mode_3.font_height; +		break; + +	case XEN_VGATYPE_EFI_LFB: +	case XEN_VGATYPE_VESA_LFB: +		if (size < offsetof(struct dom0_vga_console_info, +				    u.vesa_lfb.gbl_caps)) +			break; +		screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB; +		screen_info->lfb_width = info->u.vesa_lfb.width; +		screen_info->lfb_height = info->u.vesa_lfb.height; +		screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel; +		screen_info->lfb_base = info->u.vesa_lfb.lfb_base; +		screen_info->lfb_size = info->u.vesa_lfb.lfb_size; +		screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line; +		screen_info->red_size = info->u.vesa_lfb.red_size; +		screen_info->red_pos = info->u.vesa_lfb.red_pos; +		screen_info->green_size = info->u.vesa_lfb.green_size; +		screen_info->green_pos = info->u.vesa_lfb.green_pos; +		screen_info->blue_size = info->u.vesa_lfb.blue_size; +		screen_info->blue_pos = info->u.vesa_lfb.blue_pos; +		screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; +		screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; + +		if (info->video_type == XEN_VGATYPE_EFI_LFB) { +			screen_info->orig_video_isVGA = VIDEO_TYPE_EFI; +			break; +		} + +		if (size >= offsetof(struct dom0_vga_console_info, +				     u.vesa_lfb.gbl_caps) +		    + sizeof(info->u.vesa_lfb.gbl_caps)) +			screen_info->capabilities = info->u.vesa_lfb.gbl_caps; +		if (size >= offsetof(struct dom0_vga_console_info, +				     u.vesa_lfb.mode_attrs) +		    + sizeof(info->u.vesa_lfb.mode_attrs)) +			screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; +		break; +	} +} diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 79d7362ad6d..3e45aa00071 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -96,7 +96,7 @@ ENTRY(xen_restore_fl_direct)  	/* check for unmasked and pending */  	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending -	jz 1f +	jnz 1f  2:	call check_events  1:  ENDPATCH(xen_restore_fl_direct) diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 22a2093b586..fd92a64d748 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -14,6 +14,7 @@  #include <asm/thread_info.h>  #include <asm/processor-flags.h>  #include <asm/segment.h> +#include <asm/asm.h>  #include <xen/interface/xen.h> @@ -74,6 +75,17 @@ ENDPROC(xen_sysexit)   * stack state in whatever form its in, we keep things simple by only   * using a single register which is pushed/popped on the stack.   
*/ + +.macro POP_FS +1: +	popw %fs +.pushsection .fixup, "ax" +2:	movw $0, (%esp) +	jmp 1b +.popsection +	_ASM_EXTABLE(1b,2b) +.endm +  ENTRY(xen_iret)  	/* test eflags for special cases */  	testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) @@ -82,17 +94,15 @@ ENTRY(xen_iret)  	push %eax  	ESP_OFFSET=4	# bytes pushed onto stack -	/* -	 * Store vcpu_info pointer for easy access.  Do it this way to -	 * avoid having to reload %fs -	 */ +	/* Store vcpu_info pointer for easy access */  #ifdef CONFIG_SMP -	GET_THREAD_INFO(%eax) -	movl TI_cpu(%eax), %eax -	movl __per_cpu_offset(,%eax,4), %eax -	mov xen_vcpu(%eax), %eax +	pushw %fs +	movl $(__KERNEL_PERCPU), %eax +	movl %eax, %fs +	movl %fs:xen_vcpu, %eax +	POP_FS  #else -	movl xen_vcpu, %eax +	movl %ss:xen_vcpu, %eax  #endif  	/* check IF state we're restoring */ @@ -105,19 +115,21 @@ ENTRY(xen_iret)  	 * resuming the code, so we don't have to be worried about  	 * being preempted to another CPU.  	 */ -	setz XEN_vcpu_info_mask(%eax) +	setz %ss:XEN_vcpu_info_mask(%eax)  xen_iret_start_crit:  	/* check for unmasked and pending */ -	cmpw $0x0001, XEN_vcpu_info_pending(%eax) +	cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)  	/*  	 * If there's something pending, mask events again so we can -	 * jump back into xen_hypervisor_callback +	 * jump back into xen_hypervisor_callback. Otherwise do not +	 * touch XEN_vcpu_info_mask.  	 */ -	sete XEN_vcpu_info_mask(%eax) +	jne 1f +	movb $1, %ss:XEN_vcpu_info_mask(%eax) -	popl %eax +1:	popl %eax  	/*  	 * From this point on the registers are restored and the stack @@ -135,10 +147,7 @@ iret_restore_end:  1:	iret  xen_iret_end_crit: -.section __ex_table, "a" -	.align 4 -	.long 1b, iret_exc -.previous +	_ASM_EXTABLE(1b, iret_exc)  hyper_iret:  	/* put this out of line since its very rarely used */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 1a5ff24e29c..485b6958554 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -11,8 +11,28 @@  #include <asm/page_types.h>  #include <xen/interface/elfnote.h> +#include <xen/interface/features.h>  #include <asm/xen/interface.h> +#ifdef CONFIG_XEN_PVH +#define PVH_FEATURES_STR  "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" +/* Note the lack of 'hvm_callback_vector'. Older hypervisor will + * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in + * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. + */ +#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ +		      (1 << XENFEAT_auto_translated_physmap) | \ +		      (1 << XENFEAT_supervisor_mode_kernel) | \ +		      (1 << XENFEAT_hvm_callback_vector)) +/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that + * up regardless whether this CONFIG option is enabled or not, but it + * clarifies what the right flags need to be. 
+ */ +#else +#define PVH_FEATURES_STR  "" +#define PVH_FEATURES (0) +#endif +  	__INIT  ENTRY(startup_xen)  	cld @@ -28,9 +48,61 @@ ENTRY(startup_xen)  	__FINIT  .pushsection .text -	.align PAGE_SIZE_asm +	.balign PAGE_SIZE  ENTRY(hypercall_page) -	.skip PAGE_SIZE_asm +#define NEXT_HYPERCALL(x) \ +	ENTRY(xen_hypercall_##x) \ +	.skip 32 + +NEXT_HYPERCALL(set_trap_table) +NEXT_HYPERCALL(mmu_update) +NEXT_HYPERCALL(set_gdt) +NEXT_HYPERCALL(stack_switch) +NEXT_HYPERCALL(set_callbacks) +NEXT_HYPERCALL(fpu_taskswitch) +NEXT_HYPERCALL(sched_op_compat) +NEXT_HYPERCALL(platform_op) +NEXT_HYPERCALL(set_debugreg) +NEXT_HYPERCALL(get_debugreg) +NEXT_HYPERCALL(update_descriptor) +NEXT_HYPERCALL(ni) +NEXT_HYPERCALL(memory_op) +NEXT_HYPERCALL(multicall) +NEXT_HYPERCALL(update_va_mapping) +NEXT_HYPERCALL(set_timer_op) +NEXT_HYPERCALL(event_channel_op_compat) +NEXT_HYPERCALL(xen_version) +NEXT_HYPERCALL(console_io) +NEXT_HYPERCALL(physdev_op_compat) +NEXT_HYPERCALL(grant_table_op) +NEXT_HYPERCALL(vm_assist) +NEXT_HYPERCALL(update_va_mapping_otherdomain) +NEXT_HYPERCALL(iret) +NEXT_HYPERCALL(vcpu_op) +NEXT_HYPERCALL(set_segment_base) +NEXT_HYPERCALL(mmuext_op) +NEXT_HYPERCALL(xsm_op) +NEXT_HYPERCALL(nmi_op) +NEXT_HYPERCALL(sched_op) +NEXT_HYPERCALL(callback_op) +NEXT_HYPERCALL(xenoprof_op) +NEXT_HYPERCALL(event_channel_op) +NEXT_HYPERCALL(physdev_op) +NEXT_HYPERCALL(hvm_op) +NEXT_HYPERCALL(sysctl) +NEXT_HYPERCALL(domctl) +NEXT_HYPERCALL(kexec_op) +NEXT_HYPERCALL(tmem_op) /* 38 */ +ENTRY(xen_hypercall_rsvr) +	.skip 320 +NEXT_HYPERCALL(mca) /* 48 */ +NEXT_HYPERCALL(arch_1) +NEXT_HYPERCALL(arch_2) +NEXT_HYPERCALL(arch_3) +NEXT_HYPERCALL(arch_4) +NEXT_HYPERCALL(arch_5) +NEXT_HYPERCALL(arch_6) +	.balign PAGE_SIZE  .popsection  	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux") @@ -43,7 +115,10 @@ ENTRY(hypercall_page)  #endif  	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)  	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) -	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb") +	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) +	ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | +						(1 << XENFEAT_writable_page_tables) | +						(1 << XENFEAT_dom0))  	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")  	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")  	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 64044747348..97d87659f77 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -27,25 +27,27 @@ void xen_setup_mfn_list_list(void);  void xen_setup_shared_info(void);  void xen_build_mfn_list_list(void);  void xen_setup_machphys_mapping(void); -pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); -void xen_ident_map_ISA(void); +void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);  void xen_reserve_top(void);  extern unsigned long xen_max_p2m_pfn; +void xen_mm_pin_all(void); +void xen_mm_unpin_all(void);  void xen_set_pat(u64);  char * __init xen_memory_setup(void); +char * xen_auto_xlated_memory_setup(void);  void __init xen_arch_setup(void); -void __init xen_init_IRQ(void);  void xen_enable_sysenter(void);  void xen_enable_syscall(void);  void xen_vcpu_restore(void);  void xen_callback_vector(void);  void xen_hvm_init_shared_info(void); -void __init xen_unplug_emulated_devices(void); +void xen_unplug_emulated_devices(void);  void 
__init xen_build_dynamic_phys_to_machine(void); +unsigned long __init xen_revector_p2m_tree(void);  void xen_init_irq_ops(void);  void xen_setup_timer(int cpu); @@ -64,15 +66,17 @@ void xen_setup_vcpu_info_placement(void);  #ifdef CONFIG_SMP  void xen_smp_init(void); +void __init xen_hvm_smp_init(void);  extern cpumask_var_t xen_cpu_initialized_map;  #else  static inline void xen_smp_init(void) {} +static inline void xen_hvm_smp_init(void) {}  #endif  #ifdef CONFIG_PARAVIRT_SPINLOCKS  void __init xen_init_spinlocks(void); -__cpuinit void xen_init_lock_cpu(int cpu); +void xen_init_lock_cpu(int cpu);  void xen_uninit_lock_cpu(int cpu);  #else  static inline void xen_init_spinlocks(void) @@ -86,12 +90,27 @@ static inline void xen_uninit_lock_cpu(int cpu)  }  #endif +struct dom0_vga_console_info; + +#ifdef CONFIG_XEN_DOM0 +void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +void __init xen_init_apic(void); +#else +static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, +				       size_t size) +{ +} +static inline void __init xen_init_apic(void) +{ +} +#endif +  /* Declare an asm function, along with symbols needed to make it     inlineable */  #define DECL_ASM(ret, name, ...)		\ -	ret name(__VA_ARGS__);			\ -	extern char name##_end[];		\ -	extern char name##_reloc[]		\ +	__visible ret name(__VA_ARGS__);	\ +	extern char name##_end[] __visible;	\ +	extern char name##_reloc[] __visible  DECL_ASM(void, xen_irq_enable_direct, void);  DECL_ASM(void, xen_irq_disable_direct, void); @@ -99,12 +118,13 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void);  DECL_ASM(void, xen_restore_fl_direct, unsigned long);  /* These are not functions, and cannot be called normally */ -void xen_iret(void); -void xen_sysexit(void); -void xen_sysret32(void); -void xen_sysret64(void); -void xen_adjust_exception_frame(void); +__visible void xen_iret(void); +__visible void xen_sysexit(void); +__visible void xen_sysret32(void); +__visible void xen_sysret64(void); +__visible void xen_adjust_exception_frame(void);  extern int xen_panic_handler_init(void); +void xen_pvh_secondary_vcpu_init(int cpu);  #endif /* XEN_OPS_H */  | 
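For reference, the publication protocol that the new xen_lock_spinning() and xen_unlock_kick() paths in the spinlock.c hunk rely on (clear the per-CPU lock pointer, publish the wanted ticket, then re-publish the lock pointer, with a write barrier between each step, while the unlocker reads lock before want) is easier to follow in isolation. The following is a minimal user-space sketch of that ordering in C11; struct lock_waiting, publish_wait_entry(), find_waiter() and the fixed four-slot table are invented for illustration only and are not the kernel implementation.

/* lock_waiting_sketch.c - illustrative only; build with: cc -std=c11 lock_waiting_sketch.c */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

typedef unsigned short ticket_t;

/* Hypothetical per-CPU slot a spinner publishes before blocking. */
struct lock_waiting {
	void *_Atomic lock;	/* which lock this CPU sleeps on, or NULL */
	_Atomic ticket_t want;	/* which ticket it is waiting for         */
};

#define NR_FAKE_CPUS 4
static struct lock_waiting waiting[NR_FAKE_CPUS];

/*
 * Spinner side: "lock" may only be non-NULL while "want" is correct,
 * so clear lock, then set want, then set lock - never the reverse.
 */
static void publish_wait_entry(int cpu, void *lock, ticket_t want)
{
	atomic_store_explicit(&waiting[cpu].lock, NULL, memory_order_release);
	atomic_store_explicit(&waiting[cpu].want, want, memory_order_release);
	atomic_store_explicit(&waiting[cpu].lock, lock, memory_order_release);
}

static void clear_wait_entry(int cpu)
{
	atomic_store_explicit(&waiting[cpu].lock, NULL, memory_order_release);
}

/*
 * Unlocker side: read lock before want; a stale pair can only cause a
 * spurious kick, never a missed one.
 */
static int find_waiter(void *lock, ticket_t next)
{
	for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
		void *l = atomic_load_explicit(&waiting[cpu].lock, memory_order_acquire);
		ticket_t w = atomic_load_explicit(&waiting[cpu].want, memory_order_acquire);

		if (l == lock && w == next)
			return cpu;	/* this is the CPU to kick */
	}
	return -1;
}

int main(void)
{
	int dummy_lock;		/* stands in for an arch_spinlock_t */

	publish_wait_entry(2, &dummy_lock, 7);
	printf("waiter for ticket 7: cpu %d\n", find_waiter(&dummy_lock, 7));
	printf("waiter for ticket 8: cpu %d\n", find_waiter(&dummy_lock, 8));
	clear_wait_entry(2);
	printf("after clear: cpu %d\n", find_waiter(&dummy_lock, 7));
	return 0;
}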
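Similarly, the xen_init_vga() hunk guards every optional dom0_vga_console_info field with a check of the form size >= offsetof() + sizeof() before reading it, since the structure has grown over hypervisor revisions and the copy passed to dom0 may be shorter than the kernel's headers describe. A stand-alone sketch of that guard pattern is below; struct fb_info and parse_fb_info() are made-up names used only to illustrate the idiom.

/* size_guard_sketch.c - illustrative only; build with: cc -std=c11 size_guard_sketch.c */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical info block that gained a field in a later revision. */
struct fb_info {
	uint32_t width;
	uint32_t height;
	uint32_t gbl_caps;	/* only present in newer producers */
};

/* Accept whatever 'size' the producer claims and only read what fits. */
static void parse_fb_info(const void *buf, size_t size)
{
	struct fb_info info;

	memset(&info, 0, sizeof(info));
	memcpy(&info, buf, size < sizeof(info) ? size : sizeof(info));

	if (size < offsetof(struct fb_info, height) + sizeof(info.height)) {
		printf("too short to carry width/height\n");
		return;
	}
	printf("%ux%u", (unsigned)info.width, (unsigned)info.height);

	if (size >= offsetof(struct fb_info, gbl_caps) + sizeof(info.gbl_caps))
		printf(", caps=%#x", (unsigned)info.gbl_caps);
	printf("\n");
}

int main(void)
{
	struct fb_info new_style = { 1024, 768, 0x3 };
	uint32_t old_style[2] = { 640, 480 };	/* producer predates gbl_caps */

	parse_fb_info(&new_style, sizeof(new_style));
	parse_fb_info(old_style, sizeof(old_style));
	return 0;
}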
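Finally, the xen_pvclock_gtod_notify() addition in the time.c hunk throttles the XENPF_settime hypercall: the time is pushed to the hypervisor only when the clock was explicitly set or when an 11 minute deadline has passed, mirroring the kernel's own RTC sync cadence. A minimal sketch of that throttle is below; maybe_sync_wallclock() and push_time_to_hypervisor() are invented stand-ins, with the latter taking the place of the actual hypercall.

/* wallclock_throttle_sketch.c - illustrative only; build with: cc -std=c11 wallclock_throttle_sketch.c */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define RESYNC_PERIOD_SEC (11 * 60)	/* 11-minute drift compensation cadence */

/* Stand-in for the expensive XENPF_settime hypercall. */
static void push_time_to_hypervisor(const struct timespec *now)
{
	printf("pushing %lld.%09ld to the hypervisor\n",
	       (long long)now->tv_sec, now->tv_nsec);
}

static int timespec_cmp(const struct timespec *a, const struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec < b->tv_sec ? -1 : 1;
	if (a->tv_nsec != b->tv_nsec)
		return a->tv_nsec < b->tv_nsec ? -1 : 1;
	return 0;
}

/* Called on every clock update; 'was_set' means the clock was stepped. */
static void maybe_sync_wallclock(const struct timespec *now, bool was_set)
{
	static struct timespec next_sync;	/* serialized by the caller */

	if (!was_set && timespec_cmp(now, &next_sync) < 0)
		return;				/* nothing to do yet */

	push_time_to_hypervisor(now);

	/* Arm the next periodic drift-compensation deadline. */
	next_sync = *now;
	next_sync.tv_sec += RESYNC_PERIOD_SEC;
}

int main(void)
{
	struct timespec t = { .tv_sec = 1000, .tv_nsec = 0 };

	maybe_sync_wallclock(&t, false);	/* first call: deadline unset, pushes */
	t.tv_sec += 60;
	maybe_sync_wallclock(&t, false);	/* one minute later: throttled        */
	maybe_sync_wallclock(&t, true);		/* clock was stepped: pushes again    */
	t.tv_sec += RESYNC_PERIOD_SEC;
	maybe_sync_wallclock(&t, false);	/* deadline passed: pushes again      */
	return 0;
}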