diff options
Diffstat (limited to 'arch/x86/xen')
32 files changed, 8368 insertions, 2395 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 4d5f2649bee..e88fda867a3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -4,10 +4,49 @@ config XEN bool "Xen guest support" - select PARAVIRT - depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) + depends on PARAVIRT + select PARAVIRT_CLOCK + select XEN_HAVE_PVMMU + depends on X86_64 || (X86_32 && X86_PAE) + depends on X86_TSC help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the Xen hypervisor. + +config XEN_DOM0 + def_bool y + depends on XEN && PCI_XEN && SWIOTLB_XEN + depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI + +config XEN_PVHVM + def_bool y + depends on XEN && PCI && X86_LOCAL_APIC + +config XEN_MAX_DOMAIN_MEMORY + int + default 500 if X86_64 + default 64 if X86_32 + depends on XEN + help + This only affects the sizing of some bss arrays, the unused + portions of which are freed. + +config XEN_SAVE_RESTORE + bool + depends on XEN + select HIBERNATE_CALLBACKS + default y + +config XEN_DEBUG_FS + bool "Enable Xen debug and tuning parameters in debugfs" + depends on XEN && DEBUG_FS + default n + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. + +config XEN_PVH + bool "Support for running as a PVH guest" + depends on X86_64 && XEN && XEN_PVHVM + def_bool n diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 343df246bd3..96ab2c09cb6 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,24 @@ -obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ - events.o time.o manage.o xen-asm.o +ifdef CONFIG_FUNCTION_TRACER +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_spinlock.o = -pg +CFLAGS_REMOVE_time.o = -pg +CFLAGS_REMOVE_irq.o = -pg +endif -obj-$(CONFIG_SMP) += smp.o +# Make sure early boot has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_enlighten.o := $(nostackp) +CFLAGS_mmu.o := $(nostackp) + +obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ + time.o xen-asm.o xen-asm_$(BITS).o \ + grant-table.o suspend.o platform-pci-unplug.o \ + p2m.o + +obj-$(CONFIG_EVENT_TRACING) += trace.o + +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o +obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o +obj-$(CONFIG_XEN_DOM0) += apic.o vga.o +obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c new file mode 100644 index 00000000000..7005ced5d1a --- /dev/null +++ b/arch/x86/xen/apic.c @@ -0,0 +1,34 @@ +#include <linux/init.h> + +#include <asm/x86_init.h> +#include <asm/apic.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/interface/physdev.h> +#include "xen-ops.h" + +static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) +{ + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mpc_ioapic_addr(apic); + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (!ret) + return apic_op.value; + + /* fallback to return an emulated IO_APIC values */ + if (reg == 0x1) + return 0x00170020; + else if (reg == 0x0) + return apic << 24; + + return 0xfd; +} + +void __init xen_init_apic(void) +{ + x86_io_apic_ops.read = xen_io_apic_read; +} diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c new file mode 100644 index 00000000000..c8377fb26cd --- /dev/null +++ b/arch/x86/xen/debugfs.c @@ -0,0 +1,21 @@ +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/module.h> + +#include "debugfs.h" + +static struct dentry *d_xen_debug; + +struct dentry * __init xen_init_debugfs(void) +{ + if (!d_xen_debug) { + d_xen_debug = debugfs_create_dir("xen", NULL); + + if (!d_xen_debug) + pr_warning("Could not create 'xen' debugfs directory\n"); + } + + return d_xen_debug; +} + diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h new file mode 100644 index 00000000000..12ebf3325c7 --- /dev/null +++ b/arch/x86/xen/debugfs.h @@ -0,0 +1,6 @@ +#ifndef _XEN_DEBUGFS_H +#define _XEN_DEBUGFS_H + +struct dentry * __init xen_init_debugfs(void); + +#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 27ee26aedf9..ffb101e4573 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -11,6 +11,7 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ +#include <linux/cpu.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/smp.h> @@ -20,67 +21,119 @@ #include <linux/delay.h> #include <linux/start_kernel.h> #include <linux/sched.h> +#include <linux/kprobes.h> #include <linux/bootmem.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/highmem.h> - +#include <linux/console.h> +#include <linux/pci.h> +#include <linux/gfp.h> +#include <linux/memblock.h> +#include <linux/edd.h> + +#include <xen/xen.h> +#include <xen/events.h> #include <xen/interface/xen.h> +#include <xen/interface/version.h> #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> -#include <xen/interface/sched.h> +#include <xen/interface/memory.h> +#include <xen/interface/xen-mca.h> #include <xen/features.h> #include <xen/page.h> +#include <xen/hvm.h> +#include <xen/hvc-console.h> +#include <xen/acpi.h> #include <asm/paravirt.h> +#include <asm/apic.h> #include <asm/page.h> +#include <asm/xen/pci.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> #include <asm/fixmap.h> #include <asm/processor.h> +#include <asm/proto.h> +#include <asm/msr-index.h> +#include <asm/traps.h> #include <asm/setup.h> #include <asm/desc.h> +#include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> +#include <asm/stackprotector.h> +#include <asm/hypervisor.h> +#include <asm/mwait.h> +#include <asm/pci_x86.h> +#include <asm/pat.h> + +#ifdef CONFIG_ACPI +#include <linux/acpi.h> +#include <asm/acpi.h> +#include <acpi/pdc_intel.h> +#include <acpi/processor.h> +#include <xen/interface/platform.h> +#endif #include "xen-ops.h" #include "mmu.h" +#include "smp.h" #include "multicalls.h" EXPORT_SYMBOL_GPL(hypercall_page); -DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); -DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); - /* - * Note about cr3 (pagetable base) values: + * Pointer to the xen_vcpu_info structure or + * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info + * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info + * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point + * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to + * acknowledge pending events. + * Also more subtly it is used by the patched version of irq enable/disable + * e.g. xen_irq_enable_direct and xen_iret in PV mode. * - * xen_cr3 contains the current logical cr3 value; it contains the - * last set cr3. This may not be the current effective cr3, because - * its update may be being lazily deferred. However, a vcpu looking - * at its own cr3 can use this value knowing that it everything will - * be self-consistent. + * The desire to be able to do those mask/unmask operations as a single + * instruction by using the per-cpu offset held in %gs is the real reason + * vcpu info is in a per-cpu pointer and the original reason for this + * hypercall. * - * xen_current_cr3 contains the actual vcpu cr3; it is set once the - * hypercall to set the vcpu cr3 is complete (so it may be a little - * out of date, but it will never be set early). If one vcpu is - * looking at another vcpu's cr3 value, it should use this variable. */ -DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); + +/* + * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info + * hypercall. This can be used both in PV and PVHVM mode. The structure + * overrides the default per_cpu(xen_vcpu, cpu) value. + */ +DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); + +enum xen_domain_type xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); + +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned long machine_to_phys_nr; +EXPORT_SYMBOL(machine_to_phys_nr); struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); -static /* __initdata */ struct shared_info dummy_shared_info; +struct shared_info xen_dummy_shared_info; + +void *xen_initial_gdt; + +RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. */ -struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; /* * Flag to determine whether vcpu info placement is available on all @@ -97,65 +150,187 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; */ static int have_vcpu_info_placement = 1; -static void __init xen_vcpu_setup(int cpu) +struct tls_descs { + struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed. Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. + */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); + +static void clamp_max_cpus(void) +{ +#ifdef CONFIG_SMP + if (setup_max_cpus > MAX_VIRT_CPUS) + setup_max_cpus = MAX_VIRT_CPUS; +#endif +} + +static void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; struct vcpu_info *vcpup; - BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - if (!have_vcpu_info_placement) - return; /* already tested, not available */ + /* + * This path is called twice on PVHVM - first during bootup via + * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being + * hotplugged: cpu_up -> xen_hvm_cpu_notify. + * As we can only do the VCPUOP_register_vcpu_info once lets + * not over-write its result. + * + * For PV it is called during restore (xen_vcpu_restore) and bootup + * (xen_setup_vcpu_info_placement). The hotplug mechanism does not + * use this function. + */ + if (xen_hvm_domain()) { + if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) + return; + } + if (cpu < MAX_VIRT_CPUS) + per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - vcpup = &per_cpu(xen_vcpu_info, cpu); + if (!have_vcpu_info_placement) { + if (cpu >= MAX_VIRT_CPUS) + clamp_max_cpus(); + return; + } - info.mfn = virt_to_mfn(vcpup); + vcpup = &per_cpu(xen_vcpu_info, cpu); + info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", - cpu, vcpup, info.mfn, info.offset); - /* Check to see if the hypervisor will put the vcpu_info structure where we want it, which allows direct access via - a percpu-variable. */ + a percpu-variable. + N.B. This hypercall can _only_ be called once per CPU. Subsequent + calls will error out with -EINVAL. This is due to the fact that + hypervisor has no unregister variant and this hypercall does not + allow to over-write info.mfn and info.offset. + */ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); have_vcpu_info_placement = 0; + clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if later ones fail to. */ per_cpu(xen_vcpu, cpu) = vcpup; + } +} - printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", - cpu, vcpup); +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... + */ +void xen_vcpu_restore(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL); + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + BUG(); + + xen_setup_runstate_info(cpu); + + if (have_vcpu_info_placement) + xen_vcpu_setup(cpu); + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + BUG(); } } static void __init xen_banner(void) { - printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - pv_info.name); - printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + + pr_info("Booting paravirtualized kernel %son %s\n", + xen_feature(XENFEAT_auto_translated_physmap) ? + "with PVH extensions " : "", pv_info.name); + printk(KERN_INFO "Xen version: %d.%d%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); +} +/* Check if running on Xen version (major, minor) or later */ +bool +xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; } +#define CPUID_THERM_POWER_LEAF 6 +#define APERFMPERF_PRESENT 0 + +static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; +static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; + +static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; +static __read_mostly unsigned int cpuid_leaf5_ecx_val; +static __read_mostly unsigned int cpuid_leaf5_edx_val; + static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { + unsigned maskebx = ~0; + unsigned maskecx = ~0; unsigned maskedx = ~0; - + unsigned setecx = 0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ - if (*ax == 1) - maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ - (1 << X86_FEATURE_ACPI) | /* disable ACPI */ - (1 << X86_FEATURE_SEP) | /* disable SEP */ - (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + switch (*ax) { + case 1: + maskecx = cpuid_leaf1_ecx_mask; + setecx = cpuid_leaf1_ecx_set_mask; + maskedx = cpuid_leaf1_edx_mask; + break; + + case CPUID_MWAIT_LEAF: + /* Synthesize the values.. */ + *ax = 0; + *bx = 0; + *cx = cpuid_leaf5_ecx_val; + *dx = cpuid_leaf5_edx_val; + return; + + case CPUID_THERM_POWER_LEAF: + /* Disabling APERFMPERF for kernel usage */ + maskecx = ~(1 << APERFMPERF_PRESENT); + break; + + case 0xb: + /* Suppress extended topology stuff */ + maskebx = 0; + break; + } asm(XEN_EMULATE_PREFIX "cpuid" : "=a" (*ax), @@ -163,111 +338,129 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, "=c" (*cx), "=d" (*dx) : "0" (*ax), "2" (*cx)); + + *bx &= maskebx; + *cx &= maskecx; + *cx |= setecx; *dx &= maskedx; -} -static void xen_set_debugreg(int reg, unsigned long val) -{ - HYPERVISOR_set_debugreg(reg, val); } -static unsigned long xen_get_debugreg(int reg) +static bool __init xen_check_mwait(void) { - return HYPERVISOR_get_debugreg(reg); -} +#ifdef CONFIG_ACPI + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + uint32_t buf[3]; + unsigned int ax, bx, cx, dx; + unsigned int mwait_mask; + + /* We need to determine whether it is OK to expose the MWAIT + * capability to the kernel to harvest deeper than C3 states from ACPI + * _CST using the processor_harvest_xen.c module. For this to work, we + * need to gather the MWAIT_LEAF values (which the cstate.c code + * checks against). The hypervisor won't expose the MWAIT flag because + * it would break backwards compatibility; so we will find out directly + * from the hardware and hypercall. + */ + if (!xen_initial_domain()) + return false; -static unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; + /* + * When running under platform earlier than Xen4.2, do not expose + * mwait, to avoid the risk of loading native acpi pad driver + */ + if (!xen_running_on_version_or_later(4, 2)) + return false; - vcpu = x86_read_percpu(xen_vcpu); + ax = 1; + cx = 0; - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; + native_cpuid(&ax, &bx, &cx, &dx); - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} + mwait_mask = (1 << (X86_FEATURE_EST % 32)) | + (1 << (X86_FEATURE_MWAIT % 32)); -static void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; + if ((cx & mwait_mask) != mwait_mask) + return false; - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); + /* We need to emulate the MWAIT_LEAF and for that we need both + * ecx and edx. The hypercall provides only partial information. + */ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); + ax = CPUID_MWAIT_LEAF; + bx = 0; + cx = 0; + dx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. + */ + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + + if ((HYPERVISOR_dom0_op(&op) == 0) && + (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + cpuid_leaf5_ecx_val = cx; + cpuid_leaf5_edx_val = dx; } + return true; +#else + return false; +#endif } - -static void xen_irq_disable(void) +static void __init xen_init_cpuid_mask(void) { - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} + unsigned int ax, bx, cx, dx; + unsigned int xsave_mask; -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; + cpuid_leaf1_edx_mask = + ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - preempt_enable_no_resched(); + if (!xen_initial_domain()) + cpuid_leaf1_edx_mask &= + ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ + + cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ + ax = 1; + cx = 0; + cpuid(1, &ax, &bx, &cx, &dx); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); + xsave_mask = + (1 << (X86_FEATURE_XSAVE % 32)) | + (1 << (X86_FEATURE_OSXSAVE % 32)); + + /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ + if ((cx & xsave_mask) != xsave_mask) + cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ + if (xen_check_mwait()) + cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); } -static void xen_safe_halt(void) +static void xen_set_debugreg(int reg, unsigned long val) { - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) - BUG(); + HYPERVISOR_set_debugreg(reg, val); } -static void xen_halt(void) +static unsigned long xen_get_debugreg(int reg) { - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); + return HYPERVISOR_get_debugreg(reg); } -static void xen_leave_lazy(void) +static void xen_end_context_switch(struct task_struct *next) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); + paravirt_end_context_switch(next); } static unsigned long xen_store_tr(void) @@ -275,11 +468,66 @@ static unsigned long xen_store_tr(void) return 0; } +/* + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. + */ +static void set_aliased_prot(void *v, pgprot_t prot) +{ + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; + + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); + + pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); + + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); + + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); +} + +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; + + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); +} + +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; + int i; + + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); +} + static void xen_set_ldt(const void *addr, unsigned entries) { struct mmuext_op *op; struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + trace_xen_cpu_set_ldt(addr, entries); + op = mcs.args; op->cmd = MMUEXT_SET_LDT; op->arg1.linear_addr = (unsigned long)addr; @@ -292,44 +540,142 @@ static void xen_set_ldt(const void *addr, unsigned entries) static void xen_load_gdt(const struct desc_ptr *dtr) { - unsigned long *frames; unsigned long va = dtr->address; unsigned int size = dtr->size + 1; unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long frames[pages]; int f; - struct multicall_space mcs; - /* A GDT can be up to 64k in size, which corresponds to 8192 - 8-byte entries, or 16 4k pages.. */ + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ BUG_ON(size > 65536); BUG_ON(va & ~PAGE_MASK); - mcs = xen_mc_entry(sizeof(*frames) * pages); - frames = mcs.args; - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); + int level; + pte_t *ptep; + unsigned long pfn, mfn; + void *virt; + + /* + * The GDT is per-cpu and is in the percpu data area. + * That can be virtually mapped, so we need to do a + * page-walk to get the underlying MFN for the + * hypercall. The page can also be in the kernel's + * linear range, so we need to RO that mapping too. + */ + ptep = lookup_address(va, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + mfn = pfn_to_mfn(pfn); + virt = __va(PFN_PHYS(pfn)); + + frames[f] = mfn; + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(virt); } - MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); +} - xen_mc_issue(PARAVIRT_LAZY_CPU); +/* + * load_gdt for early boot, when the gdt is only mapped once + */ +static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) +{ + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long frames[pages]; + int f; + + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + pte_t pte; + unsigned long pfn, mfn; + + pfn = virt_to_pfn(va); + mfn = pfn_to_mfn(pfn); + + pte = pfn_pte(pfn, PAGE_KERNEL_RO); + + if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) + BUG(); + + frames[f] = mfn; + } + + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); +} + +static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) +{ + return d1->a == d2->a && d1->b == d2->b; } static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - struct multicall_space mc = __xen_mc_entry(0); + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; + struct desc_struct *gdt; + xmaddr_t maddr; + struct multicall_space mc; + + if (desc_equal(shadow, &t->tls_array[i])) + return; + + *shadow = t->tls_array[i]; + + gdt = get_cpu_gdt_table(cpu); + maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); } static void xen_load_tls(struct thread_struct *t, unsigned int cpu) { + /* + * XXX sleazy hack: If we're being called in a lazy-cpu zone + * and lazy gs handling is enabled, it means we're in a + * context switch, and %gs has just been saved. This means we + * can zero it out to prevent faults on exit from the + * hypervisor if the next process has no %gs. Either way, it + * has been saved, and the new value will get loaded properly. + * This will go away as soon as Xen has been modified to not + * save/restore %gs for normal hypercalls. + * + * On x86_64, this hack is not used for %gs, because gs points + * to KERNEL_GS_BASE (and uses it for PDA references), so we + * must not zero %gs on x86_64 + * + * For x86_64, we need to zero %fs, otherwise we may get an + * exception between the new %fs descriptor being loaded and + * %fs being effectively cleared at __switch_to(). + */ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { +#ifdef CONFIG_X86_32 + lazy_load_gs(0); +#else + loadsegment(fs, 0); +#endif + } + xen_mc_batch(); load_TLS_descriptor(t, cpu, 0); @@ -337,27 +683,24 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) load_TLS_descriptor(t, cpu, 2); xen_mc_issue(PARAVIRT_LAZY_CPU); +} - /* - * XXX sleazy hack: If we're being called in a lazy-cpu zone, - * it means we're in a context switch, and %gs has just been - * saved. This means we can zero it out to prevent faults on - * exit from the hypervisor if the next process has no %gs. - * Either way, it has been saved, and the new value will get - * loaded properly. This will go away as soon as Xen has been - * modified to not save/restore %gs for normal hypercalls. - */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) - loadsegment(gs, 0); +#ifdef CONFIG_X86_64 +static void xen_load_gs_index(unsigned int idx) +{ + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) + BUG(); } +#endif static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; + trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); + preempt_disable(); xen_mc_flush(); @@ -367,24 +710,60 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } -static int cvt_gate_to_trap(int vector, u32 low, u32 high, +static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { - u8 type, dpl; + unsigned long addr; - type = (high >> 8) & 0x1f; - dpl = (high >> 13) & 3; - - if (type != 0xf && type != 0xe) + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) return 0; info->vector = vector; - info->address = (high & 0xffff0000) | (low & 0x0000ffff); - info->cs = low >> 16; - info->flags = dpl; + + addr = gate_offset(*val); +#ifdef CONFIG_X86_64 + /* + * Look for known traps using IST, and substitute them + * appropriately. The debugger ones are the only ones we care + * about. Xen will handle faults like double_fault, + * so we should never see them. Warn if + * there's an unexpected IST-using fault handler. + */ + if (addr == (unsigned long)debug) + addr = (unsigned long)xen_debug; + else if (addr == (unsigned long)int3) + addr = (unsigned long)xen_int3; + else if (addr == (unsigned long)stack_segment) + addr = (unsigned long)xen_stack_segment; + else if (addr == (unsigned long)double_fault) { + /* Don't need to handle these */ + return 0; +#ifdef CONFIG_X86_MCE + } else if (addr == (unsigned long)machine_check) { + /* + * when xen hypervisor inject vMCE to guest, + * use native mce handler to handle it + */ + ; +#endif + } else if (addr == (unsigned long)nmi) + /* + * Use the native version as well. + */ + ; + else { + /* Some other trap using IST? */ + if (WARN_ON(val->ist != 0)) + return 0; + } +#endif /* CONFIG_X86_64 */ + info->address = addr; + + info->cs = gate_segment(*val); + info->flags = val->dpl; /* interrupt gates clear IF */ - if (type == 0xe) - info->flags |= 4; + if (val->type == GATE_INTERRUPT) + info->flags |= 1 << 2; return 1; } @@ -399,10 +778,12 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) unsigned long p = (unsigned long)&dt[entrynum]; unsigned long start, end; + trace_xen_cpu_write_idt_entry(dt, entrynum, g); + preempt_disable(); - start = __get_cpu_var(idt_desc).address; - end = start + __get_cpu_var(idt_desc).size + 1; + start = __this_cpu_read(idt_desc.address); + end = start + __this_cpu_read(idt_desc.size) + 1; xen_mc_flush(); @@ -410,11 +791,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) if (p >= start && (p + 8) <= end) { struct trap_info info[2]; - u32 *desc = (u32 *)g; info[1].address = 0; - if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) + if (cvt_gate_to_trap(entrynum, g, &info[0])) if (HYPERVISOR_set_trap_table(info)) BUG(); } @@ -427,13 +807,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, { unsigned in, out, count; - count = (desc->size+1) / 8; + count = (desc->size+1) / sizeof(gate_desc); BUG_ON(count > 256); for (in = out = 0; in < count; in++) { - const u32 *entry = (u32 *)(desc->address + in * 8); + gate_desc *entry = (gate_desc*)(desc->address) + in; - if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out])) out++; } traps[out].address = 0; @@ -454,6 +834,8 @@ static void xen_load_idt(const struct desc_ptr *desc) static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + trace_xen_cpu_load_idt(desc); + spin_lock(&lock); __get_cpu_var(idt_desc) = *desc; @@ -472,6 +854,8 @@ static void xen_load_idt(const struct desc_ptr *desc) static void xen_write_gdt_entry(struct desc_struct *dt, int entry, const void *desc, int type) { + trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); + preempt_disable(); switch (type) { @@ -481,7 +865,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, break; default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); + xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) @@ -493,10 +877,37 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, preempt_enable(); } +/* + * Version of write_gdt_entry for use at early boot-time needed to + * update an entry as simply as possible. + */ +static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); + + switch (type) { + case DESC_LDT: + case DESC_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) + dt[entry] = *(struct desc_struct *)desc; + } + + } +} + static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { - struct multicall_space mcs = xen_mc_entry(0); + struct multicall_space mcs; + + mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); } @@ -515,325 +926,202 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static u32 xen_apic_read(unsigned long reg) +static unsigned long xen_set_apic_id(unsigned int x) { - return 0; -} - -static void xen_apic_write(unsigned long reg, u32 val) -{ - /* Warn to see if there's any stray references */ WARN_ON(1); + return x; } -#endif - -static void xen_flush_tlb(void) -{ - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_TLB_FLUSH_LOCAL; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -static void xen_flush_tlb_single(unsigned long addr) +static unsigned int xen_get_apic_id(unsigned long x) { - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_INVLPG_LOCAL; - op->arg1.linear_addr = addr & PAGE_MASK; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); + return ((x)>>24) & 0xFFu; } - -static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, - unsigned long va) +static u32 xen_apic_read(u32 reg) { - struct { - struct mmuext_op op; - cpumask_t mask; - } *args; - cpumask_t cpumask = *cpus; - struct multicall_space mcs; - - /* - * A couple of (to be removed) sanity checks: - * - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - BUG_ON(!mm); - - /* If a CPU which we ran on has gone down, OK. */ - cpus_and(cpumask, cpumask, cpu_online_map); - if (cpus_empty(cpumask)) - return; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.pcpu_info.xen_cpuid = 0, + }; + int ret = 0; + + /* Shouldn't need this as APIC is turned off for PV, and we only + * get called on the bootup processor. But just in case. */ + if (!xen_initial_domain() || smp_processor_id()) + return 0; - mcs = xen_mc_entry(sizeof(*args)); - args = mcs.args; - args->mask = cpumask; - args->op.arg2.vcpumask = &args->mask; + if (reg == APIC_LVR) + return 0x10; - if (va == TLB_FLUSH_ALL) { - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - } else { - args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = va; - } + if (reg != APIC_ID) + return 0; - MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return 0; - xen_mc_issue(PARAVIRT_LAZY_MMU); + return op.u.pcpu_info.apic_id << 24; } -static void xen_write_cr2(unsigned long cr2) +static void xen_apic_write(u32 reg, u32 val) { - x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; + /* Warn to see if there's any stray references */ + WARN_ON(1); } -static unsigned long xen_read_cr2(void) +static u64 xen_apic_icr_read(void) { - return x86_read_percpu(xen_vcpu)->arch.cr2; + return 0; } -static unsigned long xen_read_cr2_direct(void) +static void xen_apic_icr_write(u32 low, u32 id) { - return x86_read_percpu(xen_vcpu_info.arch.cr2); + /* Warn to see if there's any stray references */ + WARN_ON(1); } -static void xen_write_cr4(unsigned long cr4) +static void xen_apic_wait_icr_idle(void) { - /* Just ignore cr4 changes; Xen doesn't allow us to do - anything anyway. */ + return; } -static unsigned long xen_read_cr3(void) +static u32 xen_safe_apic_wait_icr_idle(void) { - return x86_read_percpu(xen_cr3); + return 0; } -static void set_current_cr3(void *v) +static void set_xen_basic_apic_ops(void) { - x86_write_percpu(xen_current_cr3, (unsigned long)v); + apic->read = xen_apic_read; + apic->write = xen_apic_write; + apic->icr_read = xen_apic_icr_read; + apic->icr_write = xen_apic_icr_write; + apic->wait_icr_idle = xen_apic_wait_icr_idle; + apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; + apic->set_apic_id = xen_set_apic_id; + apic->get_apic_id = xen_get_apic_id; + +#ifdef CONFIG_SMP + apic->send_IPI_allbutself = xen_send_IPI_allbutself; + apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself; + apic->send_IPI_mask = xen_send_IPI_mask; + apic->send_IPI_all = xen_send_IPI_all; + apic->send_IPI_self = xen_send_IPI_self; +#endif } -static void xen_write_cr3(unsigned long cr3) +#endif + +static void xen_clts(void) { - struct mmuext_op *op; struct multicall_space mcs; - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); - - BUG_ON(preemptible()); - - mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - x86_write_percpu(xen_cr3, cr3); + mcs = xen_mc_entry(0); - op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; - op->arg1.mfn = mfn; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - /* Update xen_update_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); - - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ -} - -/* Early in boot, while setting up the initial pagetable, assume - everything is pinned. */ -static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) -{ - BUG_ON(mem_map); /* should only be used early */ - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); -} + MULTI_fpu_taskswitch(mcs.mc, 0); -/* Early release_pt assumes that all pts are pinned, since there's - only init_mm and anything attached to that is pinned. */ -static void xen_release_pt_init(u32 pfn) -{ - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + xen_mc_issue(PARAVIRT_LAZY_CPU); } -static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} +static DEFINE_PER_CPU(unsigned long, xen_cr0_value); -/* This needs to make sure the new pte page is pinned iff its being - attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) +static unsigned long xen_read_cr0(void) { - struct page *page = pfn_to_page(pfn); - - if (PagePinned(virt_to_page(mm->pgd))) { - SetPagePinned(page); + unsigned long cr0 = this_cpu_read(xen_cr0_value); - if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - if (level == PT_PTE) - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - } else - /* make sure there are no stray mappings of - this page */ - kmap_flush_unused(); + if (unlikely(cr0 == 0)) { + cr0 = native_read_cr0(); + this_cpu_write(xen_cr0_value, cr0); } -} -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PTE); + return cr0; } -static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) +static void xen_write_cr0(unsigned long cr0) { - xen_alloc_ptpage(mm, pfn, PT_PMD); -} + struct multicall_space mcs; -/* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(u32 pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); + this_cpu_write(xen_cr0_value, cr0); - if (PagePinned(page)) { - if (!PageHighMem(page)) { - if (level == PT_PTE) - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); - } - ClearPagePinned(page); - } -} + /* Only pay attention to cr0.TS; everything else is + ignored. */ + mcs = xen_mc_entry(0); -static void xen_release_pt(u32 pfn) -{ - xen_release_ptpage(pfn, PT_PTE); -} + MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); -static void xen_release_pd(u32 pfn) -{ - xen_release_ptpage(pfn, PT_PMD); + xen_mc_issue(PARAVIRT_LAZY_CPU); } -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) +static void xen_write_cr4(unsigned long cr4) { - pgprot_t prot = PAGE_KERNEL; - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); + cr4 &= ~X86_CR4_PGE; + cr4 &= ~X86_CR4_PSE; - return kmap_atomic_prot(page, type, prot); + native_write_cr4(cr4); } -#endif - -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) { - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ - if (pte_val_ma(*ptep) & _PAGE_PRESENT) - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & - pte_val_ma(pte)); - - return pte; + return 0; } - -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +static inline void xen_write_cr8(unsigned long val) { - pte = mask_rw_pte(ptep, pte); - - xen_set_pte(ptep, pte); + BUG_ON(val); } - -static __init void xen_pagetable_setup_start(pgd_t *base) +#endif +static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { - pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + int ret; - /* special set_pte for pagetable initialization */ - pv_mmu_ops.set_pte = xen_set_pte_init; + ret = 0; - init_mm.pgd = base; - /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. - */ - memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); + switch (msr) { +#ifdef CONFIG_X86_64 + unsigned which; + u64 base; - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + case MSR_FS_BASE: which = SEGBASE_FS; goto set; + case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; + case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); + set: + base = ((u64)high << 32) | low; + if (HYPERVISOR_set_segment_base(which, base) != 0) + ret = -EIO; + break; +#endif - make_lowmem_page_readonly(pmd); + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } - } + case MSR_IA32_CR_PAT: + if (smp_processor_id() == 0) + xen_set_pat(((u64)high << 32) | low); + break; - /* make sure zero_page is mapped RO so we can use it in pagetables */ - make_lowmem_page_readonly(empty_zero_page); - make_lowmem_page_readonly(base); - /* - * Switch to new pagetable. This is done before - * pagetable_init has done anything so that the new pages - * added to the table can be prepared properly for Xen. - */ - xen_write_cr3(__pa(base)); + default: + ret = native_write_msr_safe(msr, low, high); + } - /* Unpin initial Xen pagetable */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(xen_start_info->pt_base))); + return ret; } -static __init void setup_shared_info(void) +void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - /* - * Create a mapping for the shared info page. - * Should be set_fixmap(), but shared_info is a machine - * address with no corresponding pseudo-phys address. - */ - set_pte_mfn(addr, - PFN_DOWN(xen_start_info->shared_info), - PAGE_KERNEL); + set_fixmap(FIX_PARAVIRT_BOOTMAP, + xen_start_info->shared_info); - HYPERVISOR_shared_info = (struct shared_info *)addr; + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); } else HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); @@ -842,37 +1130,12 @@ static __init void setup_shared_info(void) /* In UP this is as good a place as any to set up shared info */ xen_setup_vcpu_info_placement(); #endif -} - -static __init void xen_pagetable_setup_done(pgd_t *base) -{ - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pt = xen_alloc_pt; - pv_mmu_ops.alloc_pd = xen_alloc_pd; - pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pd; - pv_mmu_ops.set_pte = xen_set_pte; - - setup_shared_info(); - - /* Actually pin the pagetable down, but we can't set PG_pinned - yet because the page structures don't exist yet. */ - { - unsigned level; - -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); - } + xen_setup_mfn_list_list(); } -/* This is called once we have the cpu_possible_map */ -void __init xen_setup_vcpu_info_placement(void) +/* This is called once we have the cpu_possible_mask */ +void xen_setup_vcpu_info_placement(void) { int cpu; @@ -880,16 +1143,14 @@ void __init xen_setup_vcpu_info_placement(void) xen_vcpu_setup(cpu); /* xen_vcpu_setup managed to place the vcpu_info within the - percpu area for all cpus, so make use of it */ - if (have_vcpu_info_placement) { - printk(KERN_INFO "Xen: using vcpu_info placement\n"); - - pv_irq_ops.save_fl = xen_save_fl_direct; - pv_irq_ops.restore_fl = xen_restore_fl_direct; - pv_irq_ops.irq_disable = xen_irq_disable_direct; - pv_irq_ops.irq_enable = xen_irq_enable_direct; + * percpu area for all cpus, so make use of it. Note that for + * PVH we want to use native IRQ mechanism. */ + if (have_vcpu_info_placement && !xen_pvh_domain()) { + pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); + pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); + pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); + pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); pv_mmu_ops.read_cr2 = xen_read_cr2_direct; - pv_cpu_ops.iret = xen_iret_direct; } } @@ -946,63 +1207,70 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static const struct pv_info xen_info __initdata = { +static const struct pv_info xen_info __initconst = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = FLAT_USER_CS64, +#endif + .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initdata = { +static const struct pv_init_ops xen_init_ops __initconst = { .patch = xen_patch, - - .banner = xen_banner, - .memory_setup = xen_memory_setup, - .arch_setup = xen_arch_setup, - .post_allocator_init = xen_mark_init_mm_pinned, -}; - -static const struct pv_time_ops xen_time_ops __initdata = { - .time_init = xen_time_init, - - .set_wallclock = xen_set_wallclock, - .get_wallclock = xen_get_wallclock, - .get_cpu_khz = xen_cpu_khz, - .sched_clock = xen_sched_clock, }; -static const struct pv_cpu_ops xen_cpu_ops __initdata = { +static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, .set_debugreg = xen_set_debugreg, .get_debugreg = xen_get_debugreg, - .clts = native_clts, + .clts = xen_clts, - .read_cr0 = native_read_cr0, - .write_cr0 = native_write_cr0, + .read_cr0 = xen_read_cr0, + .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = xen_write_cr4, +#ifdef CONFIG_X86_64 + .read_cr8 = xen_read_cr8, + .write_cr8 = xen_write_cr8, +#endif + .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .write_msr = xen_write_msr_safe, + .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .iret = (void *)&hypercall_page[__HYPERVISOR_iret], - .irq_enable_syscall_ret = NULL, /* never called */ + .read_tscp = native_read_tscp, + + .iret = xen_iret, + .irq_enable_sysexit = xen_sysexit, +#ifdef CONFIG_X86_64 + .usergs_sysret32 = xen_sysret32, + .usergs_sysret64 = xen_sysret64, +#endif .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, .load_gdt = xen_load_gdt, .load_idt = xen_load_idt, .load_tls = xen_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = xen_load_gs_index, +#endif + + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, - .store_gdt = native_store_gdt, .store_idt = native_store_idt, .store_tr = xen_store_tr, @@ -1014,112 +1282,24 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, - .lazy_mode = { - .enter = paravirt_enter_lazy_cpu, - .leave = xen_leave_lazy, - }, -}; + /* Xen takes care of %gs when switching to usermode for us */ + .swapgs = paravirt_nop, -static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, }; -static const struct pv_apic_ops xen_apic_ops __initdata = { +static const struct pv_apic_ops xen_apic_ops __initconst = { #ifdef CONFIG_X86_LOCAL_APIC - .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, - .apic_read = xen_apic_read, - .setup_boot_clock = paravirt_nop, - .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, #endif }; -static const struct pv_mmu_ops xen_mmu_ops __initdata = { - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3, - - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_single = xen_flush_tlb_single, - .flush_tlb_others = xen_flush_tlb_others, - - .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, - - .alloc_pt = xen_alloc_pt_init, - .release_pt = xen_release_pt_init, - .alloc_pd = xen_alloc_pt_init, - .alloc_pd_clone = paravirt_nop, - .release_pd = xen_release_pt_init, - -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - - .set_pte = NULL, /* see xen_pagetable_setup_* */ - .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd, - - .pte_val = xen_pte_val, - .pgd_val = xen_pgd_val, - - .make_pte = xen_make_pte, - .make_pgd = xen_make_pgd, - -#ifdef CONFIG_X86_PAE - .set_pte_atomic = xen_set_pte_atomic, - .set_pte_present = xen_set_pte_at, - .set_pud = xen_set_pud, - .pte_clear = xen_pte_clear, - .pmd_clear = xen_pmd_clear, - - .make_pmd = xen_make_pmd, - .pmd_val = xen_pmd_val, -#endif /* PAE */ - - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, - - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy, - }, -}; - -#ifdef CONFIG_SMP -static const struct smp_ops xen_smp_ops __initdata = { - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, - .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, - .smp_cpus_done = xen_smp_cpus_done, - - .smp_send_stop = xen_smp_send_stop, - .smp_send_reschedule = xen_smp_send_reschedule, - .smp_call_function_mask = xen_smp_call_function_mask, -}; -#endif /* CONFIG_SMP */ - static void xen_reboot(int reason) { -#ifdef CONFIG_SMP - smp_send_stop(); -#endif + struct sched_shutdown r = { .reason = reason }; - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } @@ -1138,96 +1318,539 @@ static void xen_machine_halt(void) xen_reboot(SHUTDOWN_poweroff); } +static void xen_machine_power_off(void) +{ + if (pm_power_off) + pm_power_off(); + xen_reboot(SHUTDOWN_poweroff); +} + static void xen_crash_shutdown(struct pt_regs *regs) { xen_reboot(SHUTDOWN_crash); } -static const struct machine_ops __initdata xen_machine_ops = { +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + xen_reboot(SHUTDOWN_crash); + return NOTIFY_DONE; +} + +static struct notifier_block xen_panic_block = { + .notifier_call= xen_panic_event, + .priority = INT_MIN +}; + +int xen_panic_handler_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + return 0; +} + +static const struct machine_ops xen_machine_ops __initconst = { .restart = xen_restart, .halt = xen_machine_halt, - .power_off = xen_machine_halt, + .power_off = xen_machine_power_off, .shutdown = xen_machine_halt, .crash_shutdown = xen_crash_shutdown, .emergency_restart = xen_emergency_restart, }; +static void __init xen_boot_params_init_edd(void) +{ +#if IS_ENABLED(CONFIG_EDD) + struct xen_platform_op op; + struct edd_info *edd_info; + u32 *mbr_signature; + unsigned nr; + int ret; + + edd_info = boot_params.eddbuf; + mbr_signature = boot_params.edd_mbr_sig_buffer; + + op.cmd = XENPF_firmware_info; + + op.u.firmware_info.type = XEN_FW_DISK_INFO; + for (nr = 0; nr < EDDMAXNR; nr++) { + struct edd_info *info = edd_info + nr; + + op.u.firmware_info.index = nr; + info->params.length = sizeof(info->params); + set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, + &info->params); + ret = HYPERVISOR_dom0_op(&op); + if (ret) + break; + +#define C(x) info->x = op.u.firmware_info.u.disk_info.x + C(device); + C(version); + C(interface_support); + C(legacy_max_cylinder); + C(legacy_max_head); + C(legacy_sectors_per_track); +#undef C + } + boot_params.eddbuf_entries = nr; + + op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; + for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { + op.u.firmware_info.index = nr; + ret = HYPERVISOR_dom0_op(&op); + if (ret) + break; + mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; + } + boot_params.edd_mbr_sig_buf_entries = nr; +#endif +} -static void __init xen_reserve_top(void) +/* + * Set up the GDT and segment registers for -fstack-protector. Until + * we do this, we have to be careful not to call any stack-protected + * function, which is most of the kernel. + * + * Note, that it is __ref because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions. + */ +static void __ref xen_setup_gdt(int cpu) { - unsigned long top = HYPERVISOR_VIRT_START; - struct xen_platform_parameters pp; + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_X86_64 + unsigned long dummy; + + load_percpu_segment(cpu); /* We need to access per-cpu area */ + switch_to_new_gdt(cpu); /* GDT and GS set */ + + /* We are switching of the Xen provided GDT to our HVM mode + * GDT. The new GDT has __KERNEL_CS with CS.L = 1 + * and we are jumping to reload it. + */ + asm volatile ("pushq %0\n" + "leaq 1f(%%rip),%0\n" + "pushq %0\n" + "lretq\n" + "1:\n" + : "=&r" (dummy) : "0" (__KERNEL_CS)); - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) - top = pp.virt_start; + /* + * While not needed, we also set the %es, %ds, and %fs + * to zero. We don't care about %ss as it is NULL. + * Strictly speaking this is not needed as Xen zeros those + * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) + * + * Linux zeros them in cpu_init() and in secondary_startup_64 + * (for BSP). + */ + loadsegment(es, 0); + loadsegment(ds, 0); + loadsegment(fs, 0); +#else + /* PVH: TODO Implement. */ + BUG(); +#endif + return; /* PVH does not need any PV GDT ops. */ + } + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; + pv_cpu_ops.load_gdt = xen_load_gdt_boot; - reserve_top_address(-top + 2 * PAGE_SIZE); + setup_stack_canary_segment(0); + switch_to_new_gdt(0); + + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; + pv_cpu_ops.load_gdt = xen_load_gdt; +} + +/* + * A PV guest starts with default flags that are not set for PVH, set them + * here asap. + */ +static void xen_pvh_set_cr_flags(int cpu) +{ + + /* Some of these are setup in 'secondary_startup_64'. The others: + * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests + * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ + write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); + + if (!cpu) + return; + /* + * For BSP, PSE PGE are set in probe_page_size_mask(), for APs + * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. + */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + if (cpu_has_pge) + set_in_cr4(X86_CR4_PGE); +} + +/* + * Note, that it is ref - because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions. + */ +void __ref xen_pvh_secondary_vcpu_init(int cpu) +{ + xen_setup_gdt(cpu); + xen_pvh_set_cr_flags(cpu); +} + +static void __init xen_pvh_early_guest_init(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (!xen_feature(XENFEAT_hvm_callback_vector)) + return; + + xen_have_vector_callback = 1; + xen_pvh_set_cr_flags(0); + +#ifdef CONFIG_X86_32 + BUG(); /* PVH: Implement proper support. */ +#endif } /* First C function to be called on Xen boot */ -asmlinkage void __init xen_start_kernel(void) +asmlinkage __visible void __init xen_start_kernel(void) { - pgd_t *pgd; + struct physdev_set_iopl set_iopl; + int rc; if (!xen_start_info) return; - BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); + xen_domain_type = XEN_PV_DOMAIN; + + xen_setup_features(); + xen_pvh_early_guest_init(); + xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_time_ops = xen_time_ops; - pv_cpu_ops = xen_cpu_ops; - pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; - pv_mmu_ops = xen_mmu_ops; + if (!xen_pvh_domain()) + pv_cpu_ops = xen_cpu_ops; - machine_ops = xen_machine_ops; + if (xen_feature(XENFEAT_auto_translated_physmap)) + x86_init.resources.memory_setup = xen_auto_xlated_memory_setup; + else + x86_init.resources.memory_setup = xen_memory_setup; + x86_init.oem.arch_setup = xen_arch_setup; + x86_init.oem.banner = xen_banner; -#ifdef CONFIG_SMP - smp_ops = xen_smp_ops; + xen_init_time_ops(); + + /* + * Set up some pagetable state before starting to set any ptes. + */ + + xen_init_mmu_ops(); + + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; +#if 0 + if (!xen_initial_domain()) #endif + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - xen_setup_features(); + __supported_pte_mask |= _PAGE_IOMAP; + + /* + * Prevent page tables from being allocated in highmem, even + * if CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + + /* Work out if we support NX */ + x86_configure_nx(); /* Get mfn list */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + xen_build_dynamic_phys_to_machine(); - pgd = (pgd_t *)xen_start_info->pt_base; + /* + * Set up kernel GDT and segment registers, mainly so that + * -fstack-protector code can be executed. + */ + xen_setup_gdt(0); - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + xen_init_irq_ops(); + xen_init_cpuid_mask(); - init_mm.pgd = pgd; /* use the Xen pagetables to start */ +#ifdef CONFIG_X86_LOCAL_APIC + /* + * set up the basic apic ops. + */ + set_xen_basic_apic_ops(); +#endif - /* keep using Xen gdt for now; no urgent need to change it */ + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { + pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; + pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; + } - x86_write_percpu(xen_cr3, __pa(pgd)); - x86_write_percpu(xen_current_cr3, __pa(pgd)); + machine_ops = xen_machine_ops; + /* + * The only reliable way to retain the initial address of the + * percpu gdt_page is to remember it here, so we can go and + * mark it RW later, when the initial percpu area is freed. + */ + xen_initial_gdt = &per_cpu(gdt_page, 0); + + xen_smp_init(); + +#ifdef CONFIG_ACPI_NUMA + /* + * The pages we from Xen are not related to machine pages, so + * any NUMA information the kernel tries to get from ACPI will + * be meaningless. Prevent it from trying. + */ + acpi_numa = -1; +#endif +#ifdef CONFIG_X86_PAT + /* + * For right now disable the PAT. We should remove this once + * git commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1 + * (xen/pat: Disable PAT support for now) is reverted. + */ + pat_enabled = 0; +#endif /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + local_irq_disable(); + early_boot_irqs_disabled = true; + + xen_raw_console_write("mapping kernel into physical memory\n"); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); + + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + + /* keep using Xen gdt for now; no urgent need to change it */ + +#ifdef CONFIG_X86_32 pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; - +#else + pv_info.kernel_rpl = 0; +#endif /* set the limit of our address space */ xen_reserve_top(); + /* PVH: runs at default kernel iopl of 0 */ + if (!xen_pvh_domain()) { + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); + } + +#ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); - new_cpu_data.hard_math = 1; + set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); + new_cpu_data.wp_works_ok = 1; new_cpu_data.x86_capability[0] = cpuid_edx(1); +#endif /* Poke various useful things into boot_params */ boot_params.hdr.type_of_loader = (9 << 4) | 0; boot_params.hdr.ramdisk_image = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; + boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + + if (!xen_initial_domain()) { + add_preferred_console("xenboot", 0, NULL); + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); + if (pci_xen) + x86_init.pci.arch_init = pci_xen_init; + } else { + const struct dom0_vga_console_info *info = + (void *)((char *)xen_start_info + + xen_start_info->console.dom0.info_off); + struct xen_platform_op op = { + .cmd = XENPF_firmware_info, + .interface_version = XENPF_INTERFACE_VERSION, + .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, + }; + + xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_start_info->console.domU.mfn = 0; + xen_start_info->console.domU.evtchn = 0; + + if (HYPERVISOR_dom0_op(&op) == 0) + boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; + + xen_init_apic(); + + /* Make sure ACS will be enabled */ + pci_request_acs(); + + xen_acpi_sleep_register(); + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + + xen_boot_params_init_edd(); + } +#ifdef CONFIG_PCI + /* PCI BIOS service won't work from a PV guest. */ + pci_probe &= ~PCI_PROBE_BIOS; +#endif + xen_raw_console_write("about to get started...\n"); + + xen_setup_runstate_info(0); /* Start the world */ - start_kernel(); +#ifdef CONFIG_X86_32 + i386_start_kernel(); +#else + x86_64_start_reservations((char *)__pa_symbol(&boot_params)); +#endif +} + +void __ref xen_hvm_init_shared_info(void) +{ + int cpu; + struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page = 0; + + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; + + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. We don't need the vcpu_info placement + * optimizations because we don't use any pv_mmu or pv_irq op on + * HVM. + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and + * in that case multiple vcpus might be online. */ + for_each_online_cpu(cpu) { + /* Leave it to be NULL. */ + if (cpu >= MAX_VIRT_CPUS) + continue; + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + } +} + +#ifdef CONFIG_XEN_PVHVM +static void __init init_hvm_pv_info(void) +{ + int major, minor; + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + major = eax >> 16; + minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info.name = "Xen HVM"; + + xen_domain_type = XEN_HVM_DOMAIN; } + +static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + xen_vcpu_setup(cpu); + if (xen_have_vector_callback) { + if (xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_timer(cpu); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block xen_hvm_cpu_notifier = { + .notifier_call = xen_hvm_cpu_notify, +}; + +static void __init xen_hvm_guest_init(void) +{ + init_hvm_pv_info(); + + xen_hvm_init_shared_info(); + + xen_panic_handler_init(); + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + xen_hvm_smp_init(); + register_cpu_notifier(&xen_hvm_cpu_notifier); + xen_unplug_emulated_devices(); + x86_init.irqs.intr_init = xen_init_IRQ; + xen_hvm_init_time_ops(); + xen_hvm_init_mmu_ops(); +} + +static uint32_t __init xen_hvm_platform(void) +{ + if (xen_pv_domain()) + return 0; + + return xen_cpuid_base(); +} + +bool xen_hvm_need_lapic(void) +{ + if (xen_pv_domain()) + return false; + if (!xen_hvm_domain()) + return false; + if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) + return false; + return true; +} +EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); + +const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { + .name = "Xen HVM", + .detect = xen_hvm_platform, + .init_platform = xen_hvm_guest_init, + .x2apic_available = xen_x2apic_para_available, +}; +EXPORT_SYMBOL(x86_hyper_xen_hvm); +#endif diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c deleted file mode 100644 index dcf613e1758..00000000000 --- a/arch/x86/xen/events.c +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Xen event channels - * - * Xen models interrupts with abstract event channels. Because each - * domain gets 1024 event channels, but NR_IRQ is not that large, we - * must dynamically map irqs<->event channels. The event channels - * interface with the rest of the kernel by defining a xen interrupt - * chip. When an event is recieved, it is mapped to an irq and sent - * through the normal interrupt processing path. - * - * There are four kinds of events which can be mapped to an event - * channel: - * - * 1. Inter-domain notifications. This includes all the virtual - * device events, since they're driven by front-ends in another domain - * (typically dom0). - * 2. VIRQs, typically used for timers. These are per-cpu events. - * 3. IPIs. - * 4. Hardware interrupts. Not supported at present. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - -#include <linux/linkage.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/module.h> -#include <linux/string.h> - -#include <asm/ptrace.h> -#include <asm/irq.h> -#include <asm/sync_bitops.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/event_channel.h> - -#include "xen-ops.h" - -/* - * This lock protects updates to the following mapping and reference-count - * arrays. The lock does not need to be acquired to read the mapping tables. - */ -static DEFINE_SPINLOCK(irq_mapping_update_lock); - -/* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; - -/* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; - -/* Packed IRQ information: binding type, sub-type index, and event channel. */ -struct packed_irq -{ - unsigned short evtchn; - unsigned char index; - unsigned char type; -}; - -static struct packed_irq irq_info[NR_IRQS]; - -/* Binding types. */ -enum { - IRQT_UNBOUND, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* Convenient shorthand for packed representation of an unbound IRQ. */ -#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) - -static int evtchn_to_irq[NR_EVENT_CHANNELS] = { - [0 ... NR_EVENT_CHANNELS-1] = -1 -}; -static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; -static u8 cpu_evtchn[NR_EVENT_CHANNELS]; - -/* Reference counts for bindings to IRQs. */ -static int irq_bindcount[NR_IRQS]; - -/* Xen will never allocate port zero for any purpose. */ -#define VALID_EVTCHN(chn) ((chn) != 0) - -/* - * Force a proper event-channel callback from Xen after clearing the - * callback mask. We do this in a very simple manner, by making a call - * down into Xen. The pending flag will be checked by Xen on return. - */ -void force_evtchn_callback(void) -{ - (void)HYPERVISOR_xen_version(0, NULL); -} -EXPORT_SYMBOL_GPL(force_evtchn_callback); - -static struct irq_chip xen_dynamic_chip; - -/* Constructor for packed IRQ information. */ -static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) -{ - return (struct packed_irq) { evtchn, index, type }; -} - -/* - * Accessors for packed IRQ information. - */ -static inline unsigned int evtchn_from_irq(int irq) -{ - return irq_info[irq].evtchn; -} - -static inline unsigned int index_from_irq(int irq) -{ - return irq_info[irq].index; -} - -static inline unsigned int type_from_irq(int irq) -{ - return irq_info[irq].type; -} - -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return (sh->evtchn_pending[idx] & - cpu_evtchn_mask[cpu][idx] & - ~sh->evtchn_mask[idx]); -} - -static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) -{ - int irq = evtchn_to_irq[chn]; - - BUG_ON(irq == -1); -#ifdef CONFIG_SMP - irq_desc[irq].affinity = cpumask_of_cpu(cpu); -#endif - - __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); - __set_bit(chn, cpu_evtchn_mask[cpu]); - - cpu_evtchn[chn] = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ -#ifdef CONFIG_SMP - int i; - /* By default all event channels notify CPU#0. */ - for (i = 0; i < NR_IRQS; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); -#endif - - memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); - memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); -} - -static inline unsigned int cpu_from_evtchn(unsigned int evtchn) -{ - return cpu_evtchn[evtchn]; -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); -} - -static inline void set_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} - - -/** - * notify_remote_via_irq - send event to remote end of event channel via irq - * @irq: irq of event channel to send event to - * - * Unlike notify_remote_via_evtchn(), this is safe to use across - * save/restore. Notifications on a broken connection are silently - * dropped. - */ -void notify_remote_via_irq(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - notify_remote_via_evtchn(evtchn); -} -EXPORT_SYMBOL_GPL(notify_remote_via_irq); - -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - - BUG_ON(!irqs_disabled()); - - /* Slow path (hypercall) if this is a non-local port. */ - if (unlikely(cpu != cpu_from_evtchn(port))) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (sync_test_bit(port, &s->evtchn_pending[0]) && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - -static int find_unbound_irq(void) -{ - int irq; - - /* Only allocate from dynirq range */ - for (irq = 0; irq < NR_IRQS; irq++) - if (irq_bindcount[irq] == 0) - break; - - if (irq == NR_IRQS) - panic("No available IRQ to bind to: increase NR_IRQS!\n"); - - return irq; -} - -int bind_evtchn_to_irq(unsigned int evtchn) -{ - int irq; - - spin_lock(&irq_mapping_update_lock); - - irq = evtchn_to_irq[evtchn]; - - if (irq == -1) { - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "event"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); - -static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) -{ - struct evtchn_bind_ipi bind_ipi; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = find_unbound_irq(); - if (irq < 0) - goto out; - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "ipi"); - - bind_ipi.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, - &bind_ipi) != 0) - BUG(); - evtchn = bind_ipi.port; - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); - - per_cpu(ipi_to_irq, cpu)[ipi] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - out: - spin_unlock(&irq_mapping_update_lock); - return irq; -} - - -static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) -{ - struct evtchn_bind_virq bind_virq; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(virq_to_irq, cpu)[virq]; - - if (irq == -1) { - bind_virq.virq = virq; - bind_virq.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0) - BUG(); - evtchn = bind_virq.port; - - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "virq"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); - - per_cpu(virq_to_irq, cpu)[virq] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} - -static void unbind_from_irq(unsigned int irq) -{ - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - - spin_lock(&irq_mapping_update_lock); - - if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [index_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - irq_info[irq] = IRQ_UNBOUND; - - dynamic_irq_init(irq); - } - - spin_unlock(&irq_mapping_update_lock); -} - -int bind_evtchn_to_irqhandler(unsigned int evtchn, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_evtchn_to_irq(evtchn); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); - -int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_virq_to_irq(virq, cpu); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); - -int bind_ipi_to_irqhandler(enum ipi_vector ipi, - unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) -{ - int irq, retval; - - irq = bind_ipi_to_irq(ipi, cpu); - if (irq < 0) - return irq; - - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} - -void unbind_from_irqhandler(unsigned int irq, void *dev_id) -{ - free_irq(irq, dev_id); - unbind_from_irq(irq); -} -EXPORT_SYMBOL_GPL(unbind_from_irqhandler); - -void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) -{ - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ -void xen_evtchn_do_upcall(struct pt_regs *regs) -{ - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - unsigned long pending_words; - - vcpu_info->evtchn_upcall_pending = 0; - - /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { - unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); - - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; - - if (irq != -1) { - regs->orig_ax = ~irq; - do_IRQ(regs); - } - } - } - - put_cpu(); -} - -/* Rebind an evtchn so that it gets delivered to a specific cpu */ -static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) -{ - struct evtchn_bind_vcpu bind_vcpu; - int evtchn = evtchn_from_irq(irq); - - if (!VALID_EVTCHN(evtchn)) - return; - - /* Send future instances of this interrupt to other vcpu. */ - bind_vcpu.port = evtchn; - bind_vcpu.vcpu = tcpu; - - /* - * If this fails, it usually just indicates that we're dealing with a - * virq or IPI channel, which don't actually need to be rebound. Ignore - * it, but don't do the xenlinux-level rebind in that case. - */ - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); -} - - -static void set_affinity_irq(unsigned irq, cpumask_t dest) -{ - unsigned tcpu = first_cpu(dest); - rebind_irq_to_cpu(irq, tcpu); -} - -static void enable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); -} - -static void disable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); -} - -static void ack_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - move_native_irq(irq); - - if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); -} - -static int retrigger_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - int ret = 0; - - if (VALID_EVTCHN(evtchn)) { - set_evtchn(evtchn); - ret = 1; - } - - return ret; -} - -static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", - .mask = disable_dynirq, - .unmask = enable_dynirq, - .ack = ack_dynirq, - .set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, -}; - -void __init xen_init_IRQ(void) -{ - int i; - - init_evtchn_cpu_bindings(); - - /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); - - /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ - for (i = 0; i < NR_IRQS; i++) - irq_bindcount[i] = 0; - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c deleted file mode 100644 index 0707714e40d..00000000000 --- a/arch/x86/xen/features.c +++ /dev/null @@ -1,29 +0,0 @@ -/****************************************************************************** - * features.c - * - * Xen feature flags. - * - * Copyright (c) 2006, Ian Campbell, XenSource Inc. - */ -#include <linux/types.h> -#include <linux/cache.h> -#include <linux/module.h> -#include <asm/xen/hypervisor.h> -#include <xen/features.h> - -u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; -EXPORT_SYMBOL_GPL(xen_features); - -void xen_setup_features(void) -{ - struct xen_feature_info fi; - int i, j; - - for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { - fi.submap_idx = i; - if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) - break; - for (j = 0; j < 32; j++) - xen_features[i * 32 + j] = !!(fi.submap & 1<<j); - } -} diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c new file mode 100644 index 00000000000..ebfa9b2c871 --- /dev/null +++ b/arch/x86/xen/grant-table.c @@ -0,0 +1,225 @@ +/****************************************************************************** + * grant_table.c + * x86 specific part + * + * Granting foreign access to our memory reservation. + * + * Copyright (c) 2005-2006, Christopher Clark + * Copyright (c) 2004-2005, K A Fraser + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan. Split out x86 specific part. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <xen/interface/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> +#include <xen/xen.h> + +#include <asm/pgtable.h> + +static struct gnttab_vm_area { + struct vm_struct *area; + pte_t **ptes; +} gnttab_shared_vm_area, gnttab_status_vm_area; + +int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + void **__shared) +{ + void *shared = *__shared; + unsigned long addr; + unsigned long i; + + if (shared == NULL) + *__shared = shared = gnttab_shared_vm_area.area->addr; + + addr = (unsigned long)shared; + + for (i = 0; i < nr_gframes; i++) { + set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i], + mfn_pte(frames[i], PAGE_KERNEL)); + addr += PAGE_SIZE; + } + + return 0; +} + +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared) +{ + grant_status_t *shared = *__shared; + unsigned long addr; + unsigned long i; + + if (shared == NULL) + *__shared = shared = gnttab_status_vm_area.area->addr; + + addr = (unsigned long)shared; + + for (i = 0; i < nr_gframes; i++) { + set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i], + mfn_pte(frames[i], PAGE_KERNEL)); + addr += PAGE_SIZE; + } + + return 0; +} + +void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) +{ + pte_t **ptes; + unsigned long addr; + unsigned long i; + + if (shared == gnttab_status_vm_area.area->addr) + ptes = gnttab_status_vm_area.ptes; + else + ptes = gnttab_shared_vm_area.ptes; + + addr = (unsigned long)shared; + + for (i = 0; i < nr_gframes; i++) { + set_pte_at(&init_mm, addr, ptes[i], __pte(0)); + addr += PAGE_SIZE; + } +} + +static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) +{ + area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL); + if (area->ptes == NULL) + return -ENOMEM; + + area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes); + if (area->area == NULL) { + kfree(area->ptes); + return -ENOMEM; + } + + return 0; +} + +static void arch_gnttab_vfree(struct gnttab_vm_area *area) +{ + free_vm_area(area->area); + kfree(area->ptes); +} + +int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) +{ + int ret; + + if (!xen_pv_domain()) + return 0; + + ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); + if (ret < 0) + return ret; + + /* + * Always allocate the space for the status frames in case + * we're migrated to a host with V2 support. + */ + ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status); + if (ret < 0) + goto err; + + return 0; + err: + arch_gnttab_vfree(&gnttab_shared_vm_area); + return -ENOMEM; +} + +#ifdef CONFIG_XEN_PVH +#include <xen/balloon.h> +#include <xen/events.h> +#include <linux/slab.h> +static int __init xlated_setup_gnttab_pages(void) +{ + struct page **pages; + xen_pfn_t *pfns; + int rc; + unsigned int i; + unsigned long nr_grant_frames = gnttab_max_grant_frames(); + + BUG_ON(nr_grant_frames == 0); + pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL); + if (!pfns) { + kfree(pages); + return -ENOMEM; + } + rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + if (rc) { + pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, + nr_grant_frames, rc); + kfree(pages); + kfree(pfns); + return rc; + } + for (i = 0; i < nr_grant_frames; i++) + pfns[i] = page_to_pfn(pages[i]); + + rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames, + &xen_auto_xlat_grant_frames.vaddr); + + if (rc) { + pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__, + nr_grant_frames, rc); + free_xenballooned_pages(nr_grant_frames, pages); + kfree(pages); + kfree(pfns); + return rc; + } + kfree(pages); + + xen_auto_xlat_grant_frames.pfn = pfns; + xen_auto_xlat_grant_frames.count = nr_grant_frames; + + return 0; +} + +static int __init xen_pvh_gnttab_setup(void) +{ + if (!xen_pvh_domain()) + return -ENODEV; + + return xlated_setup_gnttab_pages(); +} +/* Call it _before_ __gnttab_init as we need to initialize the + * xen_auto_xlat_grant_frames first. */ +core_initcall(xen_pvh_gnttab_setup); +#endif diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c new file mode 100644 index 00000000000..a1207cb6472 --- /dev/null +++ b/arch/x86/xen/irq.c @@ -0,0 +1,136 @@ +#include <linux/hardirq.h> + +#include <asm/x86_init.h> + +#include <xen/interface/xen.h> +#include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> +#include <xen/features.h> +#include <xen/events.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include "xen-ops.h" + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void xen_force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} + +asmlinkage __visible unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = this_cpu_read(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} +PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); + +__visible void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* See xen_irq_enable() for why preemption must be disabled. */ + preempt_disable(); + vcpu = this_cpu_read(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + + if (flags == 0) { + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); + preempt_enable(); + } else + preempt_enable_no_resched(); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); + +asmlinkage __visible void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); + +asmlinkage __visible void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* + * We may be preempted as soon as vcpu->evtchn_upcall_mask is + * cleared, so disable preemption to ensure we check for + * events on the VCPU we are still running on. + */ + preempt_disable(); + + vcpu = this_cpu_read(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); + + preempt_enable(); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). */ + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static const struct pv_irq_ops xen_irq_ops __initconst = { + .save_fl = PV_CALLEE_SAVE(xen_save_fl), + .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), + .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), + .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), + + .safe_halt = xen_safe_halt, + .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = xen_adjust_exception_frame, +#endif +}; + +void __init xen_init_irq_ops(void) +{ + /* For PVH we use default pv_irq_ops settings. */ + if (!xen_feature(XENFEAT_hvm_callback_vector)) + pv_irq_ops = xen_irq_ops; + x86_init.irqs.intr_init = xen_init_IRQ; +} diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c deleted file mode 100644 index aa7af9e6abc..00000000000 --- a/arch/x86/xen/manage.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Handle extern requests for shutdown, reboot and sysrq - */ -#include <linux/kernel.h> -#include <linux/err.h> -#include <linux/reboot.h> -#include <linux/sysrq.h> - -#include <xen/xenbus.h> - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_SUSPEND 2 -/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only - * report a crash, not be instructed to crash! - * HALT is the same as POWEROFF, as far as we're concerned. The tools use - * the distinction when we return the reason code to them. - */ -#define SHUTDOWN_HALT 4 - -/* Ignore multiple shutdown requests. */ -static int shutting_down = SHUTDOWN_INVALID; - -static void shutdown_handler(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - char *str; - struct xenbus_transaction xbt; - int err; - - if (shutting_down != SHUTDOWN_INVALID) - return; - - again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - - str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); - /* Ignore read errors and empty reads. */ - if (XENBUS_IS_ERR_READ(str)) { - xenbus_transaction_end(xbt, 1); - return; - } - - xenbus_write(xbt, "control", "shutdown", ""); - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) { - kfree(str); - goto again; - } - - if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) - orderly_poweroff(false); - else if (strcmp(str, "reboot") == 0) - ctrl_alt_del(); - else { - printk(KERN_INFO "Ignoring shutdown request: %s\n", str); - shutting_down = SHUTDOWN_INVALID; - } - - kfree(str); -} - -static void sysrq_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - char sysrq_key = '\0'; - struct xenbus_transaction xbt; - int err; - - again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - printk(KERN_ERR "Unable to read sysrq code in " - "control/sysrq\n"); - xenbus_transaction_end(xbt, 1); - return; - } - - if (sysrq_key != '\0') - xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) - goto again; - - if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL); -} - -static struct xenbus_watch shutdown_watch = { - .node = "control/shutdown", - .callback = shutdown_handler -}; - -static struct xenbus_watch sysrq_watch = { - .node = "control/sysrq", - .callback = sysrq_handler -}; - -static int setup_shutdown_watcher(void) -{ - int err; - - err = register_xenbus_watch(&shutdown_watch); - if (err) { - printk(KERN_ERR "Failed to set shutdown watcher\n"); - return err; - } - - err = register_xenbus_watch(&sysrq_watch); - if (err) { - printk(KERN_ERR "Failed to set sysrq watcher\n"); - return err; - } - - return 0; -} - -static int shutdown_event(struct notifier_block *notifier, - unsigned long event, - void *data) -{ - setup_shutdown_watcher(); - return NOTIFY_DONE; -} - -static int __init setup_shutdown_event(void) -{ - static struct notifier_block xenstore_notifier = { - .notifier_call = shutdown_event - }; - register_xenstore_notifier(&xenstore_notifier); - - return 0; -} - -subsys_initcall(setup_shutdown_event); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2a054ef2a3d..e8a1201c329 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -40,32 +40,118 @@ */ #include <linux/sched.h> #include <linux/highmem.h> +#include <linux/debugfs.h> #include <linux/bug.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/gfp.h> +#include <linux/memblock.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> + +#include <trace/events/xen.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> +#include <asm/fixmap.h> #include <asm/mmu_context.h> +#include <asm/setup.h> #include <asm/paravirt.h> +#include <asm/e820.h> +#include <asm/linkage.h> +#include <asm/page.h> +#include <asm/init.h> +#include <asm/pat.h> +#include <asm/smp.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/interface/xen.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/version.h> +#include <xen/interface/memory.h> +#include <xen/hvc-console.h> #include "multicalls.h" #include "mmu.h" +#include "debugfs.h" + +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and balloon lists. + */ +DEFINE_SPINLOCK(xen_reservation_lock); + +#ifdef CONFIG_X86_32 +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); +#endif +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. + * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ -xmaddr_t arbitrary_virt_to_machine(unsigned long address) + +/* + * Just beyond the highest usermode address. STACK_TOP_MAX has a + * redzone above it, so round it up to a PGD boundary. + */ +#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) + +unsigned long arbitrary_virt_to_mfn(void *vaddr) { + xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); + + return PFN_DOWN(maddr.maddr); +} + +xmaddr_t arbitrary_virt_to_machine(void *vaddr) +{ + unsigned long address = (unsigned long)vaddr; unsigned int level; - pte_t *pte = lookup_address(address, &level); - unsigned offset = address & PAGE_MASK; + pte_t *pte; + unsigned offset; - BUG_ON(pte == NULL); + /* + * if the PFN is in the linear mapped vaddr range, we can just use + * the (quick) virt_to_machine() p2m lookup + */ + if (virt_addr_valid(vaddr)) + return virt_to_machine(vaddr); - return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); + /* otherwise we have to do a (slower) full page-table walk */ + + pte = lookup_address(address, &level); + BUG_ON(pte == NULL); + offset = address & ~PAGE_MASK; + return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); } +EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); void make_lowmem_page_readonly(void *vaddr) { @@ -74,7 +160,8 @@ void make_lowmem_page_readonly(void *vaddr) unsigned int level; pte = lookup_address(address, &level); - BUG_ON(pte == NULL); + if (pte == NULL) + return; /* vaddr missing */ ptev = pte_wrprotect(*pte); @@ -89,7 +176,8 @@ void make_lowmem_page_readwrite(void *vaddr) unsigned int level; pte = lookup_address(address, &level); - BUG_ON(pte == NULL); + if (pte == NULL) + return; /* vaddr missing */ ptev = pte_mkwrite(*pte); @@ -98,307 +186,625 @@ void make_lowmem_page_readwrite(void *vaddr) } -void xen_set_pmd(pmd_t *ptr, pmd_t val) +static bool xen_page_pinned(void *ptr) +{ + struct page *page = virt_to_page(ptr); + + return PagePinned(page); +} + +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) { struct multicall_space mcs; struct mmu_update *u; - preempt_disable(); + trace_xen_mmu_set_domain_pte(ptep, pteval, domid); mcs = xen_mc_entry(sizeof(*u)); u = mcs.args; - u->ptr = virt_to_machine(ptr).maddr; - u->val = pmd_val_ma(val); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + /* ptep might be kmapped when using 32-bit HIGHPTE */ + u->ptr = virt_to_machine(ptep).maddr; + u->val = pte_val_ma(pteval); + + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} +EXPORT_SYMBOL_GPL(xen_set_domain_pte); + +static void xen_extend_mmu_update(const struct mmu_update *update) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); + + if (mcs.mc != NULL) { + mcs.mc->args[1]++; + } else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } + + u = mcs.args; + *u = *update; +} + +static void xen_extend_mmuext_op(const struct mmuext_op *op) +{ + struct multicall_space mcs; + struct mmuext_op *u; + + mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); + + if (mcs.mc != NULL) { + mcs.mc->args[1]++; + } else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } + + u = mcs.args; + *u = *op; +} + +static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; + u.val = pmd_val_ma(val); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); } +static void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + trace_xen_mmu_set_pmd(ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pmd_hyper(ptr, val); +} + /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. */ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); +} - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; +static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) +{ + struct mmu_update u; + + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) + return false; + + xen_mc_batch(); + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + return true; +} + +static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) +{ + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. + */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); } - pte = pte_offset_kernel(pmd, vaddr); - /* <mfn,flags> stored as-is, to permit clearing entries */ - xen_set_pte(pte, mfn_pte(mfn, flags)); +} - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); +static void xen_set_pte(pte_t *ptep, pte_t pteval) +{ + trace_xen_mmu_set_pte(ptep, pteval); + __xen_set_pte(ptep, pteval); } -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, +static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - if (mm == current->mm || mm == &init_mm) { - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - struct multicall_space mcs; - mcs = xen_mc_entry(0); + trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); + __xen_set_pte(ptep, pteval); +} - MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); - xen_mc_issue(PARAVIRT_LAZY_MMU); - return; - } else - if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) - return; - } - xen_set_pte(ptep, pteval); +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + /* Just return the pte as-is. We preserve the bits on commit */ + trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); + return *ptep; } -#ifdef CONFIG_X86_PAE -void xen_set_pud(pud_t *ptr, pud_t val) +void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) { - struct multicall_space mcs; - struct mmu_update *u; + struct mmu_update u; - preempt_disable(); + trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); + xen_mc_batch(); - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - u->ptr = virt_to_machine(ptr).maddr; - u->val = pud_val_ma(val); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u.val = pte_val_ma(pte); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); +} - preempt_enable(); +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + unsigned long pfn = mfn_to_pfn(mfn); + + pteval_t flags = val & PTE_FLAGS_MASK; + if (unlikely(pfn == ~0)) + val = flags & ~_PAGE_PRESENT; + else + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; } -void xen_set_pte(pte_t *ptep, pte_t pte) +static pteval_t pte_pfn_to_mfn(pteval_t val) { - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; + unsigned long mfn; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + mfn = get_phys_to_machine(pfn); + else + mfn = pfn; + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } else { + /* + * Paramount to do this test _after_ the + * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & + * IDENTITY_FRAME_BIT resolves to true. + */ + mfn &= ~FOREIGN_FRAME_BIT; + if (mfn & IDENTITY_FRAME_BIT) { + mfn &= ~IDENTITY_FRAME_BIT; + flags |= _PAGE_IOMAP; + } + } + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; + } + + return val; } -void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +static pteval_t iomap_pte(pteval_t val) { - set_64bit((u64 *)ptep, pte_val_ma(pte)); + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; + + /* We assume the pte frame number is a MFN, so + just use it as-is. */ + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; } -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +__visible pteval_t xen_pte_val(pte_t pte) { - ptep->pte_low = 0; - smp_wmb(); /* make sure low gets written first */ - ptep->pte_high = 0; + pteval_t pteval = pte.pte; +#if 0 + /* If this is a WC pte, convert back from Xen WC to Linux WC */ + if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { + WARN_ON(!pat_enabled); + pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; + } +#endif + if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) + return pteval; + + return pte_mfn_to_pfn(pteval); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); -void xen_pmd_clear(pmd_t *pmdp) +__visible pgdval_t xen_pgd_val(pgd_t pgd) { - xen_set_pmd(pmdp, __pmd(0)); + return pte_mfn_to_pfn(pgd.pgd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); -unsigned long long xen_pte_val(pte_t pte) +/* + * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 + * are reserved for now, to correspond to the Intel-reserved PAT + * types. + * + * We expect Linux's PAT set as follows: + * + * Idx PTE flags Linux Xen Default + * 0 WB WB WB + * 1 PWT WC WT WT + * 2 PCD UC- UC- UC- + * 3 PCD PWT UC UC UC + * 4 PAT WB WC WB + * 5 PAT PWT WC WP WT + * 6 PAT PCD UC- rsv UC- + * 7 PAT PCD PWT UC rsv UC + */ + +void xen_set_pat(u64 pat) { - unsigned long long ret = 0; + /* We expect Linux to use a PAT setting of + * UC UC- WC WB (ignoring the PAT flag) */ + WARN_ON(pat != 0x0007010600070106ull); +} - if (pte.pte_low) { - ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; - ret = machine_to_phys(XMADDR(ret)).paddr | 1; +__visible pte_t xen_make_pte(pteval_t pte) +{ + phys_addr_t addr = (pte & PTE_PFN_MASK); +#if 0 + /* If Linux is trying to set a WC pte, then map to the Xen WC. + * If _PAGE_PAT is set, then it probably means it is really + * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope + * things work out OK... + * + * (We should never see kernel mappings with _PAGE_PSE set, + * but we could see hugetlbfs mappings, I think.). + */ + if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { + if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) + pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; + } +#endif + /* + * Unprivileged domains are allowed to do IOMAPpings for + * PCI passthrough, but not map ISA space. The ISA + * mappings are just dummy local mappings to keep other + * parts of the kernel happy. + */ + if (unlikely(pte & _PAGE_IOMAP) && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { + pte = iomap_pte(pte); + } else { + pte &= ~_PAGE_IOMAP; + pte = pte_pfn_to_mfn(pte); } - return ret; + return native_make_pte(pte); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); -unsigned long long xen_pmd_val(pmd_t pmd) +__visible pgd_t xen_make_pgd(pgdval_t pgd) { - unsigned long long ret = pmd.pmd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); -unsigned long long xen_pgd_val(pgd_t pgd) +__visible pmdval_t xen_pmd_val(pmd_t pmd) { - unsigned long long ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; + return pte_mfn_to_pfn(pmd.pmd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); -pte_t xen_make_pte(unsigned long long pte) +static void xen_set_pud_hyper(pud_t *ptr, pud_t val) { - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_set_pud(pud_t *ptr, pud_t val) +{ + trace_xen_mmu_set_pud(ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + return; } - return (pte_t){ .pte = pte }; + xen_set_pud_hyper(ptr, val); } -pmd_t xen_make_pmd(unsigned long long pmd) +#ifdef CONFIG_X86_PAE +static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { - if (pmd & 1) - pmd = phys_to_machine(XPADDR(pmd)).maddr; + trace_xen_mmu_set_pte_atomic(ptep, pte); + set_64bit((u64 *)ptep, native_pte_val(pte)); +} - return (pmd_t){ pmd }; +static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + trace_xen_mmu_pte_clear(mm, addr, ptep); + if (!xen_batched_set_pte(ptep, native_make_pte(0))) + native_pte_clear(mm, addr, ptep); } -pgd_t xen_make_pgd(unsigned long long pgd) +static void xen_pmd_clear(pmd_t *pmdp) { - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; + trace_xen_mmu_pmd_clear(pmdp); + set_pmd(pmdp, __pmd(0)); +} +#endif /* CONFIG_X86_PAE */ - return (pgd_t){ pgd }; +__visible pmd_t xen_make_pmd(pmdval_t pmd) +{ + pmd = pte_pfn_to_mfn(pmd); + return native_make_pmd(pmd); } -#else /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); + +#if PAGETABLE_LEVELS == 4 +__visible pudval_t xen_pud_val(pud_t pud) { - *ptep = pte; + return pte_mfn_to_pfn(pud.pud); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); -unsigned long xen_pte_val(pte_t pte) +__visible pud_t xen_make_pud(pudval_t pud) { - unsigned long ret = pte.pte_low; + pud = pte_pfn_to_mfn(pud); - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr; + return native_make_pud(pud); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); - return ret; +static pgd_t *xen_get_user_pgd(pgd_t *pgd) +{ + pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); + unsigned offset = pgd - pgd_page; + pgd_t *user_ptr = NULL; + + if (offset < pgd_index(USER_LIMIT)) { + struct page *page = virt_to_page(pgd_page); + user_ptr = (pgd_t *)page->private; + if (user_ptr) + user_ptr += offset; + } + + return user_ptr; } -unsigned long xen_pgd_val(pgd_t pgd) +static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) { - unsigned long ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pgd_val_ma(val); + xen_extend_mmu_update(&u); } -pte_t xen_make_pte(unsigned long pte) +/* + * Raw hypercall-based set_pgd, intended for in early boot before + * there's a page structure. This implies: + * 1. The only existing pagetable is the kernel's + * 2. It is always pinned + * 3. It has no user pagetable attached to it + */ +static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) { - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } + preempt_disable(); + + xen_mc_batch(); - return (pte_t){ pte }; + __xen_set_pgd_hyper(ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } -pgd_t xen_make_pgd(unsigned long pgd) +static void xen_set_pgd(pgd_t *ptr, pgd_t val) { - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; + pgd_t *user_ptr = xen_get_user_pgd(ptr); - return (pgd_t){ pgd }; + trace_xen_mmu_set_pgd(ptr, user_ptr, val); + + /* If page is not pinned, we can just update the entry + directly */ + if (!xen_page_pinned(ptr)) { + *ptr = val; + if (user_ptr) { + WARN_ON(xen_page_pinned(user_ptr)); + *user_ptr = val; + } + return; + } + + /* If it's pinned, then we can at least batch the kernel and + user updates together. */ + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); + if (user_ptr) + __xen_set_pgd_hyper(user_ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); } -#endif /* CONFIG_X86_PAE */ +#endif /* PAGETABLE_LEVELS == 4 */ /* - (Yet another) pagetable walker. This one is intended for pinning a - pagetable. This means that it walks a pagetable and calls the - callback function on each page it finds making up the page table, - at every level. It walks the entire pagetable, but it only bothers - pinning pte pages which are below pte_limit. In the normal case - this will be TASK_SIZE, but at boot we need to pin up to - FIXADDR_TOP. But the important bit is that we don't pin beyond - there, because then we start getting into Xen's ptes. -*/ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), - unsigned long limit) -{ - pgd_t *pgd = pgd_base; + * (Yet another) pagetable walker. This one is intended for pinning a + * pagetable. This means that it walks a pagetable and calls the + * callback function on each page it finds making up the page table, + * at every level. It walks the entire pagetable, but it only bothers + * pinning pte pages which are below limit. In the normal case this + * will be STACK_TOP_MAX, but at boot we need to pin up to + * FIXADDR_TOP. + * + * For 32-bit the important bit is that we don't pin beyond there, + * because then we start getting into Xen's ptes. + * + * For 64-bit, we must skip the Xen hole in the middle of the address + * space, just after the big x86-64 virtual hole. + */ +static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), + unsigned long limit) +{ int flush = 0; - unsigned long addr = 0; - unsigned long pgd_next; + unsigned hole_low, hole_high; + unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; + unsigned pgdidx, pudidx, pmdidx; - BUG_ON(limit > FIXADDR_TOP); + /* The limit is the last byte to be touched */ + limit--; + BUG_ON(limit >= FIXADDR_TOP); if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + /* + * 64-bit has a great big hole in the middle of the address + * space, which contains the Xen mappings. On 32-bit these + * will end up making a zero-sized hole and so is a no-op. + */ + hole_low = pgd_index(USER_LIMIT); + hole_high = pgd_index(PAGE_OFFSET); + + pgdidx_limit = pgd_index(limit); +#if PTRS_PER_PUD > 1 + pudidx_limit = pud_index(limit); +#else + pudidx_limit = 0; +#endif +#if PTRS_PER_PMD > 1 + pmdidx_limit = pmd_index(limit); +#else + pmdidx_limit = 0; +#endif + + for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; - unsigned long pud_limit, pud_next; - pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + if (pgdidx >= hole_low && pgdidx < hole_high) + continue; - if (!pgd_val(*pgd)) + if (!pgd_val(pgd[pgdidx])) continue; - pud = pud_offset(pgd, 0); + pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), PT_PUD); + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - for (; addr != pud_limit; pud++, addr = pud_next) { + for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; - unsigned long pmd_limit; - pud_next = pud_addr_end(addr, pud_limit); - - if (pud_next < limit) - pmd_limit = pud_next; - else - pmd_limit = limit; + if (pgdidx == pgdidx_limit && + pudidx > pudidx_limit) + goto out; - if (pud_none(*pud)) + if (pud_none(pud[pudidx])) continue; - pmd = pmd_offset(pud, 0); + pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), PT_PMD); + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); - for (; addr != pmd_limit; pmd++) { - addr += (PAGE_SIZE * PTRS_PER_PTE); - if ((pmd_limit-1) < (addr-1)) { - addr = pmd_limit; - break; - } + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { + struct page *pte; - if (pmd_none(*pmd)) + if (pgdidx == pgdidx_limit && + pudidx == pudidx_limit && + pmdidx > pmdidx_limit) + goto out; + + if (pmd_none(pmd[pmdidx])) continue; - flush |= (*func)(pmd_page(*pmd), PT_PTE); + pte = pmd_page(pmd[pmdidx]); + flush |= (*func)(mm, pte, PT_PTE); } } } - flush |= (*func)(virt_to_page(pgd_base), PT_PGD); +out: + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); return flush; } -static spinlock_t *lock_pte(struct page *page) +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), + unsigned long limit) +{ + return __xen_pgd_walk(mm, mm->pgd, func, limit); +} + +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. */ +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS - ptl = __pte_lockptr(page); - spin_lock(ptl); +#if USE_SPLIT_PTE_PTLOCKS + ptl = ptlock_ptr(page); + spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif return ptl; } -static void do_unlock(void *v) +static void xen_pte_unlock(void *v) { spinlock_t *ptl = v; spin_unlock(ptl); @@ -406,19 +812,18 @@ static void do_unlock(void *v) static void xen_do_pin(unsigned level, unsigned long pfn) { - struct mmuext_op *op; - struct multicall_space mcs; + struct mmuext_op op; - mcs = __xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = level; - op->arg1.mfn = pfn_to_mfn(pfn); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + op.cmd = level; + op.arg1.mfn = pfn_to_mfn(pfn); + + xen_extend_mmuext_op(&op); } -static int pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { - unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); + unsigned pgfl = TestSetPagePinned(page); int flush; if (pgfl) @@ -435,21 +840,40 @@ static int pin_page(struct page *page, enum pt_level level) flush = 0; + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ ptl = NULL; if (level == PT_PTE) - ptl = lock_pte(page); + ptl = xen_pte_lock(page, mm); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), level == PT_PGD ? UVMF_TLB_FLUSH : 0); - if (level == PT_PTE) + if (ptl) { xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - if (ptl) { /* Queue a deferred unlock for when this batch is completed. */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -459,47 +883,96 @@ static int pin_page(struct page *page, enum pt_level level) /* This is called just after a mm has been created, but it has not been used yet. We need to make sure that its pagetable is all read-only, and can be pinned. */ -void xen_pgd_pin(pgd_t *pgd) +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { - unsigned level; + trace_xen_mmu_pgd_pin(mm, pgd); xen_mc_batch(); - if (pgd_walk(pgd, pin_page, TASK_SIZE)) { - /* re-enable interrupts for kmap_flush_unused */ + if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { + /* re-enable interrupts for flushing */ xen_mc_issue(0); + kmap_flush_unused(); + xen_mc_batch(); } +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); + + if (user_pgd) { + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); + } + } +#else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; + /* Need to make sure unshared kernel PMD is pinnable */ + xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), + PT_PMD); #endif + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); +#endif /* CONFIG_X86_64 */ + xen_mc_issue(0); +} - xen_do_pin(level, PFN_DOWN(__pa(pgd))); +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} - xen_mc_issue(0); +/* + * On save, we need to pin all pagetables to make sure they get their + * mfns turned into pfns. Search the list for any unpinned pgds and pin + * them (unpinned pgds are not currently in use, probably because the + * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. + */ +void xen_mm_pin_all(void) +{ + struct page *page; + + spin_lock(&pgd_lock); + + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) { + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); + SetPageSavePinned(page); + } + } + + spin_unlock(&pgd_lock); } -/* The init_mm pagetable is really pinned as soon as its created, but - that's before we have page structures to store the bits. So do all - the book-keeping now. */ -static __init int mark_pinned(struct page *page, enum pt_level level) +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now. + */ +static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) { SetPagePinned(page); return 0; } -void __init xen_mark_init_mm_pinned(void) +static void __init xen_mark_init_mm_pinned(void) { - pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { - unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); + unsigned pgfl = TestClearPagePinned(page); if (pgfl && !PageHighMem(page)) { void *pt = lowmem_page_address(page); @@ -507,10 +980,18 @@ static int unpin_page(struct page *page, enum pt_level level) spinlock_t *ptl = NULL; struct multicall_space mcs; + /* + * Do the converse to pin_page. If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ if (level == PT_PTE) { - ptl = lock_pte(page); + ptl = xen_pte_lock(page, mm); - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); } mcs = __xen_mc_entry(0); @@ -521,7 +1002,7 @@ static int unpin_page(struct page *page, enum pt_level level) if (ptl) { /* unlock when batch completed */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -529,28 +1010,74 @@ static int unpin_page(struct page *page, enum pt_level level) } /* Release a pagetables pages back as normal RW */ -static void xen_pgd_unpin(pgd_t *pgd) +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) { + trace_xen_mmu_pgd_unpin(mm, pgd); + xen_mc_batch(); xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - pgd_walk(pgd, unpin_page, TASK_SIZE); +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) { + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); + } + } +#endif + +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is unpinned */ + xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), + PT_PMD); +#endif + + __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + +/* + * On resume, undo any pinning done at save, so that the rest of the + * kernel doesn't see any unexpected pinned pagetables. + */ +void xen_mm_unpin_all(void) +{ + struct page *page; + + spin_lock(&pgd_lock); + + list_for_each_entry(page, &pgd_list, lru) { + if (PageSavePinned(page)) { + BUG_ON(!PagePinned(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); + ClearPageSavePinned(page); + } + } + + spin_unlock(&pgd_lock); +} + +static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); - xen_pgd_pin(next->pgd); + xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm->pgd); + xen_pgd_pin(mm); spin_unlock(&mm->page_table_lock); } @@ -561,21 +1088,22 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) static void drop_other_mm_ref(void *info) { struct mm_struct *mm = info; + struct mm_struct *active_mm; + + active_mm = this_cpu_read(cpu_tlbstate.active_mm); - if (__get_cpu_var(cpu_tlbstate).active_mm == mm) + if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure it has been flushed. */ - if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { + if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) load_cr3(swapper_pg_dir); - arch_flush_lazy_cpu_mode(); - } } -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { - cpumask_t mask; + cpumask_var_t mask; unsigned cpu; if (current->active_mm == mm) { @@ -583,11 +1111,19 @@ static void drop_mm_ref(struct mm_struct *mm) load_cr3(swapper_pg_dir); else leave_mm(smp_processor_id()); - arch_flush_lazy_cpu_mode(); } /* Get the "official" set of cpus referring to our pagetable. */ - mask = mm->cpu_vm_mask; + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, mm_cpumask(mm)); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed @@ -596,14 +1132,15 @@ static void drop_mm_ref(struct mm_struct *mm) if needed. */ for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, mask); } - if (!cpus_empty(mask)) - xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); } #else -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { if (current->active_mm == mm) load_cr3(swapper_pg_dir); @@ -624,17 +1161,1544 @@ static void drop_mm_ref(struct mm_struct *mm) * pagetable because of lazy tlb flushing. This means we need need to * switch all CPUs off this pagetable before we can unpin it. */ -void xen_exit_mmap(struct mm_struct *mm) +static void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ - drop_mm_ref(mm); + xen_drop_mm_ref(mm); put_cpu(); spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (PagePinned(virt_to_page(mm->pgd))) - xen_pgd_unpin(mm->pgd); + if (xen_page_pinned(mm->pgd)) + xen_pgd_unpin(mm); spin_unlock(&mm->page_table_lock); } + +static void xen_post_allocator_init(void); + +#ifdef CONFIG_X86_64 +static void __init xen_cleanhighmap(unsigned long vaddr, + unsigned long vaddr_end) +{ + unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); + + /* NOTE: The loop is more greedy than the cleanup_highmap variant. + * We include the PMD passed in on _both_ boundaries. */ + for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); + pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > kernel_end) + set_pmd(pmd, __pmd(0)); + } + /* In case we did something silly, we should crash in this function + * instead of somewhere later and be confusing. */ + xen_mc_flush(); +} +static void __init xen_pagetable_p2m_copy(void) +{ + unsigned long size; + unsigned long addr; + unsigned long new_mfn_list; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + new_mfn_list = xen_revector_p2m_tree(); + /* No memory or already called. */ + if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) + return; + + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + + /* At this stage, cleanup_highmap has already cleaned __ka space + * from _brk_limit way up to the max_pfn_mapped (which is the end of + * the ramdisk). We continue on, erasing PMD entries that point to page + * tables - do note that they are accessible at this stage via __va. + * For good measure we also round up to the PMD - which means that if + * anybody is using __ka address to the initial boot-stack - and try + * to use it - they are going to crash. The xen_start_info has been + * taken care of already in xen_setup_kernel_pagetable. */ + addr = xen_start_info->pt_base; + size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + + xen_cleanhighmap(addr, addr + size); + xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); +#ifdef DEBUG + /* This is superflous and is not neccessary, but you know what + * lets do it. The MODULES_VADDR -> MODULES_END should be clear of + * anything at this stage. */ + xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); +#endif +} +#endif + +static void __init xen_pagetable_init(void) +{ + paging_init(); + xen_setup_shared_info(); +#ifdef CONFIG_X86_64 + xen_pagetable_p2m_copy(); +#endif + xen_post_allocator_init(); +} +static void xen_write_cr2(unsigned long cr2) +{ + this_cpu_read(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return this_cpu_read(xen_vcpu)->arch.cr2; +} + +unsigned long xen_read_cr2_direct(void) +{ + return this_cpu_read(xen_vcpu_info.arch.cr2); +} + +void xen_flush_tlb_all(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_all(0); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_ALL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb(0); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_single(addr); + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + struct { + struct mmuext_op op; +#ifdef CONFIG_SMP + DECLARE_BITMAP(mask, num_processors); +#else + DECLARE_BITMAP(mask, NR_CPUS); +#endif + } *args; + struct multicall_space mcs; + + trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); + + if (cpumask_empty(cpus)) + return; /* nothing to do */ + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->op.arg2.vcpumask = to_cpumask(args->mask); + + /* Remove us, and any offline CPUS. */ + cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); + + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = start; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static unsigned long xen_read_cr3(void) +{ + return this_cpu_read(xen_cr3); +} + +static void set_current_cr3(void *v) +{ + this_cpu_write(xen_current_cr3, (unsigned long)v); +} + +static void __xen_write_cr3(bool kernel, unsigned long cr3) +{ + struct mmuext_op op; + unsigned long mfn; + + trace_xen_mmu_write_cr3(kernel, cr3); + + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; + + WARN_ON(mfn == 0 && kernel); + + op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; + op.arg1.mfn = mfn; + + xen_extend_mmuext_op(&op); + + if (kernel) { + this_cpu_write(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + this_cpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} + +#ifdef CONFIG_X86_64 +/* + * At the start of the day - when Xen launches a guest, it has already + * built pagetables for the guest. We diligently look over them + * in xen_setup_kernel_pagetable and graft as appropiate them in the + * init_level4_pgt and its friends. Then when we are happy we load + * the new init_level4_pgt - and continue on. + * + * The generic code starts (start_kernel) and 'init_mem_mapping' sets + * up the rest of the pagetables. When it has completed it loads the cr3. + * N.B. that baremetal would start at 'start_kernel' (and the early + * #PF handler would create bootstrap pagetables) - so we are running + * with the same assumptions as what to do when write_cr3 is executed + * at this point. + * + * Since there are no user-page tables at all, we have two variants + * of xen_write_cr3 - the early bootup (this one), and the late one + * (xen_write_cr3). The reason we have to do that is that in 64-bit + * the Linux kernel and user-space are both in ring 3 while the + * hypervisor is in ring 0. + */ +static void __init xen_write_cr3_init(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + this_cpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} +#endif + +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; + + BUG_ON(page->private != 0); + + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { + user_pgd[pgd_index(VSYSCALL_ADDR)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); + ret = 0; + } + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + +#ifdef CONFIG_X86_32 +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) +{ + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); + + return pte; +} +#else /* CONFIG_X86_64 */ +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) +{ + return pte; +} +#endif /* CONFIG_X86_64 */ + +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + if (pte_mfn(pte) != INVALID_P2M_ENTRY) + pte = mask_rw_pte(ptep, pte); + else + pte = __pte_ma(0); + + native_set_pte(ptep, pte); +} + +static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); +} + +/* Used for pmd and pud */ +static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* Early release_pte assumes that all pts are pinned, since there's + only init_mm and anything attached to that is pinned. */ +static void __init xen_release_pte_init(unsigned long pfn) +{ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void __init xen_release_pmd_init(unsigned long pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct multicall_space mcs; + struct mmuext_op *op; + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = cmd; + op->arg1.mfn = pfn_to_mfn(pfn); + + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); +} + +static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + struct multicall_space mcs; + unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); + + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, + pfn_pte(pfn, prot), 0); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, + unsigned level) +{ + bool pinned = PagePinned(virt_to_page(mm->pgd)); + + trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); + + if (pinned) { + struct page *page = pfn_to_page(pfn); + + SetPagePinned(page); + + if (!PageHighMem(page)) { + xen_mc_batch(); + + __set_pfn_prot(pfn, PAGE_KERNEL_RO); + + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } else { + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } + } +} + +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PTE); +} + +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PMD); +} + +/* This should never happen until we're OK to use struct page */ +static inline void xen_release_ptpage(unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + bool pinned = PagePinned(page); + + trace_xen_mmu_release_ptpage(pfn, level, pinned); + + if (pinned) { + if (!PageHighMem(page)) { + xen_mc_batch(); + + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + + __set_pfn_prot(pfn, PAGE_KERNEL); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } + ClearPagePinned(page); + } +} + +static void xen_release_pte(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PTE); +} + +static void xen_release_pmd(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PMD); +} + +#if PAGETABLE_LEVELS == 4 +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + +void __init xen_reserve_top(void) +{ +#ifdef CONFIG_X86_32 + unsigned long top = HYPERVISOR_VIRT_START; + struct xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top); +#endif /* CONFIG_X86_32 */ +} + +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void *__ka(phys_addr_t paddr) +{ +#ifdef CONFIG_X86_64 + return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif +} + +/* Convert a machine address to physical address */ +static unsigned long m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_PFN_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void *m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +/* Set the page permissions on an identity-mapped pages */ +static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + /* For PVH no need to set R/O or R/W to pin them or unpin them. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) + BUG(); +} +static void set_page_prot(void *addr, pgprot_t prot) +{ + return set_page_prot_flags(addr, prot, UVMF_NONE); +} +#ifdef CONFIG_X86_32 +static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + + ident_pte = 0; + pfn = 0; + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + /* Reuse or allocate a page of ptes */ + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == LEVEL1_IDENT_ENTRIES) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + } + + /* Install mappings */ + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + +#ifdef CONFIG_X86_32 + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; +#endif + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} +#endif +void __init xen_setup_machphys_mapping(void) +{ + struct xen_machphys_mapping mapping; + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr = mapping.max_mfn + 1; + } else { + machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; + } +#ifdef CONFIG_X86_32 + WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) + < machine_to_phys_mapping); +#endif +} + +#ifdef CONFIG_X86_64 +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for (i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} +static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, + unsigned long addr) +{ + if (*pt_base == PFN_DOWN(__pa(addr))) { + set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); + clear_page((void *)addr); + (*pt_base)++; + } + if (*pt_end == PFN_DOWN(__pa(addr))) { + set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); + clear_page((void *)addr); + (*pt_end)--; + } +} +/* + * Set up the initial kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This + * means that only the kernel has a physical mapping to start with - + * but that's enough to get __va working. We need to fill in the rest + * of the physical mapping once some sort of allocator has been set + * up. + * NOTE: for PVH, the page tables are native. + */ +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pud_t *l3; + pmd_t *l2; + unsigned long addr[3]; + unsigned long pt_base, pt_end; + unsigned i; + + /* max_pfn_mapped is the last pfn mapped in the initial memory + * mappings. Considering that on Xen after the kernel mappings we + * have the mappings of some pages that don't exist in pfn space, we + * set max_pfn_mapped to the last real pfn mapped. */ + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + + pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); + pt_end = pt_base + xen_start_info->nr_pt_frames; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ + convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ + convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_i[511] -> level2_fixmap_pgt */ + convert_pfn_mfn(level3_kernel_pgt); + } + /* We get [511][511] and have Xen's version of level2_kernel_pgt */ + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + addr[0] = (unsigned long)pgd; + addr[1] = (unsigned long)l3; + addr[2] = (unsigned long)l2; + /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: + * Both L4[272][0] and L4[511][511] have entries that point to the same + * L2 (PMD) tables. Meaning that if you modify it in __va space + * it will be also modified in the __ka space! (But if you just + * modify the PMD table to point to other PTE's or none, then you + * are OK - which is what cleanup_highmap does) */ + copy_page(level2_ident_pgt, l2); + /* Graft it onto L4[511][511] */ + copy_page(level2_kernel_pgt, l2); + + /* Get [511][510] and graft that in level2_fixmap_pgt */ + l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); + copy_page(level2_fixmap_pgt, l2); + /* Note that we don't do anything with level1_fixmap_pgt which + * we don't need. */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(init_level4_pgt)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + } else + native_write_cr3(__pa(init_level4_pgt)); + + /* We can't that easily rip out L3 and L2, as the Xen pagetables are + * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for + * the initial domain. For guests using the toolstack, they are in: + * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only + * rip out the [L4] (pgd), but for guests we shave off three pages. + */ + for (i = 0; i < ARRAY_SIZE(addr); i++) + check_pt_base(&pt_base, &pt_end, addr[i]); + + /* Our (by three pages) smaller Xen pagetable that we are using */ + memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + /* Revector the xen_start_info */ + xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); +} +#else /* !CONFIG_X86_64 */ +static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); + +static void __init xen_write_cr3_init(unsigned long cr3) +{ + unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); + + BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(cr3 != __pa(swapper_pg_dir)); + + /* + * We are switching to swapper_pg_dir for the first time (from + * initial_page_table) and therefore need to mark that page + * read-only and then pin it. + * + * Xen disallows sharing of kernel PMDs for PAE + * guests. Therefore we must copy the kernel PMD from + * initial_page_table into a new kernel PMD to be used in + * swapper_pg_dir. + */ + swapper_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + copy_page(swapper_kernel_pmd, initial_kernel_pmd); + swapper_pg_dir[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); + set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); + + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + xen_write_cr3(cr3); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(initial_page_table))); + set_page_prot(initial_page_table, PAGE_KERNEL); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL); + + pv_mmu_ops.write_cr3 = &xen_write_cr3; +} + +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pmd_t *kernel_pmd; + + initial_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + + xen_start_info->nr_pt_frames * PAGE_SIZE + + 512*1024); + + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + copy_page(initial_kernel_pmd, kernel_pmd); + + xen_map_identity_early(initial_kernel_pmd, max_pfn); + + copy_page(initial_page_table, pgd); + initial_page_table[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); + + set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); + set_page_prot(initial_page_table, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, + PFN_DOWN(__pa(initial_page_table))); + xen_write_cr3(__pa(initial_page_table)); + + memblock_reserve(__pa(xen_start_info->pt_base), + xen_start_info->nr_pt_frames * PAGE_SIZE); +} +#endif /* CONFIG_X86_64 */ + +static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; + +static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) +{ + pte_t pte; + + phys >>= PAGE_SHIFT; + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: + case FIX_RO_IDT: +#ifdef CONFIG_X86_32 + case FIX_WP_TEST: +# ifdef CONFIG_HIGHMEM + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif +#else + case VSYSCALL_PAGE: +#endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: + /* All local page mappings */ + pte = pfn_pte(phys, prot); + break; + +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + +#ifdef CONFIG_X86_IO_APIC + case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: + /* + * We just don't map the IO APIC - all access is via + * hypercalls. Keep the address in the pte for reference. + */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + + case FIX_PARAVIRT_BOOTMAP: + /* This is an MFN, but it isn't an IO mapping from the + IO domain */ + pte = mfn_pte(phys, prot); + break; + + default: + /* By default, set_fixmap is used for hardware mappings */ + pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); + break; + } + + __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_64 + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx == VSYSCALL_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif +} + +static void __init xen_post_allocator_init(void) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + pv_mmu_ops.set_pte = xen_set_pte; + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = xen_set_pgd; +#endif + + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + +#ifdef CONFIG_X86_64 + pv_mmu_ops.write_cr3 = &xen_write_cr3; + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif + xen_mark_init_mm_pinned(); +} + +static void xen_leave_lazy_mmu(void) +{ + preempt_disable(); + xen_mc_flush(); + paravirt_leave_lazy_mmu(); + preempt_enable(); +} + +static const struct pv_mmu_ops xen_mmu_ops __initconst = { + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3_init, + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, + + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pmd_init, + .release_pmd = xen_release_pmd_init, + + .set_pte = xen_set_pte_init, + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd_hyper, + + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), + + .make_pte = PV_CALLEE_SAVE(xen_make_pte), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, + + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + +#if PAGETABLE_LEVELS == 4 + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_pgd = xen_set_pgd_hyper, + + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, +#endif /* PAGETABLE_LEVELS == 4 */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy_mmu, + .flush = paravirt_flush_lazy_mmu, + }, + + .set_fixmap = xen_set_fixmap, +}; + +void __init xen_init_mmu_ops(void) +{ + x86_init.paging.pagetable_init = xen_pagetable_init; + + /* Optimization - we can use the HVM one but it has no idea which + * VCPUs are descheduled - which means that it will needlessly IPI + * them. Xen knows so let it do the job. + */ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + return; + } + pv_mmu_ops = xen_mmu_ops; + + memset(dummy_mapping, 0xff, PAGE_SIZE); +} + +/* Protected by xen_reservation_lock. */ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; + +#define VOID_PTE (mfn_pte(0, __pgprot(0))) +static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, + unsigned long *in_frames, + unsigned long *out_frames) +{ + int i; + struct multicall_space mcs; + + xen_mc_batch(); + for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { + mcs = __xen_mc_entry(0); + + if (in_frames) + in_frames[i] = virt_to_mfn(vaddr); + + MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); + __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + + if (out_frames) + out_frames[i] = virt_to_pfn(vaddr); + } + xen_mc_issue(0); +} + +/* + * Update the pfn-to-mfn mappings for a virtual address range, either to + * point to an array of mfns, or contiguously from a single starting + * mfn. + */ +static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, + unsigned long *mfns, + unsigned long first_mfn) +{ + unsigned i, limit; + unsigned long mfn; + + xen_mc_batch(); + + limit = 1u << order; + for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { + struct multicall_space mcs; + unsigned flags; + + mcs = __xen_mc_entry(0); + if (mfns) + mfn = mfns[i]; + else + mfn = first_mfn + i; + + if (i < (limit - 1)) + flags = 0; + else { + if (order == 0) + flags = UVMF_INVLPG | UVMF_ALL; + else + flags = UVMF_TLB_FLUSH | UVMF_ALL; + } + + MULTI_update_va_mapping(mcs.mc, vaddr, + mfn_pte(mfn, PAGE_KERNEL), flags); + + set_phys_to_machine(virt_to_pfn(vaddr), mfn); + } + + xen_mc_issue(0); +} + +/* + * Perform the hypercall to exchange a region of our pfns to point to + * memory with the required contiguous alignment. Takes the pfns as + * input, and populates mfns as output. + * + * Returns a success code indicating whether the hypervisor was able to + * satisfy the request or not. + */ +static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, + unsigned long *pfns_in, + unsigned long extents_out, + unsigned int order_out, + unsigned long *mfns_out, + unsigned int address_bits) +{ + long rc; + int success; + + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = extents_in, + .extent_order = order_in, + .extent_start = pfns_in, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = extents_out, + .extent_order = order_out, + .extent_start = mfns_out, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + BUG_ON(extents_in << order_in != extents_out << order_out); + + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == extents_in); + + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); + + return success; +} + +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle) +{ + unsigned long *in_frames = discontig_frames, out_frame; + unsigned long flags; + int success; + unsigned long vstart = (unsigned long)phys_to_virt(pstart); + + /* + * Currently an auto-translated guest will not perform I/O, nor will + * it require PAE page directories below 4GB. Therefore any calls to + * this function are redundant and can be ignored. + */ + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Zap current PTEs, remembering MFNs. */ + xen_zap_pfn_range(vstart, order, in_frames, NULL); + + /* 2. Get a new contiguous memory extent. */ + out_frame = virt_to_pfn(vstart); + success = xen_exchange_memory(1UL << order, 0, in_frames, + 1, order, &out_frame, + address_bits); + + /* 3. Map the new extent in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); + else + xen_remap_exchanged_ptes(vstart, order, in_frames, 0); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + *dma_handle = virt_to_machine(vstart).maddr; + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) +{ + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long flags; + int success; + unsigned long vstart; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + vstart = (unsigned long)phys_to_virt(pstart); + memset((void *) vstart, 0, PAGE_SIZE << order); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = virt_to_mfn(vstart); + + /* 2. Zap current PTEs. */ + xen_zap_pfn_range(vstart, order, NULL, out_frames); + + /* 3. Do the exchange for non-contiguous MFNs. */ + success = xen_exchange_memory(1, order, &in_frame, 1UL << order, + 0, out_frames, 0); + + /* 4. Map new pages in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, out_frames, 0); + else + xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + +#ifdef CONFIG_XEN_PVHVM +#ifdef CONFIG_PROC_VMCORE +/* + * This function is used in two contexts: + * - the kdump kernel has to check whether a pfn of the crashed kernel + * was a ballooned page. vmcore is using this function to decide + * whether to access a pfn of the crashed kernel. + * - the kexec kernel has to check whether a pfn was ballooned by the + * previous kernel. If the pfn is ballooned, handle it properly. + * Returns 0 if the pfn is not backed by a RAM page, the caller may + * handle the pfn special in this case. + */ +static int xen_oldmem_pfn_is_ram(unsigned long pfn) +{ + struct xen_hvm_get_mem_type a = { + .domid = DOMID_SELF, + .pfn = pfn, + }; + int ram; + + if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) + return -ENXIO; + + switch (a.mem_type) { + case HVMMEM_mmio_dm: + ram = 0; + break; + case HVMMEM_ram_rw: + case HVMMEM_ram_ro: + default: + ram = 1; + break; + } + + return ram; +} +#endif + +static void xen_hvm_exit_mmap(struct mm_struct *mm) +{ + struct xen_hvm_pagetable_dying a; + int rc; + + a.domid = DOMID_SELF; + a.gpa = __pa(mm->pgd); + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + WARN_ON_ONCE(rc < 0); +} + +static int is_pagetable_dying_supported(void) +{ + struct xen_hvm_pagetable_dying a; + int rc = 0; + + a.domid = DOMID_SELF; + a.gpa = 0x00; + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + if (rc < 0) { + printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); + return 0; + } + return 1; +} + +void __init xen_hvm_init_mmu_ops(void) +{ + if (is_pagetable_dying_supported()) + pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; +#ifdef CONFIG_PROC_VMCORE + register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); +#endif +} +#endif + +#ifdef CONFIG_XEN_PVH +/* + * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user + * space creating new guest on pvh dom0 and needing to map domU pages. + */ +static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, + unsigned int domid) +{ + int rc, err = 0; + xen_pfn_t gpfn = lpfn; + xen_ulong_t idx = fgfn; + + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .foreign_domid = domid, + .size = 1, + .space = XENMAPSPACE_gmfn_foreign, + }; + set_xen_guest_handle(xatp.idxs, &idx); + set_xen_guest_handle(xatp.gpfns, &gpfn); + set_xen_guest_handle(xatp.errs, &err); + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + if (rc < 0) + return rc; + return err; +} + +static int xlate_remove_from_p2m(unsigned long spfn, int count) +{ + struct xen_remove_from_physmap xrp; + int i, rc; + + for (i = 0; i < count; i++) { + xrp.domid = DOMID_SELF; + xrp.gpfn = spfn+i; + rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); + if (rc) + break; + } + return rc; +} + +struct xlate_remap_data { + unsigned long fgfn; /* foreign domain's gfn */ + pgprot_t prot; + domid_t domid; + int index; + struct page **pages; +}; + +static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, + void *data) +{ + int rc; + struct xlate_remap_data *remap = data; + unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); + pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); + + rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); + if (rc) + return rc; + native_set_pte(ptep, pteval); + + return 0; +} + +static int xlate_remap_gfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long mfn, + int nr, pgprot_t prot, unsigned domid, + struct page **pages) +{ + int err; + struct xlate_remap_data pvhdata; + + BUG_ON(!pages); + + pvhdata.fgfn = mfn; + pvhdata.prot = prot; + pvhdata.domid = domid; + pvhdata.index = 0; + pvhdata.pages = pages; + err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, + xlate_map_pte_fn, &pvhdata); + flush_tlb_all(); + return err; +} +#endif + +#define REMAP_BATCH_SIZE 16 + +struct remap_data { + unsigned long mfn; + pgprot_t prot; + struct mmu_update *mmu_update; +}; + +static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + struct remap_data *rmd = data; + pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); + + rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; + rmd->mmu_update->val = pte_val_ma(pte); + rmd->mmu_update++; + + return 0; +} + +int xen_remap_domain_mfn_range(struct vm_area_struct *vma, + unsigned long addr, + xen_pfn_t mfn, int nr, + pgprot_t prot, unsigned domid, + struct page **pages) + +{ + struct remap_data rmd; + struct mmu_update mmu_update[REMAP_BATCH_SIZE]; + int batch; + unsigned long range; + int err = 0; + + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); + + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_XEN_PVH + /* We need to update the local page tables and the xen HAP */ + return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, + domid, pages); +#else + return -EINVAL; +#endif + } + + rmd.mfn = mfn; + rmd.prot = prot; + + while (nr) { + batch = min(REMAP_BATCH_SIZE, nr); + range = (unsigned long)batch << PAGE_SHIFT; + + rmd.mmu_update = mmu_update; + err = apply_to_page_range(vma->vm_mm, addr, range, + remap_area_mfn_pte_fn, &rmd); + if (err) + goto out; + + err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); + if (err < 0) + goto out; + + nr -= batch; + addr += range; + } + + err = 0; +out: + + xen_flush_tlb_all(); + + return err; +} +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + +/* Returns: 0 success */ +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, + int numpgs, struct page **pages) +{ + if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + +#ifdef CONFIG_XEN_PVH + while (numpgs--) { + /* + * The mmu has already cleaned up the process mmu + * resources at this point (lookup_address will return + * NULL). + */ + unsigned long pfn = page_to_pfn(pages[numpgs]); + + xlate_remove_from_p2m(pfn, 1); + } + /* + * We don't need to flush tlbs because as part of + * xlate_remove_from_p2m, the hypervisor will do tlb flushes + * after removing the p2m entries from the EPT/NPT + */ + return 0; +#else + return -EINVAL; +#endif +} +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b1519..73809bb951b 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -10,58 +10,17 @@ enum pt_level { PT_PTE }; -/* - * Page-directory addresses above 4GB do not fit into architectural %cr3. - * When accessing %cr3, or equivalent field in vcpu_guest_context, guests - * must use the following accessor macros to pack/unpack valid MFNs. - * - * Note that Xen is using the fact that the pagetable base is always - * page-aligned, and putting the 12 MSB of the address into the 12 LSB - * of cr3. - */ -#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) -#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -void xen_set_pte(pte_t *ptep, pte_t pteval); -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); -void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); -void xen_exit_mmap(struct mm_struct *mm); - -void xen_pgd_pin(pgd_t *pgd); -//void xen_pgd_unpin(pgd_t *pgd); - -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); - -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); -void xen_set_pte_atomic(pte_t *ptep, pte_t pte); -void xen_set_pud(pud_t *ptr, pud_t val); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); - - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif +unsigned long xen_read_cr2_direct(void); +extern void xen_init_mmu_ops(void); +extern void xen_hvm_init_mmu_ops(void); #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 5e6f36f6d87..0d82003e76a 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -21,27 +21,32 @@ */ #include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/debugfs.h> #include <asm/xen/hypercall.h> #include "multicalls.h" - -#define MC_DEBUG 1 +#include "debugfs.h" #define MC_BATCH 32 -#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) + +#define MC_DEBUG 0 + +#define MC_ARGS (MC_BATCH * 16) + struct mc_buffer { + unsigned mcidx, argidx, cbidx; struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG struct multicall_entry debug[MC_BATCH]; + void *caller[MC_BATCH]; #endif - u64 args[MC_ARGS]; + unsigned char args[MC_ARGS]; struct callback { void (*fn)(void *); void *data; } callbacks[MC_BATCH]; - unsigned mcidx, argidx, cbidx; }; static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); @@ -50,6 +55,7 @@ DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); void xen_mc_flush(void) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct multicall_entry *mc; int ret = 0; unsigned long flags; int i; @@ -60,7 +66,26 @@ void xen_mc_flush(void) something in the middle */ local_irq_save(flags); - if (b->mcidx) { + trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); + + switch (b->mcidx) { + case 0: + /* no-op */ + BUG_ON(b->argidx != 0); + break; + + case 1: + /* Singleton multicall - bypass multicall machinery + and just do the call directly. */ + mc = &b->entries[0]; + + mc->result = privcmd_call(mc->op, + mc->args[0], mc->args[1], mc->args[2], + mc->args[3], mc->args[4]); + ret = mc->result < 0; + break; + + default: #if MC_DEBUG memcpy(b->debug, b->entries, b->mcidx * sizeof(struct multicall_entry)); @@ -76,51 +101,92 @@ void xen_mc_flush(void) if (ret) { printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", ret, smp_processor_id()); - for(i = 0; i < b->mcidx; i++) { - printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", + dump_stack(); + for (i = 0; i < b->mcidx; i++) { + printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n", i+1, b->mcidx, b->debug[i].op, b->debug[i].args[0], - b->entries[i].result); + b->entries[i].result, + b->caller[i]); } } #endif + } - b->mcidx = 0; - b->argidx = 0; - } else - BUG_ON(b->argidx != 0); - - local_irq_restore(flags); + b->mcidx = 0; + b->argidx = 0; - for(i = 0; i < b->cbidx; i++) { + for (i = 0; i < b->cbidx; i++) { struct callback *cb = &b->callbacks[i]; (*cb->fn)(cb->data); } b->cbidx = 0; - BUG_ON(ret); + local_irq_restore(flags); + + WARN_ON(ret); } struct multicall_space __xen_mc_entry(size_t args) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct multicall_space ret; - unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + unsigned argidx = roundup(b->argidx, sizeof(u64)); + + trace_xen_mc_entry_alloc(args); BUG_ON(preemptible()); - BUG_ON(argspace > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); - if (b->mcidx == MC_BATCH || - (b->argidx + argspace) > MC_ARGS) + if (unlikely(b->mcidx == MC_BATCH || + (argidx + args) >= MC_ARGS)) { + trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ? + XEN_MC_FL_BATCH : XEN_MC_FL_ARGS); xen_mc_flush(); + argidx = roundup(b->argidx, sizeof(u64)); + } ret.mc = &b->entries[b->mcidx]; +#if MC_DEBUG + b->caller[b->mcidx] = __builtin_return_address(0); +#endif b->mcidx++; + ret.args = &b->args[argidx]; + b->argidx = argidx + args; + + BUG_ON(b->argidx >= MC_ARGS); + return ret; +} + +struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct multicall_space ret = { NULL, NULL }; + + BUG_ON(preemptible()); + BUG_ON(b->argidx >= MC_ARGS); + + if (unlikely(b->mcidx == 0 || + b->entries[b->mcidx - 1].op != op)) { + trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP); + goto out; + } + + if (unlikely((b->argidx + size) >= MC_ARGS)) { + trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE); + goto out; + } + + ret.mc = &b->entries[b->mcidx - 1]; ret.args = &b->args[b->argidx]; - b->argidx += argspace; + b->argidx += size; + + BUG_ON(b->argidx >= MC_ARGS); + trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK); +out: return ret; } @@ -129,8 +195,12 @@ void xen_mc_callback(void (*fn)(void *), void *data) struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct callback *cb; - if (b->cbidx == MC_BATCH) + if (b->cbidx == MC_BATCH) { + trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK); xen_mc_flush(); + } + + trace_xen_mc_callback(fn, data); cb = &b->callbacks[b->cbidx++]; cb->fn = fn; diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 8bae996d99a..9c2e74f9096 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -1,6 +1,8 @@ #ifndef _XEN_MULTICALLS_H #define _XEN_MULTICALLS_H +#include <trace/events/xen.h> + #include "xen-ops.h" /* Multicalls */ @@ -19,8 +21,12 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); paired with xen_mc_issue() */ static inline void xen_mc_batch(void) { + unsigned long flags; + /* need to disable interrupts until this entry is complete */ - local_irq_save(__get_cpu_var(xen_mc_irq_flags)); + local_irq_save(flags); + trace_xen_mc_batch(paravirt_get_lazy_mode()); + __this_cpu_write(xen_mc_irq_flags, flags); } static inline struct multicall_space xen_mc_entry(size_t args) @@ -35,14 +41,28 @@ void xen_mc_flush(void); /* Issue a multicall if we're not in a lazy mode */ static inline void xen_mc_issue(unsigned mode) { + trace_xen_mc_issue(mode); + if ((paravirt_get_lazy_mode() & mode) == 0) xen_mc_flush(); /* restore flags saved in xen_mc_batch */ - local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); + local_irq_restore(this_cpu_read(xen_mc_irq_flags)); } /* Set up a callback to be called when the current batch is flushed */ void xen_mc_callback(void (*fn)(void *), void *data); +/* + * Try to extend the arguments of the previous multicall command. The + * previous command's op must match. If it does, then it attempts to + * extend the argument space allocated to the multicall entry by + * arg_size bytes. + * + * The returned multicall_space will return with mc pointing to the + * command on success, or NULL on failure, and args pointing to the + * newly allocated space. + */ +struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); + #endif /* _XEN_MULTICALLS_H */ diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c new file mode 100644 index 00000000000..9bb3d82ffec --- /dev/null +++ b/arch/x86/xen/p2m.c @@ -0,0 +1,1340 @@ +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. + * + * In short, these structures contain the Machine Frame Number (MFN) of the PFN. + * + * However not all entries are filled with MFNs. Specifically for all other + * leaf entries, or for the top root, or middle one, for which there is a void + * entry, we assume it is "missing". So (for example) + * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. + * + * We also have the possibility of setting 1-1 mappings on certain regions, so + * that: + * pfn_to_mfn(0xc0000)=0xc0000 + * + * The benefit of this is, that we can assume for non-RAM regions (think + * PCI BARs, or ACPI spaces), we can create mappings easily because we + * get the PFN value to match the MFN. + * + * For this to work efficiently we have one new page p2m_identity and + * allocate (via reserved_brk) any other pages we need to cover the sides + * (1GB or 4MB boundary violations). All entries in p2m_identity are set to + * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, + * no other fancy value). + * + * On lookup we spot that the entry points to p2m_identity and return the + * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. + * If the entry points to an allocated page, we just proceed as before and + * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in + * appropriate functions (pfn_to_mfn). + * + * The reason for having the IDENTITY_FRAME_BIT instead of just returning the + * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a + * non-identity pfn. To protect ourselves against we elect to set (and get) the + * IDENTITY_FRAME_BIT on all identity mapped PFNs. + * + * This simplistic diagram is used to explain the more subtle piece of code. + * There is also a digram of the P2M at the end that can help. + * Imagine your E820 looking as so: + * + * 1GB 2GB 4GB + * /-------------------+---------\/----\ /----------\ /---+-----\ + * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | + * \-------------------+---------/\----/ \----------/ \---+-----/ + * ^- 1029MB ^- 2001MB + * + * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), + * 2048MB = 524288 (0x80000)] + * + * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB + * is actually not present (would have to kick the balloon driver to put it in). + * + * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: + * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start + * of the PFN and the end PFN (263424 and 512256 respectively). The first step + * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page + * covers 512^2 of page estate (1GB) and in case the start or end PFN is not + * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as + * required to split any existing p2m_mid_missing middle pages. + * + * With the E820 example above, 263424 is not 1GB aligned so we allocate a + * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. + * Each entry in the allocate page is "missing" (points to p2m_missing). + * + * Next stage is to determine if we need to do a more granular boundary check + * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. + * We check if the start pfn and end pfn violate that boundary check, and if + * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer + * granularity of setting which PFNs are missing and which ones are identity. + * In our example 263424 and 512256 both fail the check so we reserve_brk two + * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" + * values) and assign them to p2m[1][2] and p2m[1][488] respectively. + * + * At this point we would at minimum reserve_brk one page, but could be up to + * three. Each call to set_phys_range_identity has at maximum a three page + * cost. If we were to query the P2M at this stage, all those entries from + * start PFN through end PFN (so 1029MB -> 2001MB) would return + * INVALID_P2M_ENTRY ("missing"). + * + * The next step is to walk from the start pfn to the end pfn setting + * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. + * If we find that the middle entry is pointing to p2m_missing we can swap it + * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and + * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions). + * At this point we do not need to worry about boundary aligment (so no need to + * reserve_brk a middle page, figure out which PFNs are "missing" and which + * ones are identity), as that has been done earlier. If we find that the + * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference + * that page (which covers 512 PFNs) and set the appropriate PFN with + * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we + * set from p2m[1][2][256->511] and p2m[1][488][0->256] with + * IDENTITY_FRAME_BIT set. + * + * All other regions that are void (or not filled) either point to p2m_missing + * (considered missing) or have the default value of INVALID_P2M_ENTRY (also + * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] + * contain the INVALID_P2M_ENTRY value and are considered "missing." + * + * Finally, the region beyond the end of of the E820 (4 GB in this example) + * is set to be identity (in case there are MMIO regions placed here). + * + * This is what the p2m ends up looking (for the E820 above) with this + * fabulous drawing: + * + * p2m /--------------\ + * /-----\ | &mfn_list[0],| /-----------------\ + * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | + * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | + * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | + * |-----| \ | [p2m_identity]+\\ | .... | + * | 2 |--\ \-------------------->| ... | \\ \----------------/ + * |-----| \ \---------------/ \\ + * | 3 |-\ \ \\ p2m_identity [1] + * |-----| \ \-------------------->/---------------\ /-----------------\ + * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... | + * \-----/ | | | [p2m_identity]+-->| ..., ~0 | + * | | | .... | \-----------------/ + * | | +-[x], ~0, ~0.. +\ + * | | \---------------/ \ + * | | \-> /---------------\ + * | V p2m_mid_missing p2m_missing | IDENTITY[@0] | + * | /-----------------\ /------------\ | IDENTITY[@256]| + * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... | + * | | [p2m_missing] +---->| ..., ~0 | \---------------/ + * | | ... | \------------/ + * | \-----------------/ + * | + * | p2m_mid_identity + * | /-----------------\ + * \-->| [p2m_identity] +---->[1] + * | [p2m_identity] +---->[1] + * | ... | + * \-----------------/ + * + * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/sched.h> +#include <linux/seq_file.h> + +#include <asm/cache.h> +#include <asm/setup.h> + +#include <asm/xen/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <xen/balloon.h> +#include <xen/grant_table.h> + +#include "multicalls.h" +#include "xen-ops.h" + +static void __init m2p_override_init(void); + +unsigned long xen_max_p2m_pfn __read_mostly; + +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); + +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); + +static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); + +/* We might hit two boundary violations at the start and end, at max each + * boundary violation will require three middle nodes. */ +RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3); + +/* When we populate back during bootup, the amount of pages can vary. The + * max we have is seen is 395979, but that does not mean it can't be more. + * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle + * it can re-use Xen provided mfn_list array, so we only need to allocate at + * most three P2M top nodes. */ +RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); + +static inline unsigned p2m_top_index(unsigned long pfn) +{ + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; +} + +static inline unsigned p2m_index(unsigned long pfn) +{ + return pfn % P2M_PER_PAGE; +} + +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + +static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = leaf; +} + +static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(leaf); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should alreay be completely allocated. + */ +void __ref xen_build_mfn_list_list(void) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); + p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); + + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); + p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); + } + + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + + /* Don't bother allocating any mfn mid levels if + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. + */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; + continue; + } + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p, p2m_missing); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + } + + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); + } +} + +void xen_setup_mfn_list_list(void) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; +} + +/* Set up p2m_top to point to the domain-builder provided p2m pages */ +void __init xen_build_dynamic_phys_to_machine(void) +{ + unsigned long *mfn_list; + unsigned long max_pfn; + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + mfn_list = (unsigned long *)xen_start_info->mfn_list; + max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + xen_max_p2m_pfn = max_pfn; + + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); + p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_identity); + + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing, p2m_missing); + p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_identity, p2m_identity); + + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); + + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. + */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid, p2m_missing); + + p2m_top[topidx] = mid; + } + + /* + * As long as the mfn_list has enough entries to completely + * fill a p2m page, pointing into the array is ok. But if + * not the entries beyond the last pfn will be undefined. + */ + if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { + unsigned long p2midx; + + p2midx = max_pfn % P2M_PER_PAGE; + for ( ; p2midx < P2M_PER_PAGE; p2midx++) + mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY; + } + p2m_top[topidx][mididx] = &mfn_list[pfn]; + } + + m2p_override_init(); +} +#ifdef CONFIG_X86_64 +#include <linux/bootmem.h> +unsigned long __init xen_revector_p2m_tree(void) +{ + unsigned long va_start; + unsigned long va_end; + unsigned long pfn; + unsigned long pfn_free = 0; + unsigned long *mfn_list = NULL; + unsigned long size; + + va_start = xen_start_info->mfn_list; + /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), + * so make sure it is rounded up to that */ + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + va_end = va_start + size; + + /* If we were revectored already, don't do it again. */ + if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) + return 0; + + mfn_list = alloc_bootmem_align(size, PAGE_SIZE); + if (!mfn_list) { + pr_warn("Could not allocate space for a new P2M tree!\n"); + return xen_start_info->mfn_list; + } + /* Fill it out with INVALID_P2M_ENTRY value */ + memset(mfn_list, 0xFF, size); + + for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx; + unsigned long *mid_p; + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + mid_p = p2m_top[topidx][mididx]; + if (!mid_p) + continue; + if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) + continue; + + if ((unsigned long)mid_p == INVALID_P2M_ENTRY) + continue; + + /* The old va. Rebase it on mfn_list */ + if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { + unsigned long *new; + + if (pfn_free > (size / sizeof(unsigned long))) { + WARN(1, "Only allocated for %ld pages, but we want %ld!\n", + size / sizeof(unsigned long), pfn_free); + return 0; + } + new = &mfn_list[pfn_free]; + + copy_page(new, mid_p); + p2m_top[topidx][mididx] = &mfn_list[pfn_free]; + p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); + + pfn_free += P2M_PER_PAGE; + + } + /* This should be the leafs allocated for identity from _brk. */ + } + return (unsigned long)mfn_list; + +} +#else +unsigned long __init xen_revector_p2m_tree(void) +{ + return 0; +} +#endif +unsigned long get_phys_to_machine(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) + return IDENTITY_FRAME(pfn); + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* + * The INVALID_P2M_ENTRY is filled in both p2m_*identity + * and in p2m_*missing, so returning the INVALID_P2M_ENTRY + * would be wrong. + */ + if (p2m_top[topidx][mididx] == p2m_identity) + return IDENTITY_FRAME(pfn); + + return p2m_top[topidx][mididx][idx]; +} +EXPORT_SYMBOL_GPL(get_phys_to_machine); + +static void *alloc_p2m_page(void) +{ + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} + +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} + +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid, p2m_missing); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); + } + + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn, p2m_missing); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; + } + + if (p2m_top[topidx][mididx] == p2m_identity || + p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; + unsigned long *p2m_orig = p2m_top[topidx][mididx]; + + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; +} + +static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) +{ + unsigned topidx, mididx, idx; + unsigned long *p2m; + unsigned long *mid_mfn_p; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* Pfff.. No boundary cross-over, lets get out. */ + if (!idx && check_boundary) + return false; + + WARN(p2m_top[topidx][mididx] == p2m_identity, + "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", + topidx, mididx); + + /* + * Could be done by xen_build_dynamic_phys_to_machine.. + */ + if (p2m_top[topidx][mididx] != p2m_missing) + return false; + + /* Boundary cross-over for the edges: */ + p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_init(p2m); + + p2m_top[topidx][mididx] = p2m; + + /* For save/restore we need to MFN of the P2M saved */ + + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + return true; +} + +static bool __init early_alloc_p2m_middle(unsigned long pfn) +{ + unsigned topidx = p2m_top_index(pfn); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid, p2m_missing); + + p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* And the save/restore P2M tables.. */ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p, p2m_missing); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don't set mid_mfn_p[midix] here, + * look in early_alloc_p2m() */ + } + return true; +} + +/* + * Skim over the P2M tree looking at pages that are either filled with + * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and + * replace the P2M leaf with a p2m_missing or p2m_identity. + * Stick the old page in the new P2M tree location. + */ +bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +{ + unsigned topidx; + unsigned mididx; + unsigned ident_pfns; + unsigned inv_pfns; + unsigned long *p2m; + unsigned long *mid_mfn_p; + unsigned idx; + unsigned long pfn; + + /* We only look when this entails a P2M middle layer */ + if (p2m_index(set_pfn)) + return false; + + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + topidx = p2m_top_index(pfn); + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + p2m = p2m_top[topidx][mididx]; + if (!p2m) + continue; + + if ((p2m == p2m_missing) || (p2m == p2m_identity)) + continue; + + if ((unsigned long)p2m == INVALID_P2M_ENTRY) + continue; + + ident_pfns = 0; + inv_pfns = 0; + for (idx = 0; idx < P2M_PER_PAGE; idx++) { + /* IDENTITY_PFNs are 1:1 */ + if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) + ident_pfns++; + else if (p2m[idx] == INVALID_P2M_ENTRY) + inv_pfns++; + else + break; + } + if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) + goto found; + } + return false; +found: + /* Found one, replace old with p2m_identity or p2m_missing */ + p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); + /* And the other for save/restore.. */ + mid_mfn_p = p2m_top_mfn_p[topidx]; + /* NOTE: Even if it is a p2m_identity it should still be point to + * a page filled with INVALID_P2M_ENTRY entries. */ + mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); + + /* Reset where we want to stick the old page in. */ + topidx = p2m_top_index(set_pfn); + mididx = p2m_mid_index(set_pfn); + + /* This shouldn't happen */ + if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) + early_alloc_p2m_middle(set_pfn); + + if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) + return false; + + p2m_init(p2m); + p2m_top[topidx][mididx] = p2m; + mid_mfn_p = p2m_top_mfn_p[topidx]; + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + return true; +} +bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!early_alloc_p2m_middle(pfn)) + return false; + + if (early_can_reuse_p2m_middle(pfn, mfn)) + return __set_phys_to_machine(pfn, mfn); + + if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} + +static void __init early_split_p2m(unsigned long pfn) +{ + unsigned long mididx, idx; + + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* + * Allocate new middle and leaf pages if this pfn lies in the + * middle of one. + */ + if (mididx || idx) + early_alloc_p2m_middle(pfn); + if (idx) + early_alloc_p2m(pfn, false); +} + +unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e) +{ + unsigned long pfn; + + if (unlikely(pfn_s >= MAX_P2M_PFN)) + return 0; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn_e - pfn_s; + + if (pfn_s > pfn_e) + return 0; + + if (pfn_e > MAX_P2M_PFN) + pfn_e = MAX_P2M_PFN; + + early_split_p2m(pfn_s); + early_split_p2m(pfn_e); + + for (pfn = pfn_s; pfn < pfn_e;) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) + break; + pfn++; + + /* + * If the PFN was set to a middle or leaf identity + * page the remainder must also be identity, so skip + * ahead to the next middle or leaf entry. + */ + if (p2m_top[topidx] == p2m_mid_identity) + pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE); + else if (p2m_top[topidx][mididx] == p2m_identity) + pfn = ALIGN(pfn, P2M_PER_PAGE); + } + + if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), + "Identity mapping failed. We are %ld short of 1-1 mappings!\n", + (pfn_e - pfn_s) - (pfn - pfn_s))) + printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); + + return pfn - pfn_s; +} + +/* Try to install p2m mapping; fail if intermediate bits missing */ +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, mididx, idx; + + /* don't track P2M changes in autotranslate guests */ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return true; + + if (unlikely(pfn >= MAX_P2M_PFN)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return true; + } + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* For sparse holes were the p2m leaf has real PFN along with + * PCI holes, stick in the PFN as the MFN value. + * + * set_phys_range_identity() will have allocated new middle + * and leaf pages as required so an existing p2m_mid_missing + * or p2m_missing mean that whole range will be identity so + * these can be switched to p2m_mid_identity or p2m_identity. + */ + if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { + if (p2m_top[topidx] == p2m_mid_identity) + return true; + + if (p2m_top[topidx] == p2m_mid_missing) { + WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing, + p2m_mid_identity) != p2m_mid_missing); + return true; + } + + if (p2m_top[topidx][mididx] == p2m_identity) + return true; + + /* Swap over from MISSING to IDENTITY if needed. */ + if (p2m_top[topidx][mididx] == p2m_missing) { + WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, + p2m_identity) != p2m_missing); + return true; + } + } + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; + + return true; +} + +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!alloc_p2m(pfn)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} + +#define M2P_OVERRIDE_HASH_SHIFT 10 +#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) + +static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); +static DEFINE_SPINLOCK(m2p_override_lock); + +static void __init m2p_override_init(void) +{ + unsigned i; + + m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, + sizeof(unsigned long)); + + for (i = 0; i < M2P_OVERRIDE_HASH; i++) + INIT_LIST_HEAD(&m2p_overrides[i]); +} + +static unsigned long mfn_hash(unsigned long mfn) +{ + return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); +} + +int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret = 0; + bool lazy = false; + pte_t *pte; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (kmap_ops && + !in_interrupt() && + paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } + + for (i = 0; i < count; i++) { + unsigned long mfn, pfn; + + /* Do not add to override if the map failed. */ + if (map_ops[i].status) + continue; + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = pte_mfn(*pte); + } else { + mfn = PFN_DOWN(map_ops[i].dev_bus_addr); + } + pfn = page_to_pfn(pages[i]); + + WARN_ON(PagePrivate(pages[i])); + SetPagePrivate(pages[i]); + set_page_private(pages[i], mfn); + pages[i]->index = pfn_to_mfn(pfn); + + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { + ret = -ENOMEM; + goto out; + } + + if (kmap_ops) { + ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); + if (ret) + goto out; + } + } + +out: + if (lazy) + arch_leave_lazy_mmu_mode(); + + return ret; +} +EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); + +/* Add an MFN override for a particular page */ +int m2p_add_override(unsigned long mfn, struct page *page, + struct gnttab_map_grant_ref *kmap_op) +{ + unsigned long flags; + unsigned long pfn; + unsigned long uninitialized_var(address); + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_add_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + + if (kmap_op != NULL) { + if (!PageHighMem(page)) { + struct multicall_space mcs = + xen_mc_entry(sizeof(*kmap_op)); + + MULTI_grant_table_op(mcs.mc, + GNTTABOP_map_grant_ref, kmap_op, 1); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } + } + spin_lock_irqsave(&m2p_override_lock, flags); + list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); + spin_unlock_irqrestore(&m2p_override_lock, flags); + + /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in + * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other + * pfn so that the following mfn_to_pfn(mfn) calls will return the + * pfn from the m2p_override (the backend pfn) instead. + * We need to do this because the pages shared by the frontend + * (xen-blkfront) can be already locked (lock_page, called by + * do_read_cache_page); when the userspace backend tries to use them + * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so + * do_blockdev_direct_IO is going to try to lock the same pages + * again resulting in a deadlock. + * As a side effect get_user_pages_fast might not be safe on the + * frontend pages while they are being shared with the backend, + * because mfn_to_pfn (that ends up being called by GUPF) will + * return the backend pfn rather than the frontend pfn. */ + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) == mfn) + set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + + return 0; +} +EXPORT_SYMBOL_GPL(m2p_add_override); + +int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret = 0; + bool lazy = false; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (kmap_ops && + !in_interrupt() && + paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { + arch_enter_lazy_mmu_mode(); + lazy = true; + } + + for (i = 0; i < count; i++) { + unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); + unsigned long pfn = page_to_pfn(pages[i]); + + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { + ret = -EINVAL; + goto out; + } + + set_page_private(pages[i], INVALID_P2M_ENTRY); + WARN_ON(!PagePrivate(pages[i])); + ClearPagePrivate(pages[i]); + set_phys_to_machine(pfn, pages[i]->index); + + if (kmap_ops) + ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); + if (ret) + goto out; + } + +out: + if (lazy) + arch_leave_lazy_mmu_mode(); + return ret; +} +EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); + +int m2p_remove_override(struct page *page, + struct gnttab_map_grant_ref *kmap_op, + unsigned long mfn) +{ + unsigned long flags; + unsigned long pfn; + unsigned long uninitialized_var(address); + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_remove_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + + spin_lock_irqsave(&m2p_override_lock, flags); + list_del(&page->lru); + spin_unlock_irqrestore(&m2p_override_lock, flags); + + if (kmap_op != NULL) { + if (!PageHighMem(page)) { + struct multicall_space mcs; + struct gnttab_unmap_and_replace *unmap_op; + struct page *scratch_page = get_balloon_scratch_page(); + unsigned long scratch_page_address = (unsigned long) + __va(page_to_pfn(scratch_page) << PAGE_SHIFT); + + /* + * It might be that we queued all the m2p grant table + * hypercalls in a multicall, then m2p_remove_override + * get called before the multicall has actually been + * issued. In this case handle is going to -1 because + * it hasn't been modified yet. + */ + if (kmap_op->handle == -1) + xen_mc_flush(); + /* + * Now if kmap_op->handle is negative it means that the + * hypercall actually returned an error. + */ + if (kmap_op->handle == GNTST_general_error) { + printk(KERN_WARNING "m2p_remove_override: " + "pfn %lx mfn %lx, failed to modify kernel mappings", + pfn, mfn); + put_balloon_scratch_page(); + return -1; + } + + xen_mc_batch(); + + mcs = __xen_mc_entry( + sizeof(struct gnttab_unmap_and_replace)); + unmap_op = mcs.args; + unmap_op->host_addr = kmap_op->host_addr; + unmap_op->new_addr = scratch_page_address; + unmap_op->handle = kmap_op->handle; + + MULTI_grant_table_op(mcs.mc, + GNTTABOP_unmap_and_replace, unmap_op, 1); + + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, scratch_page_address, + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + kmap_op->host_addr = 0; + put_balloon_scratch_page(); + } + } + + /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present + * somewhere in this domain, even before being added to the + * m2p_override (see comment above in m2p_add_override). + * If there are no other entries in the m2p_override corresponding + * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for + * the original pfn (the one shared by the frontend): the backend + * cannot do any IO on this page anymore because it has been + * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of + * the original pfn causes mfn_to_pfn(mfn) to return the frontend + * pfn again. */ + mfn &= ~FOREIGN_FRAME_BIT; + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + m2p_find_override(mfn) == NULL) + set_phys_to_machine(pfn, mfn); + + return 0; +} +EXPORT_SYMBOL_GPL(m2p_remove_override); + +struct page *m2p_find_override(unsigned long mfn) +{ + unsigned long flags; + struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; + struct page *p, *ret; + + ret = NULL; + + spin_lock_irqsave(&m2p_override_lock, flags); + + list_for_each_entry(p, bucket, lru) { + if (page_private(p) == mfn) { + ret = p; + break; + } + } + + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return ret; +} + +unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) +{ + struct page *p = m2p_find_override(mfn); + unsigned long ret = pfn; + + if (p) + ret = page_to_pfn(p); + + return ret; +} +EXPORT_SYMBOL_GPL(m2p_find_override_pfn); + +#ifdef CONFIG_XEN_DEBUG_FS +#include <linux/debugfs.h> +#include "debugfs.h" +static int p2m_dump_show(struct seq_file *m, void *v) +{ + static const char * const level_name[] = { "top", "middle", + "entry", "abnormal", "error"}; +#define TYPE_IDENTITY 0 +#define TYPE_MISSING 1 +#define TYPE_PFN 2 +#define TYPE_UNKNOWN 3 + static const char * const type_name[] = { + [TYPE_IDENTITY] = "identity", + [TYPE_MISSING] = "missing", + [TYPE_PFN] = "pfn", + [TYPE_UNKNOWN] = "abnormal"}; + unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; + unsigned int uninitialized_var(prev_level); + unsigned int uninitialized_var(prev_type); + + if (!p2m_top) + return 0; + + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned idx = p2m_index(pfn); + unsigned lvl, type; + + lvl = 4; + type = TYPE_UNKNOWN; + if (p2m_top[topidx] == p2m_mid_missing) { + lvl = 0; type = TYPE_MISSING; + } else if (p2m_top[topidx] == NULL) { + lvl = 0; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == NULL) { + lvl = 1; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == p2m_identity) { + lvl = 1; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx] == p2m_missing) { + lvl = 1; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == 0) { + lvl = 2; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { + lvl = 2; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { + lvl = 2; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == pfn) { + lvl = 2; type = TYPE_PFN; + } else if (p2m_top[topidx][mididx][idx] != pfn) { + lvl = 2; type = TYPE_PFN; + } + if (pfn == 0) { + prev_level = lvl; + prev_type = type; + } + if (pfn == MAX_DOMAIN_PAGES-1) { + lvl = 3; + type = TYPE_UNKNOWN; + } + if (prev_type != type) { + seq_printf(m, " [0x%lx->0x%lx] %s\n", + prev_pfn_type, pfn, type_name[prev_type]); + prev_pfn_type = pfn; + prev_type = type; + } + if (prev_level != lvl) { + seq_printf(m, " [0x%lx->0x%lx] level %s\n", + prev_pfn_level, pfn, level_name[prev_level]); + prev_pfn_level = pfn; + prev_level = lvl; + } + } + return 0; +#undef TYPE_IDENTITY +#undef TYPE_MISSING +#undef TYPE_PFN +#undef TYPE_UNKNOWN +} + +static int p2m_dump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, p2m_dump_show, NULL); +} + +static const struct file_operations p2m_dump_fops = { + .open = p2m_dump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *d_mmu_debug; + +static int __init xen_p2m_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); + return 0; +} +fs_initcall(xen_p2m_debugfs); +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c new file mode 100644 index 00000000000..0e98e5d241d --- /dev/null +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -0,0 +1,109 @@ +/* Glue code to lib/swiotlb-xen.c */ + +#include <linux/dma-mapping.h> +#include <linux/pci.h> +#include <xen/swiotlb-xen.h> + +#include <asm/xen/hypervisor.h> +#include <xen/xen.h> +#include <asm/iommu_table.h> + + +#include <asm/xen/swiotlb-xen.h> +#ifdef CONFIG_X86_64 +#include <asm/iommu.h> +#include <asm/dma.h> +#endif +#include <linux/export.h> + +int xen_swiotlb __read_mostly; + +static struct dma_map_ops xen_swiotlb_dma_ops = { + .mapping_error = xen_swiotlb_dma_mapping_error, + .alloc = xen_swiotlb_alloc_coherent, + .free = xen_swiotlb_free_coherent, + .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, + .sync_single_for_device = xen_swiotlb_sync_single_for_device, + .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, + .map_sg = xen_swiotlb_map_sg_attrs, + .unmap_sg = xen_swiotlb_unmap_sg_attrs, + .map_page = xen_swiotlb_map_page, + .unmap_page = xen_swiotlb_unmap_page, + .dma_supported = xen_swiotlb_dma_supported, +}; + +/* + * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary + * + * This returns non-zero if we are forced to use xen_swiotlb (by the boot + * option). + */ +int __init pci_xen_swiotlb_detect(void) +{ + + if (!xen_pv_domain()) + return 0; + + /* If running as PV guest, either iommu=soft, or swiotlb=force will + * activate this IOMMU. If running as PV privileged, activate it + * irregardless. + */ + if ((xen_initial_domain() || swiotlb || swiotlb_force)) + xen_swiotlb = 1; + + /* If we are running under Xen, we MUST disable the native SWIOTLB. + * Don't worry about swiotlb_force flag activating the native, as + * the 'swiotlb' flag is the only one turning it on. */ + swiotlb = 0; + +#ifdef CONFIG_X86_64 + /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0 + * (so no iommu=X command line over-writes). + * Considering that PV guests do not want the *native SWIOTLB* but + * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here. + */ + if (max_pfn > MAX_DMA32_PFN) + no_iommu = 1; +#endif + return xen_swiotlb; +} + +void __init pci_xen_swiotlb_init(void) +{ + if (xen_swiotlb) { + xen_swiotlb_init(1, true /* early */); + dma_ops = &xen_swiotlb_dma_ops; + +#ifdef CONFIG_PCI + /* Make sure ACS will be enabled */ + pci_request_acs(); +#endif + } +} + +int pci_xen_swiotlb_init_late(void) +{ + int rc; + + if (xen_swiotlb) + return 0; + + rc = xen_swiotlb_init(1, false /* late */); + if (rc) + return rc; + + dma_ops = &xen_swiotlb_dma_ops; +#ifdef CONFIG_PCI + /* Make sure ACS will be enabled */ + pci_request_acs(); +#endif + + return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); + +IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, + NULL, + pci_xen_swiotlb_init, + NULL); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c new file mode 100644 index 00000000000..a8261716d58 --- /dev/null +++ b/arch/x86/xen/platform-pci-unplug.c @@ -0,0 +1,217 @@ +/****************************************************************************** + * platform-pci-unplug.c + * + * Xen platform PCI device driver + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> + +#include <xen/platform_pci.h> +#include "xen-ops.h" + +#define XEN_PLATFORM_ERR_MAGIC -1 +#define XEN_PLATFORM_ERR_PROTOCOL -2 +#define XEN_PLATFORM_ERR_BLACKLIST -3 + +#ifdef CONFIG_XEN_PVHVM +/* store the value of xen_emul_unplug after the unplug is done */ +static int xen_platform_pci_unplug; +static int xen_emul_unplug; + +static int check_platform_magic(void) +{ + short magic; + char protocol; + + magic = inw(XEN_IOPORT_MAGIC); + if (magic != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n"); + return XEN_PLATFORM_ERR_MAGIC; + } + + protocol = inb(XEN_IOPORT_PROTOVER); + + printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n", + protocol); + + switch (protocol) { + case 1: + outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); + outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); + if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform: blacklisted by host\n"); + return XEN_PLATFORM_ERR_BLACKLIST; + } + break; + default: + printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version"); + return XEN_PLATFORM_ERR_PROTOCOL; + } + + return 0; +} + +bool xen_has_pv_devices() +{ + if (!xen_domain()) + return false; + + /* PV domains always have them. */ + if (xen_pv_domain()) + return true; + + /* And user has xen_platform_pci=0 set in guest config as + * driver did not modify the value. */ + if (xen_platform_pci_unplug == 0) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_ALL) + return true; + + /* This is an odd one - we are going to run legacy + * and PV drivers at the same time. */ + if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) + return true; + + /* And the caller has to follow with xen_pv_{disk,nic}_devices + * to be certain which driver can load. */ + return false; +} +EXPORT_SYMBOL_GPL(xen_has_pv_devices); + +static bool __xen_has_pv_device(int state) +{ + /* HVM domains might or might not */ + if (xen_hvm_domain() && (xen_platform_pci_unplug & state)) + return true; + + return xen_has_pv_devices(); +} + +bool xen_has_pv_nic_devices(void) +{ + return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL); +} +EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices); + +bool xen_has_pv_disk_devices(void) +{ + return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS | + XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL); +} +EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices); + +/* + * This one is odd - it determines whether you want to run PV _and_ + * legacy (IDE) drivers together. This combination is only possible + * under HVM. + */ +bool xen_has_pv_and_legacy_disk_devices(void) +{ + if (!xen_domain()) + return false; + + /* N.B. This is only ever used in HVM mode */ + if (xen_pv_domain()) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices); + +void xen_unplug_emulated_devices(void) +{ + int r; + + /* user explicitly requested no unplug */ + if (xen_emul_unplug & XEN_UNPLUG_NEVER) + return; + /* check the version of the xen platform PCI device */ + r = check_platform_magic(); + /* If the version matches enable the Xen platform PCI driver. + * Also enable the Xen platform PCI driver if the host does + * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC) + * but the user told us that unplugging is unnecessary. */ + if (r && !(r == XEN_PLATFORM_ERR_MAGIC && + (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))) + return; + /* Set the default value of xen_emul_unplug depending on whether or + * not the Xen PV frontends and the Xen platform PCI driver have + * been compiled for this kernel (modules or built-in are both OK). */ + if (!xen_emul_unplug) { + if (xen_must_unplug_nics()) { + printk(KERN_INFO "Netfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated NICs.\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + } + if (xen_must_unplug_disks()) { + printk(KERN_INFO "Blkfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated disks.\n" + "You might have to change the root device\n" + "from /dev/hd[a-d] to /dev/xvd[a-d]\n" + "in your root= kernel command line option\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + } + } + /* Now unplug the emulated devices */ + if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)) + outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); + xen_platform_pci_unplug = xen_emul_unplug; +} + +static int __init parse_xen_emul_unplug(char *arg) +{ + char *p, *q; + int l; + + for (p = arg; p; p = q) { + q = strchr(p, ','); + if (q) { + l = q - p; + q++; + } else { + l = strlen(p); + } + if (!strncmp(p, "all", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL; + else if (!strncmp(p, "ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + else if (!strncmp(p, "aux-ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; + else if (!strncmp(p, "nics", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + else if (!strncmp(p, "unnecessary", l)) + xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY; + else if (!strncmp(p, "never", l)) + xen_emul_unplug |= XEN_UNPLUG_NEVER; + else + printk(KERN_WARNING "unrecognised option '%s' " + "in parameter 'xen_emul_unplug'\n", p); + } + return 0; +} +early_param("xen_emul_unplug", parse_xen_emul_unplug); +#endif diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2341492bf7a..2e555163c2f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -8,84 +8,622 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/pm.h> +#include <linux/memblock.h> +#include <linux/cpuidle.h> +#include <linux/cpufreq.h> #include <asm/elf.h> #include <asm/vdso.h> #include <asm/e820.h> #include <asm/setup.h> +#include <asm/acpi.h> +#include <asm/numa.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/interface/callback.h> +#include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/features.h> - #include "xen-ops.h" #include "vdso.h" /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +#ifdef CONFIG_X86_64 +extern asmlinkage void nmi(void); +#endif +extern void xen_sysenter_target(void); +extern void xen_syscall_target(void); +extern void xen_syscall32_target(void); + +/* Amount of extra memory space we add to the e820 ranges */ +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; + +/* Number of pages released from the initial allocation. */ +unsigned long xen_released_pages; + +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + +static void __init xen_add_extra_mem(u64 start, u64 size) +{ + unsigned long pfn; + int i; + + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + /* Add new region. */ + if (xen_extra_mem[i].size == 0) { + xen_extra_mem[i].start = start; + xen_extra_mem[i].size = size; + break; + } + /* Append to existing region. */ + if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { + xen_extra_mem[i].size += size; + break; + } + } + if (i == XEN_EXTRA_MEM_MAX_REGIONS) + printk(KERN_WARNING "Warning: not enough extra memory regions\n"); + + memblock_reserve(start, size); + + xen_max_p2m_pfn = PFN_DOWN(start + size); + for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + continue; + WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); + + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } +} + +static unsigned long __init xen_do_chunk(unsigned long start, + unsigned long end, bool release) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long len = 0; + unsigned long pfn; + int ret; + + for (pfn = start; pfn < end; pfn++) { + unsigned long frame; + unsigned long mfn = pfn_to_mfn(pfn); + + if (release) { + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + frame = mfn; + } else { + if (mfn != INVALID_P2M_ENTRY) + continue; + frame = pfn; + } + set_xen_guest_handle(reservation.extent_start, &frame); + reservation.nr_extents = 1; + + ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, + &reservation); + WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", + release ? "release" : "populate", pfn, ret); + + if (ret == 1) { + if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { + if (release) + break; + set_xen_guest_handle(reservation.extent_start, &frame); + reservation.nr_extents = 1; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + break; + } + len++; + } else + break; + } + if (len) + printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", + release ? "Freeing" : "Populating", + start, end, len, + release ? "freed" : "added"); + + return len; +} + +static unsigned long __init xen_release_chunk(unsigned long start, + unsigned long end) +{ + return xen_do_chunk(start, end, true); +} + +static unsigned long __init xen_populate_chunk( + const struct e820entry *list, size_t map_size, + unsigned long max_pfn, unsigned long *last_pfn, + unsigned long credits_left) +{ + const struct e820entry *entry; + unsigned int i; + unsigned long done = 0; + unsigned long dest_pfn; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + unsigned long s_pfn; + unsigned long e_pfn; + unsigned long pfns; + long capacity; + + if (credits_left <= 0) + break; + + if (entry->type != E820_RAM) + continue; + + e_pfn = PFN_DOWN(entry->addr + entry->size); + + /* We only care about E820 after the xen_start_info->nr_pages */ + if (e_pfn <= max_pfn) + continue; + + s_pfn = PFN_UP(entry->addr); + /* If the E820 falls within the nr_pages, we want to start + * at the nr_pages PFN. + * If that would mean going past the E820 entry, skip it + */ + if (s_pfn <= max_pfn) { + capacity = e_pfn - max_pfn; + dest_pfn = max_pfn; + } else { + capacity = e_pfn - s_pfn; + dest_pfn = s_pfn; + } + + if (credits_left < capacity) + capacity = credits_left; + + pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); + done += pfns; + *last_pfn = (dest_pfn + pfns); + if (pfns < capacity) + break; + credits_left -= pfns; + } + return done; +} + +static void __init xen_set_identity_and_release_chunk( + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long *released, unsigned long *identity) +{ + unsigned long pfn; + + /* + * If the PFNs are currently mapped, clear the mappings + * (except for the ISA region which must be 1:1 mapped) to + * release the refcounts (in Xen) on the original frames. + */ + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + pte_t pte = __pte_ma(0); + + if (pfn < PFN_UP(ISA_END_ADDRESS)) + pte = mfn_pte(pfn, PAGE_KERNEL_IO); + + (void)HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + } + + if (start_pfn < nr_pages) + *released += xen_release_chunk( + start_pfn, min(end_pfn, nr_pages)); + + *identity += set_phys_range_identity(start_pfn, end_pfn); +} + +static unsigned long __init xen_set_identity_and_release( + const struct e820entry *list, size_t map_size, unsigned long nr_pages) +{ + phys_addr_t start = 0; + unsigned long released = 0; + unsigned long identity = 0; + const struct e820entry *entry; + int i; + + /* + * Combine non-RAM regions and gaps until a RAM region (or the + * end of the map) is reached, then set the 1:1 map and + * release the pages (if available) in those non-RAM regions. + * + * The combined non-RAM regions are rounded to a whole number + * of pages so any partial pages are accessible via the 1:1 + * mapping. This is needed for some BIOSes that put (for + * example) the DMI tables in a reserved region that begins on + * a non-page boundary. + */ + for (i = 0, entry = list; i < map_size; i++, entry++) { + phys_addr_t end = entry->addr + entry->size; + if (entry->type == E820_RAM || i == map_size - 1) { + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + if (entry->type == E820_RAM) + end_pfn = PFN_UP(entry->addr); + + if (start_pfn < end_pfn) + xen_set_identity_and_release_chunk( + start_pfn, end_pfn, nr_pages, + &released, &identity); + + start = end; + } + } + + if (released) + printk(KERN_INFO "Released %lu pages of unused memory\n", released); + if (identity) + printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); + + return released; +} + +static unsigned long __init xen_get_max_pages(void) +{ + unsigned long max_pages = MAX_DOMAIN_PAGES; + domid_t domid = DOMID_SELF; + int ret; + + /* + * For the initial domain we use the maximum reservation as + * the maximum page. + * + * For guest domains the current maximum reservation reflects + * the current maximum rather than the static maximum. In this + * case the e820 map provided to us will cover the static + * maximum region. + */ + if (xen_initial_domain()) { + ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); + if (ret > 0) + max_pages = ret; + } + + return min(max_pages, MAX_DOMAIN_PAGES); +} + +static void xen_align_and_add_e820_region(u64 start, u64 size, int type) +{ + u64 end = start + size; + + /* Align RAM regions to page boundaries. */ + if (type == E820_RAM) { + start = PAGE_ALIGN(start); + end &= ~((u64)PAGE_SIZE - 1); + } + + e820_add_region(start, end - start, type); +} + +void xen_ignore_unusable(struct e820entry *list, size_t map_size) +{ + struct e820entry *entry; + unsigned int i; -unsigned long *phys_to_machine_mapping; -EXPORT_SYMBOL(phys_to_machine_mapping); + for (i = 0, entry = list; i < map_size; i++, entry++) { + if (entry->type == E820_UNUSABLE) + entry->type = E820_RAM; + } +} /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ - char * __init xen_memory_setup(void) { + static struct e820entry map[E820MAX] __initdata; + unsigned long max_pfn = xen_start_info->nr_pages; + unsigned long long mem_end; + int rc; + struct xen_memory_map memmap; + unsigned long max_pages; + unsigned long last_pfn = 0; + unsigned long extra_pages = 0; + unsigned long populated; + int i; + int op; + + max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + mem_end = PFN_PHYS(max_pfn); - e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + op = xen_initial_domain() ? + XENMEM_machine_memory_map : + XENMEM_memory_map; + rc = HYPERVISOR_memory_op(op, &memmap); + if (rc == -ENOSYS) { + BUG_ON(xen_initial_domain()); + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = mem_end; + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); + + /* + * Xen won't allow a 1:1 mapping to be created to UNUSABLE + * regions, so if we're using the machine memory map leave the + * region as RAM as it is in the pseudo-physical map. + * + * UNUSABLE regions in domUs are not handled and will need + * a patch in the future. + */ + if (xen_initial_domain()) + xen_ignore_unusable(map, memmap.nr_entries); + + /* Make sure the Xen-supplied memory map is well-ordered. */ + sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); + + max_pages = xen_get_max_pages(); + if (max_pages > max_pfn) + extra_pages += max_pages - max_pfn; + + /* + * Set P2M for all non-RAM pages and E820 gaps to be identity + * type PFNs. Any RAM pages that would be made inaccesible by + * this are first released. + */ + xen_released_pages = xen_set_identity_and_release( + map, memmap.nr_entries, max_pfn); + + /* + * Populate back the non-RAM pages and E820 gaps that had been + * released. */ + populated = xen_populate_chunk(map, memmap.nr_entries, + max_pfn, &last_pfn, xen_released_pages); + + xen_released_pages -= populated; + extra_pages += xen_released_pages; + + if (last_pfn > max_pfn) { + max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); + mem_end = PFN_PHYS(max_pfn); + } + /* + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO + * factor the base size. On non-highmem systems, the base + * size is the full initial memory allocation; on highmem it + * is limited to the max size of lowmem, so that it doesn't + * get completely filled. + * + * In principle there could be a problem in lowmem systems if + * the initial memory is also very large with respect to + * lowmem, but we won't try to deal with that here. + */ + extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages); + i = 0; + while (i < memmap.nr_entries) { + u64 addr = map[i].addr; + u64 size = map[i].size; + u32 type = map[i].type; + + if (type == E820_RAM) { + if (addr < mem_end) { + size = min(size, mem_end - addr); + } else if (extra_pages) { + size = min(size, (u64)extra_pages * PAGE_SIZE); + extra_pages -= size / PAGE_SIZE; + xen_add_extra_mem(addr, size); + } else + type = E820_UNUSABLE; + } + + xen_align_and_add_e820_region(addr, size, type); + + map[i].addr += size; + map[i].size -= size; + if (map[i].size == 0) + i++; + } + + /* + * Set the rest as identity mapped, in case PCI BARs are + * located here. + * + * PFNs above MAX_P2M_PFN are considered identity mapped as + * well. + */ + set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + + /* + * In domU, the ISA region is normal, usable memory, but we + * reserve ISA memory anyway because too many things poke + * about in there. + */ + e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, + E820_RESERVED); + + /* + * Reserve Xen bits: + * - mfn_list + * - xen_start_info + * See comment above "struct start_info" in <xen/interface/xen.h> + * We tried to make the the memblock_reserve more selective so + * that it would be clear what region is reserved. Sadly we ran + * in the problem wherein on a 64-bit hypervisor with a 32-bit + * initial domain, the pt_base has the cr3 value which is not + * neccessarily where the pagetable starts! As Jan put it: " + * Actually, the adjustment turns out to be correct: The page + * tables for a 32-on-64 dom0 get allocated in the order "first L1", + * "first L2", "first L3", so the offset to the page table base is + * indeed 2. When reading xen/include/public/xen.h's comment + * very strictly, this is not a violation (since there nothing is said + * that the first thing in the page table space is pointed to by + * pt_base; I admit that this seems to be implied though, namely + * do I think that it is implied that the page table space is the + * range [pt_base, pt_base + nt_pt_frames), whereas that + * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), + * which - without a priori knowledge - the kernel would have + * difficulty to figure out)." - so lets just fall back to the + * easy way and reserve the whole region. + */ + memblock_reserve(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - xen_start_info->mfn_list); + + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return "Xen"; } -static void xen_idle(void) +/* + * Machine specific memory setup for auto-translated guests. + */ +char * __init xen_auto_xlated_memory_setup(void) { - local_irq_disable(); + static struct e820entry map[E820MAX] __initdata; - if (need_resched()) - local_irq_enable(); - else { - current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); - safe_halt(); - current_thread_info()->status |= TS_POLLING; - } + struct xen_memory_map memmap; + int i; + int rc; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc < 0) + panic("No memory map (%d)\n", rc); + + sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); + + for (i = 0; i < memmap.nr_entries; i++) + e820_add_region(map[i].addr, map[i].size, map[i].type); + + memblock_reserve(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - xen_start_info->mfn_list); + + return "Xen"; } /* * Set the bit indicating "nosegneg" library variants should be used. + * We only need to bother in pure 32-bit mode; compat 32-bit processes + * can have un-truncated segments, so wrapping around is allowed. */ static void __init fiddle_vdso(void) { - extern const char vdso32_default_start; - u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); +#ifdef CONFIG_X86_32 + /* + * This could be called before selected_vdso32 is initialized, so + * just fiddle with both possible images. vdso_image_32_syscall + * can't be selected, since it only exists on 64-bit systems. + */ + u32 *mask; + mask = vdso_image_32_int80.data + + vdso_image_32_int80.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; + mask = vdso_image_32_sysenter.data + + vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK; + *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; +#endif } -void __init xen_arch_setup(void) +static int register_callback(unsigned type, const void *func) { - struct physdev_set_iopl set_iopl; - int rc; + struct callback_register callback = { + .type = type, + .address = XEN_CALLBACK(__KERNEL_CS, func), + .flags = CALLBACKF_mask_events, + }; + + return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); +} + +void xen_enable_sysenter(void) +{ + int ret; + unsigned sysenter_feature; + +#ifdef CONFIG_X86_32 + sysenter_feature = X86_FEATURE_SEP; +#else + sysenter_feature = X86_FEATURE_SYSENTER32; +#endif + + if (!boot_cpu_has(sysenter_feature)) + return; + + ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); + if(ret != 0) + setup_clear_cpu_cap(sysenter_feature); +} + +void xen_enable_syscall(void) +{ +#ifdef CONFIG_X86_64 + int ret; + ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); + if (ret != 0) { + printk(KERN_ERR "Failed to set syscall callback: %d\n", ret); + /* Pretty fatal; 64-bit userspace has no other + mechanism for syscalls. */ + } + + if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { + ret = register_callback(CALLBACKTYPE_syscall32, + xen_syscall32_target); + if (ret != 0) + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); + } +#endif /* CONFIG_X86_64 */ +} + +void __init xen_pvmmu_arch_setup(void) +{ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_pae_extended_cr3); - HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, - __KERNEL_CS, (unsigned long)xen_failsafe_callback); + if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || + register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) + BUG(); - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - printk(KERN_INFO "physdev_op failed %d\n", rc); + xen_enable_sysenter(); + xen_enable_syscall(); +} + +/* This function is not called for HVM domains */ +void __init xen_arch_setup(void) +{ + xen_panic_handler_init(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_pvmmu_arch_setup(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { @@ -98,14 +636,12 @@ void __init xen_arch_setup(void) MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); - pm_idle = xen_idle; - -#ifdef CONFIG_SMP - /* fill cpus_possible with all available cpus */ - xen_fill_possible_map(); -#endif - - paravirt_disable_iospace(); - + /* Set up idle, making sure it calls safe_halt() pvop */ + disable_cpuidle(); + disable_cpufreq(); + WARN_ON(xen_set_default_idle()); fiddle_vdso(); +#ifdef CONFIG_NUMA + numa_off = 1; +#endif } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index aafc5443740..7005974c3ff 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -11,12 +11,13 @@ * useful topology information for the kernel to make use of. As a * result, all CPUs are treated as if they're single-core and * single-threaded. - * - * This does not handle HOTPLUG_CPU yet. */ #include <linux/sched.h> #include <linux/err.h> +#include <linux/slab.h> #include <linux/smp.h> +#include <linux/irq_work.h> +#include <linux/tick.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -29,229 +30,401 @@ #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/events.h> +#include <xen/hvc-console.h> #include "xen-ops.h" #include "mmu.h" -static cpumask_t cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); +cpumask_var_t xen_cpu_initialized_map; -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; +struct xen_common_irq { + int irq; + char *name; }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); - -static struct call_data_struct *call_data; +static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { + inc_irq_stat(irq_resched_count); + scheduler_ipi(); + return IRQ_HANDLED; } -static __cpuinit void cpu_bringup_and_idle(void) +static void cpu_bringup(void) { - int cpu = smp_processor_id(); + int cpu; cpu_init(); - + touch_softlockup_watchdog(); preempt_disable(); - per_cpu(cpu_state, cpu) = CPU_ONLINE; + + /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ + if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { + xen_enable_sysenter(); + xen_enable_syscall(); + } + cpu = smp_processor_id(); + smp_store_cpu_info(cpu); + cpu_data(cpu).x86_max_cores = 1; + set_cpu_sibling_map(cpu); xen_setup_cpu_clockevents(); + notify_cpu_starting(cpu); + + set_cpu_online(cpu, true); + + this_cpu_write(cpu_state, CPU_ONLINE); + + wmb(); + /* We can take interrupts now: we're officially "up". */ local_irq_enable(); wmb(); /* make sure everything is out */ - cpu_idle(); } +/* Note: cpu parameter is only relevant for PVH */ +static void cpu_bringup_and_idle(int cpu) +{ +#ifdef CONFIG_X86_64 + if (xen_feature(XENFEAT_auto_translated_physmap) && + xen_feature(XENFEAT_supervisor_mode_kernel)) + xen_pvh_secondary_vcpu_init(cpu); +#endif + cpu_bringup(); + cpu_startup_entry(CPUHP_ONLINE); +} + +static void xen_smp_intr_free(unsigned int cpu) +{ + if (per_cpu(xen_resched_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); + per_cpu(xen_resched_irq, cpu).irq = -1; + kfree(per_cpu(xen_resched_irq, cpu).name); + per_cpu(xen_resched_irq, cpu).name = NULL; + } + if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL); + per_cpu(xen_callfunc_irq, cpu).irq = -1; + kfree(per_cpu(xen_callfunc_irq, cpu).name); + per_cpu(xen_callfunc_irq, cpu).name = NULL; + } + if (per_cpu(xen_debug_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL); + per_cpu(xen_debug_irq, cpu).irq = -1; + kfree(per_cpu(xen_debug_irq, cpu).name); + per_cpu(xen_debug_irq, cpu).name = NULL; + } + if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq, + NULL); + per_cpu(xen_callfuncsingle_irq, cpu).irq = -1; + kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); + per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; + } + if (xen_hvm_domain()) + return; + + if (per_cpu(xen_irq_work, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); + per_cpu(xen_irq_work, cpu).irq = -1; + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; + } +}; static int xen_smp_intr_init(unsigned int cpu) { int rc; - const char *resched_name, *callfunc_name; - - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, xen_reschedule_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_PERCPU|IRQF_NOBALANCING, resched_name, NULL); if (rc < 0) goto fail; - per_cpu(resched_irq, cpu) = rc; + per_cpu(xen_resched_irq, cpu).irq = rc; + per_cpu(xen_resched_irq, cpu).name = resched_name; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, xen_call_function_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(xen_callfunc_irq, cpu).irq = rc; + per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; + + debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, + IRQF_PERCPU | IRQF_NOBALANCING, + debug_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_debug_irq, cpu).irq = rc; + per_cpu(xen_debug_irq, cpu).name = debug_name; + + callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, + cpu, + xen_call_function_single_interrupt, + IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; + + /* + * The IRQ worker on PVHVM goes through the native path and uses the + * IPI mechanism. + */ + if (xen_hvm_domain()) + return 0; + + callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, + cpu, + xen_irq_work_interrupt, + IRQF_PERCPU|IRQF_NOBALANCING, callfunc_name, NULL); if (rc < 0) goto fail; - per_cpu(callfunc_irq, cpu) = rc; + per_cpu(xen_irq_work, cpu).irq = rc; + per_cpu(xen_irq_work, cpu).name = callfunc_name; return 0; fail: - if (per_cpu(resched_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); - if (per_cpu(callfunc_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + xen_smp_intr_free(cpu); return rc; } -void __init xen_fill_possible_map(void) +static void __init xen_fill_possible_map(void) { int i, rc; - for (i = 0; i < NR_CPUS; i++) { + if (xen_initial_domain()) + return; + + for (i = 0; i < nr_cpu_ids; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) - cpu_set(i, cpu_possible_map); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } } } -void __init xen_smp_prepare_boot_cpu(void) +static void __init xen_filter_cpu_maps(void) { - int cpu; + int i, rc; + unsigned int subtract = 0; + + if (!xen_initial_domain()) + return; + num_processors = 0; + disabled_cpus = 0; + for (i = 0; i < nr_cpu_ids; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } else { + set_cpu_possible(i, false); + set_cpu_present(i, false); + subtract++; + } + } +#ifdef CONFIG_HOTPLUG_CPU + /* This is akin to using 'nr_cpus' on the Linux command line. + * Which is OK as when we use 'dom0_max_vcpus=X' we can only + * have up to X, while nr_cpu_ids is greater than X. This + * normally is not a problem, except when CPU hotplugging + * is involved and then there might be more than X CPUs + * in the guest - which will not work as there is no + * hypercall to expand the max number of VCPUs an already + * running guest has. So cap it up to X. */ + if (subtract) + nr_cpu_ids = nr_cpu_ids - subtract; +#endif + +} + +static void __init xen_smp_prepare_boot_cpu(void) +{ BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); - /* We've switched to the "real" per-cpu gdt, so make sure the - old memory can be recycled */ - make_lowmem_page_readwrite(&per_cpu__gdt_page); + if (xen_pv_domain()) { + if (!xen_feature(XENFEAT_writable_page_tables)) + /* We've switched to the "real" per-cpu gdt, so make + * sure the old memory can be recycled. */ + make_lowmem_page_readwrite(xen_initial_gdt); - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); +#ifdef CONFIG_X86_32 /* - * cpu_core_map lives in a per cpu area that is cleared - * when the per cpu array is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); + * Xen starts us with XEN_FLAT_RING1_DS, but linux code + * expects __USER_DS */ - } + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif - xen_setup_vcpu_info_placement(); + xen_filter_cpu_maps(); + xen_setup_vcpu_info_placement(); + } + /* + * The alternative logic (which patches the unlock/lock) runs before + * the smp bootup up code is activated. Hence we need to set this up + * the core kernel is being patched. Otherwise we will have only + * modules patched but not core code. + */ + xen_init_spinlocks(); } -void __init xen_smp_prepare_cpus(unsigned int max_cpus) +static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; + unsigned int i; - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_ map will be zeroed when the per - * cpu area is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ + if (skip_ioapic_setup) { + char *m = (max_cpus == 0) ? + "The nosmp parameter is incompatible with Xen; " \ + "use Xen dom0_max_vcpus=1 parameter" : + "The noapic parameter is incompatible with Xen"; + + xen_raw_printk(m); + panic(m); } + xen_init_lock_cpu(0); - smp_store_cpu_info(0); + smp_store_boot_cpu_info(); + cpu_data(0).x86_max_cores = 1; + + for_each_possible_cpu(i) { + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + } set_cpu_sibling_map(0); if (xen_smp_intr_init(0)) BUG(); - cpu_initialized_map = cpumask_of_cpu(0); + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) + panic("could not allocate xen_cpu_initialized_map\n"); + + cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); /* Restrict the possible_map according to max_cpus. */ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) continue; - cpu_clear(cpu, cpu_possible_map); + set_cpu_possible(cpu, false); } - for_each_possible_cpu (cpu) { - struct task_struct *idle; - - if (cpu == 0) - continue; - - idle = fork_idle(cpu); - if (IS_ERR(idle)) - panic("failed fork for CPU %d", cpu); - - cpu_set(cpu, cpu_present_map); - } - - //init_xenbus_allowed_cpumask(); + for_each_possible_cpu(cpu) + set_cpu_present(cpu, true); } -static __cpuinit int +static int cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; - struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + struct desc_struct *gdt; + unsigned long gdt_mfn; - if (cpu_test_and_set(cpu, cpu_initialized_map)) + if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (ctxt == NULL) return -ENOMEM; - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; + gdt = get_cpu_gdt_table(cpu); + +#ifdef CONFIG_X86_32 + /* Note: PVH is not yet supported on x86_32. */ ctxt->user_regs.fs = __KERNEL_PERCPU; - ctxt->user_regs.gs = 0; - ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.gs = __KERNEL_STACK_CANARY; +#endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - xen_copy_trap_info(ctxt->trap_ctxt); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; - ctxt->ldt_ents = 0; + xen_copy_trap_info(ctxt->trap_ctxt); - BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); - make_lowmem_page_readonly(gdt->gdt); + ctxt->ldt_ents = 0; - ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); - ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); - ctxt->user_regs.cs = __KERNEL_CS; - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.sp0; - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); +#ifdef CONFIG_X86_32 + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_cs = __KERNEL_CS; +#else + ctxt->gs_base_kernel = per_cpu_offset(cpu); +#endif + ctxt->event_callback_eip = + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); +#ifdef CONFIG_X86_32 + } +#else + } else + /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with + * %rdi having the cpu number - which means are passing in + * as the first parameter the cpu. Subtle! + */ + ctxt->user_regs.rdi = cpu; +#endif + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) BUG(); @@ -259,21 +432,25 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -int __cpuinit xen_cpu_up(unsigned int cpu) +static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) { - struct task_struct *idle = idle_task(cpu); int rc; -#if 0 - rc = cpu_up_check(cpu); - if (rc) - return rc; -#endif - - init_gdt(cpu); per_cpu(current_task, cpu) = idle; +#ifdef CONFIG_X86_32 irq_ctx_init(cpu); +#else + clear_tsk_thread_flag(idle, TIF_FORK); +#endif + per_cpu(kernel_stack, cpu) = + (unsigned long)task_stack_page(idle) - + KERNEL_STACK_OFFSET + THREAD_SIZE; + + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); + xen_init_lock_cpu(cpu); + + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -283,29 +460,83 @@ int __cpuinit xen_cpu_up(unsigned int cpu) return rc; if (num_online_cpus() == 1) - alternatives_smp_switch(1); + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); rc = xen_smp_intr_init(cpu); if (rc) return rc; - smp_store_cpu_info(cpu); - set_cpu_sibling_map(cpu); - /* This must be done before setting cpu_online_map */ - wmb(); - - cpu_set(cpu, cpu_online_map); - rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); BUG_ON(rc); + while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + barrier(); + } + return 0; } -void xen_smp_cpus_done(unsigned int max_cpus) +static void xen_smp_cpus_done(unsigned int max_cpus) { } +#ifdef CONFIG_HOTPLUG_CPU +static int xen_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + if (cpu == 0) + return -EBUSY; + + cpu_disable_common(); + + load_cr3(swapper_pg_dir); + return 0; +} + +static void xen_cpu_die(unsigned int cpu) +{ + while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); +} + +static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ +{ + play_dead_common(); + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + cpu_bringup(); + /* + * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) + * clears certain data that the cpu_idle loop (which called us + * and that we return from) expects. The only way to get that + * data back is to call: + */ + tick_nohz_idle_enter(); +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static int xen_cpu_disable(void) +{ + return -ENOSYS; +} + +static void xen_cpu_die(unsigned int cpu) +{ + BUG(); +} + +static void xen_play_dead(void) +{ + BUG(); +} + +#endif static void stop_self(void *v) { int cpu = smp_processor_id(); @@ -314,108 +545,232 @@ static void stop_self(void *v) load_cr3(swapper_pg_dir); /* should set up a minimal gdt */ + set_cpu_online(cpu, false); + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); BUG(); } -void xen_smp_send_stop(void) +static void xen_stop_other_cpus(int wait) { - smp_call_function(stop_self, NULL, 0, 0); + smp_call_function(stop_self, NULL, wait); } -void xen_smp_send_reschedule(int cpu) +static void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } +static void __xen_send_IPI_mask(const struct cpumask *mask, + int vector) +{ + unsigned cpu; + + for_each_cpu_and(cpu, mask, cpu_online_mask) + xen_send_IPI_one(cpu, vector); +} + +static void xen_smp_send_call_function_ipi(const struct cpumask *mask) +{ + int cpu; + + __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run if they need to. */ + for_each_cpu(cpu, mask) { + if (xen_vcpu_stolen(cpu)) { + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + break; + } + } +} + +static void xen_smp_send_call_function_single_ipi(int cpu) +{ + __xen_send_IPI_mask(cpumask_of(cpu), + XEN_CALL_FUNCTION_SINGLE_VECTOR); +} + +static inline int xen_map_vector(int vector) +{ + int xen_vector; + + switch (vector) { + case RESCHEDULE_VECTOR: + xen_vector = XEN_RESCHEDULE_VECTOR; + break; + case CALL_FUNCTION_VECTOR: + xen_vector = XEN_CALL_FUNCTION_VECTOR; + break; + case CALL_FUNCTION_SINGLE_VECTOR: + xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR; + break; + case IRQ_WORK_VECTOR: + xen_vector = XEN_IRQ_WORK_VECTOR; + break; +#ifdef CONFIG_X86_64 + case NMI_VECTOR: + case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */ + xen_vector = XEN_NMI_VECTOR; + break; +#endif + default: + xen_vector = -1; + printk(KERN_ERR "xen: vector 0x%x is not implemented\n", + vector); + } + + return xen_vector; +} + +void xen_send_IPI_mask(const struct cpumask *mask, + int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + __xen_send_IPI_mask(mask, xen_vector); +} + +void xen_send_IPI_all(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + __xen_send_IPI_mask(cpu_online_mask, xen_vector); +} -static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) +void xen_send_IPI_self(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + xen_send_IPI_one(smp_processor_id(), xen_vector); +} + +void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) { unsigned cpu; + unsigned int this_cpu = smp_processor_id(); + int xen_vector = xen_map_vector(vector); - cpus_and(mask, mask, cpu_online_map); + if (!(num_online_cpus() > 1) || (xen_vector < 0)) + return; - for_each_cpu_mask(cpu, mask) - xen_send_IPI_one(cpu, vector); + for_each_cpu_and(cpu, mask, cpu_online_mask) { + if (this_cpu == cpu) + continue; + + xen_send_IPI_one(cpu, xen_vector); + } } -static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) +void xen_send_IPI_allbutself(int vector) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; + xen_send_IPI_mask_allbutself(cpu_online_mask, vector); +} - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ +static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) +{ irq_enter(); - (*func)(info); - __get_cpu_var(irq_stat).irq_call_count++; + generic_smp_call_function_interrupt(); + inc_irq_stat(irq_call_count); irq_exit(); - if (wait) { - mb(); /* commit everything before setting finished */ - atomic_inc(&call_data->finished); - } + return IRQ_HANDLED; +} + +static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) +{ + irq_enter(); + generic_smp_call_function_single_interrupt(); + inc_irq_stat(irq_call_count); + irq_exit(); return IRQ_HANDLED; } -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait) +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) { - struct call_data_struct data; - int cpus, cpu; - bool yield; + irq_enter(); + irq_work_run(); + inc_irq_stat(apic_irq_work_irqs); + irq_exit(); - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); + return IRQ_HANDLED; +} - cpu_clear(smp_processor_id(), mask); +static const struct smp_ops xen_smp_ops __initconst = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .smp_cpus_done = xen_smp_cpus_done, - cpus = cpus_weight(mask); - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } + .cpu_up = xen_cpu_up, + .cpu_die = xen_cpu_die, + .cpu_disable = xen_cpu_disable, + .play_dead = xen_play_dead, - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + .stop_other_cpus = xen_stop_other_cpus, + .smp_send_reschedule = xen_smp_send_reschedule, - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, +}; - call_data = &data; - mb(); /* write everything before IPI */ +void __init xen_smp_init(void) +{ + smp_ops = xen_smp_ops; + xen_fill_possible_map(); +} - /* Send a message to other CPUs and wait for them to respond */ - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); +static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) +{ + native_smp_prepare_cpus(max_cpus); + WARN_ON(xen_smp_intr_init(0)); - /* Make sure other vcpus get a chance to run if they need to. */ - yield = false; - for_each_cpu_mask(cpu, mask) - if (xen_vcpu_stolen(cpu)) - yield = true; + xen_init_lock_cpu(0); +} - if (yield) - HYPERVISOR_sched_op(SCHEDOP_yield, 0); +static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) +{ + int rc; + /* + * xen_smp_intr_init() needs to run before native_cpu_up() + * so that IPI vectors are set up on the booting CPU before + * it is marked online in native_cpu_up(). + */ + rc = xen_smp_intr_init(cpu); + WARN_ON(rc); + if (!rc) + rc = native_cpu_up(cpu, tidle); - /* Wait for response */ - while (atomic_read(&data.started) != cpus || - (wait && atomic_read(&data.finished) != cpus)) - cpu_relax(); + /* + * We must initialize the slowpath CPU kicker _after_ the native + * path has executed. If we initialized it before none of the + * unlocker IPI kicks would reach the booting CPU as the booting + * CPU had not set itself 'online' in cpu_online_mask. That mask + * is checked when IPIs are sent (on HVM at least). + */ + xen_init_lock_cpu(cpu); + return rc; +} - spin_unlock(&call_lock); +static void xen_hvm_cpu_die(unsigned int cpu) +{ + xen_cpu_die(cpu); + native_cpu_die(cpu); +} - return 0; +void __init xen_hvm_smp_init(void) +{ + if (!xen_have_vector_callback) + return; + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; + smp_ops.cpu_up = xen_hvm_cpu_up; + smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; + smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; + smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; } diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h new file mode 100644 index 00000000000..c7c2d89efd7 --- /dev/null +++ b/arch/x86/xen/smp.h @@ -0,0 +1,11 @@ +#ifndef _XEN_SMP_H + +extern void xen_send_IPI_mask(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_allbutself(int vector); +extern void xen_send_IPI_all(int vector); +extern void xen_send_IPI_self(int vector); + +#endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c new file mode 100644 index 00000000000..0ba5f3b967f --- /dev/null +++ b/arch/x86/xen/spinlock.c @@ -0,0 +1,348 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include <linux/kernel_stat.h> +#include <linux/spinlock.h> +#include <linux/debugfs.h> +#include <linux/log2.h> +#include <linux/gfp.h> +#include <linux/slab.h> + +#include <asm/paravirt.h> + +#include <xen/interface/xen.h> +#include <xen/events.h> + +#include "xen-ops.h" +#include "debugfs.h" + +enum xen_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + TAKEN_SLOW_SPURIOUS, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; + + +#ifdef CONFIG_XEN_DEBUG_FS +#define HISTO_BUCKETS 30 +static struct xen_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + u8 ret; + u8 old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + } +} + +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} + +static inline u64 spin_time_start(void) +{ + return xen_clocksource_read(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index = ilog2(delta); + + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} +#else /* !CONFIG_XEN_DEBUG_FS */ +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ + +struct xen_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; +}; + +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(char *, irq_name); +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); +static cpumask_t waiting_cpus; + +static bool xen_pvspin = true; +__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +{ + int irq = __this_cpu_read(lock_kicker_irq); + struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); + int cpu = smp_processor_id(); + u64 start; + unsigned long flags; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return; + + start = spin_time_start(); + + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + /* + * We don't really care if we're overwriting some other + * (lock,want) pair, as that would mean that we're currently + * in an interrupt context, and the outer context had + * interrupts enabled. That has already kicked the VCPU out + * of xen_poll_irq(), so it will just return spuriously and + * retry with newly setup (lock,want). + * + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; + + /* This uses set_bit, which atomic and therefore a barrier */ + cpumask_set_cpu(cpu, &waiting_cpus); + add_stats(TAKEN_SLOW, 1); + + /* clear pending */ + xen_clear_irq_pending(irq); + + /* Only check lock once pending cleared */ + barrier(); + + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); + + /* + * check again make sure it didn't become free while + * we weren't looking + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } + + /* Allow interrupts while blocked */ + local_irq_restore(flags); + + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ + + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); + add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); + + local_irq_save(flags); + + kstat_incr_irq_this_cpu(irq); +out: + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; + + local_irq_restore(flags); + + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); + +static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) +{ + int cpu; + + add_stats(RELEASED_SLOW, 1); + + for_each_cpu(cpu, &waiting_cpus) { + const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); + + /* Make sure we read lock before want */ + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == next) { + add_stats(RELEASED_SLOW_KICKED, 1); + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; + } + } +} + +static irqreturn_t dummy_handler(int irq, void *dev_id) +{ + BUG(); + return IRQ_HANDLED; +} + +void xen_init_lock_cpu(int cpu) +{ + int irq; + char *name; + + if (!xen_pvspin) + return; + + WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", + cpu, per_cpu(lock_kicker_irq, cpu)); + + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, + cpu, + dummy_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + name, + NULL); + + if (irq >= 0) { + disable_irq(irq); /* make sure it's never delivered */ + per_cpu(lock_kicker_irq, cpu) = irq; + per_cpu(irq_name, cpu) = name; + } + + printk("cpu %d spinlock event irq %d\n", cpu, irq); +} + +void xen_uninit_lock_cpu(int cpu) +{ + if (!xen_pvspin) + return; + + unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); + per_cpu(lock_kicker_irq, cpu) = -1; + kfree(per_cpu(irq_name, cpu)); + per_cpu(irq_name, cpu) = NULL; +} + + +/* + * Our init of PV spinlocks is split in two init functions due to us + * using paravirt patching and jump labels patching and having to do + * all of this before SMP code is invoked. + * + * The paravirt patching needs to be done _before_ the alternative asm code + * is started, otherwise we would not patch the core kernel code. + */ +void __init xen_init_spinlocks(void) +{ + + if (!xen_pvspin) { + printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); + return; + } + printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); + pv_lock_ops.unlock_kick = xen_unlock_kick; +} + +/* + * While the jump_label init code needs to happend _after_ the jump labels are + * enabled and before SMP is started. Hence we use pre-SMP initcall level + * init. We cannot do it in xen_init_spinlocks as that is done before + * jump labels are activated. + */ +static __init int xen_init_spinlocks_jump(void) +{ + if (!xen_pvspin) + return 0; + + if (!xen_domain()) + return 0; + + static_key_slow_inc(¶virt_ticketlocks_enabled); + return 0; +} +early_initcall(xen_init_spinlocks_jump); + +static __init int xen_parse_nopvspin(char *arg) +{ + xen_pvspin = false; + return 0; +} +early_param("xen_nopvspin", xen_parse_nopvspin); + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_spin_debug; + +static int __init xen_spinlock_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + if (!xen_pvspin) + return 0; + + d_spin_debug = debugfs_create_dir("spinlocks", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW]); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); + debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); + + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW]); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); + + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(xen_spinlock_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c new file mode 100644 index 00000000000..c4df9dbd63b --- /dev/null +++ b/arch/x86/xen/suspend.c @@ -0,0 +1,97 @@ +#include <linux/types.h> +#include <linux/clockchips.h> + +#include <xen/interface/xen.h> +#include <xen/grant_table.h> +#include <xen/events.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> +#include <asm/fixmap.h> + +#include "xen-ops.h" +#include "mmu.h" + +static void xen_pv_pre_suspend(void) +{ + xen_mm_pin_all(); + + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); + + BUG_ON(!irqs_disabled()); + + HYPERVISOR_shared_info = &xen_dummy_shared_info; + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + __pte_ma(0), 0)) + BUG(); +} + +static void xen_hvm_post_suspend(int suspend_cancelled) +{ +#ifdef CONFIG_XEN_PVHVM + int cpu; + xen_hvm_init_shared_info(); + xen_callback_vector(); + xen_unplug_emulated_devices(); + if (xen_feature(XENFEAT_hvm_safe_pvclock)) { + for_each_online_cpu(cpu) { + xen_setup_runstate_info(cpu); + } + } +#endif +} + +static void xen_pv_post_suspend(int suspend_cancelled) +{ + xen_build_mfn_list_list(); + + xen_setup_shared_info(); + + if (suspend_cancelled) { + xen_start_info->store_mfn = + pfn_to_mfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + pfn_to_mfn(xen_start_info->console.domU.mfn); + } else { +#ifdef CONFIG_SMP + BUG_ON(xen_cpu_initialized_map == NULL); + cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); +#endif + xen_vcpu_restore(); + } + + xen_mm_unpin_all(); +} + +void xen_arch_pre_suspend(void) +{ + if (xen_pv_domain()) + xen_pv_pre_suspend(); +} + +void xen_arch_post_suspend(int cancelled) +{ + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); +} + +static void xen_vcpu_notify_restore(void *data) +{ + unsigned long reason = (unsigned long)data; + + /* Boot processor notified via generic timekeeping_resume() */ + if ( smp_processor_id() == 0) + return; + + clockevents_notify(reason, NULL); +} + +void xen_arch_resume(void) +{ + on_each_cpu(xen_vcpu_notify_restore, + (void *)CLOCK_EVT_NOTIFY_RESUME, 1); +} diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5aa24..7b78f88c170 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -12,44 +12,34 @@ #include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/kernel_stat.h> +#include <linux/math64.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/pvclock_gtod.h> +#include <asm/pvclock.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include <xen/events.h> +#include <xen/features.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> #include "xen-ops.h" -#define XEN_SHIFT 22 - /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -static cycle_t xen_clocksource_read(void); - -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); - /* runstate info updated by Xen */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); /* snapshots of runstate info */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); -/* unused ns of stolen and blocked time */ -static DEFINE_PER_CPU(u64, residual_stolen); -static DEFINE_PER_CPU(u64, residual_blocked); +/* unused ns of stolen time */ +static DEFINE_PER_CPU(u64, xen_residual_stolen); /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) @@ -90,7 +80,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) BUG_ON(preemptible()); - state = &__get_cpu_var(runstate); + state = &__get_cpu_var(xen_runstate); /* * The runstate info is always updated by the hypervisor on @@ -108,14 +98,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) /* return true when a vcpu could run but has no real cpu to run on */ bool xen_vcpu_stolen(int vcpu) { - return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; + return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } -static void setup_runstate_info(int cpu) +void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; - area.addr.v = &per_cpu(runstate, cpu); + area.addr.v = &per_cpu(xen_runstate, cpu); if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area)) @@ -126,249 +116,125 @@ static void do_stolen_accounting(void) { struct vcpu_runstate_info state; struct vcpu_runstate_info *snap; - s64 blocked, runnable, offline, stolen; + s64 runnable, offline, stolen; cputime_t ticks; get_runstate_snapshot(&state); WARN_ON(state.state != RUNSTATE_running); - snap = &__get_cpu_var(runstate_snapshot); + snap = &__get_cpu_var(xen_runstate_snapshot); /* work out how much time the VCPU has not been runn*ing* */ - blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; *snap = state; /* Add the appropriate number of ticks of stolen time, - including any left-overs from last time. Passing NULL to - account_steal_time accounts the time as stolen. */ - stolen = runnable + offline + __get_cpu_var(residual_stolen); + including any left-overs from last time. */ + stolen = runnable + offline + __this_cpu_read(xen_residual_stolen); if (stolen < 0) stolen = 0; - ticks = 0; - while (stolen >= NS_PER_TICK) { - ticks++; - stolen -= NS_PER_TICK; - } - __get_cpu_var(residual_stolen) = stolen; - account_steal_time(NULL, ticks); - - /* Add the appropriate number of ticks of blocked time, - including any left-overs from last time. Passing idle to - account_steal_time accounts the time as idle/wait. */ - blocked += __get_cpu_var(residual_blocked); - - if (blocked < 0) - blocked = 0; - - ticks = 0; - while (blocked >= NS_PER_TICK) { - ticks++; - blocked -= NS_PER_TICK; - } - __get_cpu_var(residual_blocked) = blocked; - account_steal_time(idle_task(smp_processor_id()), ticks); + ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); + __this_cpu_write(xen_residual_stolen, stolen); + account_steal_ticks(ticks); } -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) +/* Get the TSC speed from Xen */ +static unsigned long xen_tsc_khz(void) { - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. - */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - -/* Get the CPU speed from Xen */ -unsigned long xen_cpu_khz(void) -{ - u64 xen_khz = 1000000ULL << 32; - const struct vcpu_time_info *info = + struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(xen_khz, info->tsc_to_system_mul); - if (info->tsc_shift < 0) - xen_khz <<= -info->tsc_shift; - else - xen_khz >>= info->tsc_shift; - - return xen_khz; + return pvclock_tsc_khz(info); } -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) +cycle_t xen_clocksource_read(void) { - struct vcpu_time_info *src; - struct shadow_time_info *dst; + struct pvclock_vcpu_time_info *src; + cycle_t ret; - /* src is shared memory with the hypervisor, so we need to - make sure we get a consistent snapshot, even in the face of - being preempted. */ + preempt_disable_notrace(); src = &__get_cpu_var(xen_vcpu)->time; - dst = &__get_cpu_var(shadow_time); - - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) | (dst->version ^ src->version)); - - return dst->version; + ret = pvclock_clocksource_read(src); + preempt_enable_notrace(); + return ret; } -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +static cycle_t xen_clocksource_get_cycles(struct clocksource *cs) { - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif + return xen_clocksource_read(); +} - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif +static void xen_read_wallclock(struct timespec *ts) +{ + struct shared_info *s = HYPERVISOR_shared_info; + struct pvclock_wall_clock *wall_clock = &(s->wc); + struct pvclock_vcpu_time_info *vcpu_time; - return product; + vcpu_time = &get_cpu_var(xen_vcpu)->time; + pvclock_read_wallclock(wall_clock, vcpu_time, ts); + put_cpu_var(xen_vcpu); } -static u64 get_nsec_offset(struct shadow_time_info *shadow) +static void xen_get_wallclock(struct timespec *now) { - u64 now, delta; - now = native_read_tsc(); - delta = now - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); + xen_read_wallclock(now); } -static cycle_t xen_clocksource_read(void) +static int xen_set_wallclock(const struct timespec *now) { - struct shadow_time_info *shadow = &get_cpu_var(shadow_time); - cycle_t ret; - unsigned version; - - do { - version = get_time_values_from_xen(); - barrier(); - ret = shadow->system_timestamp + get_nsec_offset(shadow); - barrier(); - } while (version != __get_cpu_var(xen_vcpu)->time.version); - - put_cpu_var(shadow_time); - - return ret; + return -1; } -static void xen_read_wallclock(struct timespec *ts) +static int xen_pvclock_gtod_notify(struct notifier_block *nb, + unsigned long was_set, void *priv) { - const struct shared_info *s = HYPERVISOR_shared_info; - u32 version; - u64 delta; - struct timespec now; + /* Protected by the calling core code serialization */ + static struct timespec next_sync; - /* get wallclock at system boot */ - do { - version = s->wc_version; - rmb(); /* fetch version before time */ - now.tv_sec = s->wc_sec; - now.tv_nsec = s->wc_nsec; - rmb(); /* fetch time before checking version */ - } while ((s->wc_version & 1) | (version ^ s->wc_version)); + struct xen_platform_op op; + struct timespec now; - delta = xen_clocksource_read(); /* time since system boot */ - delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + now = __current_kernel_time(); - now.tv_nsec = do_div(delta, NSEC_PER_SEC); - now.tv_sec = delta; + /* + * We only take the expensive HV call when the clock was set + * or when the 11 minutes RTC synchronization time elapsed. + */ + if (!was_set && timespec_compare(&now, &next_sync) < 0) + return NOTIFY_OK; - set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); -} + op.cmd = XENPF_settime; + op.u.settime.secs = now.tv_sec; + op.u.settime.nsecs = now.tv_nsec; + op.u.settime.system_time = xen_clocksource_read(); -unsigned long xen_get_wallclock(void) -{ - struct timespec ts; + (void)HYPERVISOR_dom0_op(&op); - xen_read_wallclock(&ts); + /* + * Move the next drift compensation time 11 minutes + * ahead. That's emulating the sync_cmos_clock() update for + * the hardware RTC. + */ + next_sync = now; + next_sync.tv_sec += 11 * 60; - return ts.tv_sec; + return NOTIFY_OK; } -int xen_set_wallclock(unsigned long now) -{ - /* do nothing for domU */ - return -1; -} +static struct notifier_block xen_pvclock_gtod_notifier = { + .notifier_call = xen_pvclock_gtod_notify, +}; static struct clocksource xen_clocksource __read_mostly = { .name = "xen", .rating = 400, - .read = xen_clocksource_read, + .read = xen_clocksource_get_cycles, .mask = ~0, - .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ - .shift = XEN_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -522,11 +388,16 @@ static const struct clock_event_device xen_vcpuop_clockevent = { static const struct clock_event_device *xen_clockevent = &xen_timerop_clockevent; -static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); + +struct xen_clock_event_device { + struct clock_event_device evt; + char *name; +}; +static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 }; static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) { - struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); + struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt; irqreturn_t ret; ret = IRQ_NONE; @@ -540,12 +411,31 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) return ret; } +void xen_teardown_timer(int cpu) +{ + struct clock_event_device *evt; + BUG_ON(cpu == 0); + evt = &per_cpu(xen_clock_events, cpu).evt; + + if (evt->irq >= 0) { + unbind_from_irqhandler(evt->irq, NULL); + evt->irq = -1; + kfree(per_cpu(xen_clock_events, cpu).name); + per_cpu(xen_clock_events, cpu).name = NULL; + } +} + void xen_setup_timer(int cpu) { - const char *name; + char *name; struct clock_event_device *evt; int irq; + evt = &per_cpu(xen_clock_events, cpu).evt; + WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu); + if (evt->irq >= 0) + xen_teardown_timer(cpu); + printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); name = kasprintf(GFP_KERNEL, "timer%d", cpu); @@ -553,32 +443,51 @@ void xen_setup_timer(int cpu) name = "<timer kasprintf failed>"; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| + IRQF_FORCE_RESUME, name, NULL); + (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); - evt = &per_cpu(xen_clock_events, cpu); memcpy(evt, xen_clockevent, sizeof(*evt)); - evt->cpumask = cpumask_of_cpu(cpu); + evt->cpumask = cpumask_of(cpu); evt->irq = irq; - - setup_runstate_info(cpu); + per_cpu(xen_clock_events, cpu).name = name; } + void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); - clockevents_register_device(&__get_cpu_var(xen_clock_events)); + clockevents_register_device(&__get_cpu_var(xen_clock_events).evt); } -__init void xen_time_init(void) +void xen_timer_resume(void) { - int cpu = smp_processor_id(); + int cpu; - get_time_values_from_xen(); + pvclock_resume(); - clocksource_register(&xen_clocksource); + if (xen_clockevent != &xen_vcpuop_clockevent) + return; + + for_each_online_cpu(cpu) { + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + } +} + +static const struct pv_time_ops xen_time_ops __initconst = { + .sched_clock = xen_clocksource_read, +}; + +static void __init xen_time_init(void) +{ + int cpu = smp_processor_id(); + struct timespec tp; + + clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { /* Successfully turned off 100Hz tick, so we have the @@ -588,12 +497,66 @@ __init void xen_time_init(void) } /* Set initial system time with full resolution */ - xen_read_wallclock(&xtime); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + xen_read_wallclock(&tp); + do_settimeofday(&tp); setup_force_cpu_cap(X86_FEATURE_TSC); + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); + + if (xen_initial_domain()) + pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } + +void __init xen_init_time_ops(void) +{ + pv_time_ops = xen_time_ops; + + x86_init.timers.timer_init = xen_time_init; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + /* Dom0 uses the native method to set the hardware RTC. */ + if (!xen_initial_domain()) + x86_platform.set_wallclock = xen_set_wallclock; +} + +#ifdef CONFIG_XEN_PVHVM +static void xen_hvm_setup_cpu_clockevents(void) +{ + int cpu = smp_processor_id(); + xen_setup_runstate_info(cpu); + /* + * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence + * doing it xen_hvm_cpu_notify (which gets called by smp_init during + * early bootup and also during CPU hotplug events). + */ + xen_setup_cpu_clockevents(); +} + +void __init xen_hvm_init_time_ops(void) +{ + /* vector callback is needed otherwise we cannot receive interrupts + * on cpu > 0 and at this point we don't know how many cpus are + * available */ + if (!xen_have_vector_callback) + return; + if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { + printk(KERN_INFO "Xen doesn't support pvclock on HVM," + "disable pv timer\n"); + return; + } + + pv_time_ops = xen_time_ops; + x86_init.timers.setup_percpu_clockev = xen_time_init; + x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} +#endif diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c new file mode 100644 index 00000000000..520022d1a18 --- /dev/null +++ b/arch/x86/xen/trace.c @@ -0,0 +1,62 @@ +#include <linux/ftrace.h> +#include <xen/interface/xen.h> + +#define N(x) [__HYPERVISOR_##x] = "("#x")" +static const char *xen_hypercall_names[] = { + N(set_trap_table), + N(mmu_update), + N(set_gdt), + N(stack_switch), + N(set_callbacks), + N(fpu_taskswitch), + N(sched_op_compat), + N(dom0_op), + N(set_debugreg), + N(get_debugreg), + N(update_descriptor), + N(memory_op), + N(multicall), + N(update_va_mapping), + N(set_timer_op), + N(event_channel_op_compat), + N(xen_version), + N(console_io), + N(physdev_op_compat), + N(grant_table_op), + N(vm_assist), + N(update_va_mapping_otherdomain), + N(iret), + N(vcpu_op), + N(set_segment_base), + N(mmuext_op), + N(acm_op), + N(nmi_op), + N(sched_op), + N(callback_op), + N(xenoprof_op), + N(event_channel_op), + N(physdev_op), + N(hvm_op), + +/* Architecture-specific hypercall definitions. */ + N(arch_0), + N(arch_1), + N(arch_2), + N(arch_3), + N(arch_4), + N(arch_5), + N(arch_6), + N(arch_7), +}; +#undef N + +static const char *xen_hypercall_name(unsigned op) +{ + if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL) + return xen_hypercall_names[op]; + + return ""; +} + +#define CREATE_TRACE_POINTS +#include <trace/events/xen.h> diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c new file mode 100644 index 00000000000..6722e3733f0 --- /dev/null +++ b/arch/x86/xen/vga.c @@ -0,0 +1,74 @@ +#include <linux/screen_info.h> +#include <linux/init.h> + +#include <asm/bootparam.h> +#include <asm/setup.h> + +#include <xen/interface/xen.h> + +#include "xen-ops.h" + +void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) +{ + struct screen_info *screen_info = &boot_params.screen_info; + + /* This is drawn from a dump from vgacon:startup in + * standard Linux. */ + screen_info->orig_video_mode = 3; + screen_info->orig_video_isVGA = 1; + screen_info->orig_video_lines = 25; + screen_info->orig_video_cols = 80; + screen_info->orig_video_ega_bx = 3; + screen_info->orig_video_points = 16; + screen_info->orig_y = screen_info->orig_video_lines - 1; + + switch (info->video_type) { + case XEN_VGATYPE_TEXT_MODE_3: + if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3) + + sizeof(info->u.text_mode_3)) + break; + screen_info->orig_video_lines = info->u.text_mode_3.rows; + screen_info->orig_video_cols = info->u.text_mode_3.columns; + screen_info->orig_x = info->u.text_mode_3.cursor_x; + screen_info->orig_y = info->u.text_mode_3.cursor_y; + screen_info->orig_video_points = + info->u.text_mode_3.font_height; + break; + + case XEN_VGATYPE_EFI_LFB: + case XEN_VGATYPE_VESA_LFB: + if (size < offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps)) + break; + screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB; + screen_info->lfb_width = info->u.vesa_lfb.width; + screen_info->lfb_height = info->u.vesa_lfb.height; + screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel; + screen_info->lfb_base = info->u.vesa_lfb.lfb_base; + screen_info->lfb_size = info->u.vesa_lfb.lfb_size; + screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line; + screen_info->red_size = info->u.vesa_lfb.red_size; + screen_info->red_pos = info->u.vesa_lfb.red_pos; + screen_info->green_size = info->u.vesa_lfb.green_size; + screen_info->green_pos = info->u.vesa_lfb.green_pos; + screen_info->blue_size = info->u.vesa_lfb.blue_size; + screen_info->blue_pos = info->u.vesa_lfb.blue_pos; + screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; + screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; + + if (info->video_type == XEN_VGATYPE_EFI_LFB) { + screen_info->orig_video_isVGA = VIDEO_TYPE_EFI; + break; + } + + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps) + + sizeof(info->u.vesa_lfb.gbl_caps)) + screen_info->capabilities = info->u.vesa_lfb.gbl_caps; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.mode_attrs) + + sizeof(info->u.vesa_lfb.mode_attrs)) + screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; + break; + } +} diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 6b7190449d0..3e45aa00071 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -1,47 +1,39 @@ /* - Asm versions of Xen pv-ops, suitable for either direct use or inlining. - The inline versions are the same as the direct-use versions, with the - pre- and post-amble chopped off. - - This code is encoded for size rather than absolute efficiency, - with a view to being able to inline as much as possible. - - We only bother with direct forms (ie, vcpu in pda) of the operations - here; the indirect forms are better handled in C, since they're - generally too large to inline anyway. + * Asm versions of Xen pv-ops, suitable for either direct use or + * inlining. The inline versions are the same as the direct-use + * versions, with the pre- and post-amble chopped off. + * + * This code is encoded for size rather than absolute efficiency, with + * a view to being able to inline as much as possible. + * + * We only bother with direct forms (ie, vcpu in percpu data) of the + * operations here; the indirect forms are better handled in C, since + * they're generally too large to inline anyway. */ -#include <linux/linkage.h> - #include <asm/asm-offsets.h> -#include <asm/thread_info.h> #include <asm/percpu.h> #include <asm/processor-flags.h> -#include <asm/segment.h> - -#include <xen/interface/xen.h> -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 +#include "xen-asm.h" /* - Enable events. This clears the event mask and tests the pending - event status with one and operation. If there are pending - events, then enter the hypervisor to get them handled. + * Enable events. This clears the event mask and tests the pending + * event status with one and operation. If there are pending events, + * then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ + /* + * Preempt here doesn't matter because that will deal with any + * pending interrupts. The pending check may end up being run + * on the wrong CPU, but that doesn't hurt. + */ /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jz 1f 2: call check_events @@ -53,29 +45,29 @@ ENDPATCH(xen_irq_enable_direct) /* - Disabling events is simply a matter of making the event mask - non-zero. + * Disabling events is simply a matter of making the event mask + * non-zero. */ ENTRY(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask ENDPATCH(xen_irq_disable_direct) ret ENDPROC(xen_irq_disable_direct) RELOC(xen_irq_disable_direct, 0) /* - (xen_)save_fl is used to get the current interrupt enable status. - Callers expect the status to be in X86_EFLAGS_IF, and other bits - may be set in the return value. We take advantage of this by - making sure that X86_EFLAGS_IF has the right value (and other bits - in that byte are 0), but other bits in the return value are - undefined. We need to toggle the state of the bit, because - Xen and x86 use opposite senses (mask vs enable). + * (xen_)save_fl is used to get the current interrupt enable status. + * Callers expect the status to be in X86_EFLAGS_IF, and other bits + * may be set in the return value. We take advantage of this by + * making sure that X86_EFLAGS_IF has the right value (and other bits + * in that byte are 0), but other bits in the return value are + * undefined. We need to toggle the state of the bit, because Xen and + * x86 use opposite senses (mask vs enable). */ ENTRY(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah - addb %ah,%ah + addb %ah, %ah ENDPATCH(xen_save_fl_direct) ret ENDPROC(xen_save_fl_direct) @@ -83,23 +75,28 @@ ENDPATCH(xen_save_fl_direct) /* - In principle the caller should be passing us a value return - from xen_save_fl_direct, but for robustness sake we test only - the X86_EFLAGS_IF flag rather than the whole byte. After - setting the interrupt mask state, it checks for unmasked - pending events and enters the hypervisor to get them delivered - if so. + * In principle the caller should be passing us a value return from + * xen_save_fl_direct, but for robustness sake we test only the + * X86_EFLAGS_IF flag rather than the whole byte. After setting the + * interrupt mask state, it checks for unmasked pending events and + * enters the hypervisor to get them delivered if so. */ ENTRY(xen_restore_fl_direct) +#ifdef CONFIG_X86_64 + testw $X86_EFLAGS_IF, %di +#else testb $X86_EFLAGS_IF>>8, %ah - setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ +#endif + setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + /* + * Preempt here doesn't matter because that will deal with any + * pending interrupts. The pending check may end up being run + * on the wrong CPU, but that doesn't hurt. + */ /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - jz 1f + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + jnz 1f 2: call check_events 1: ENDPATCH(xen_restore_fl_direct) @@ -107,190 +104,39 @@ ENDPATCH(xen_restore_fl_direct) ENDPROC(xen_restore_fl_direct) RELOC(xen_restore_fl_direct, 2b+1) -/* - This is run where a normal iret would be run, with the same stack setup: - 8: eflags - 4: cs - esp-> 0: eip - - This attempts to make sure that any pending events are dealt - with on return to usermode, but there is a small window in - which an event can happen just before entering usermode. If - the nested interrupt ends up setting one of the TIF_WORK_MASK - pending work flags, they will not be tested again before - returning to usermode. This means that a process can end up - with pending work, which will be unprocessed until the process - enters and leaves the kernel again, which could be an - unbounded amount of time. This means that a pending signal or - reschedule event could be indefinitely delayed. - - The fix is to notice a nested interrupt in the critical - window, and if one occurs, then fold the nested interrupt into - the current interrupt stack frame, and re-process it - iteratively rather than recursively. This means that it will - exit via the normal path, and all pending work will be dealt - with appropriately. - - Because the nested interrupt handler needs to deal with the - current stack state in whatever form its in, we keep things - simple by only using a single register which is pushed/popped - on the stack. - - Non-direct iret could be done in the same way, but it would - require an annoying amount of code duplication. We'll assume - that direct mode will be the common case once the hypervisor - support becomes commonplace. - */ -ENTRY(xen_iret_direct) - /* test eflags for special cases */ - testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) - jnz hyper_iret - - push %eax - ESP_OFFSET=4 # bytes pushed onto stack - - /* Store vcpu_info pointer for easy access. Do it this - way to avoid having to reload %fs */ -#ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl TI_cpu(%eax),%eax - movl __per_cpu_offset(,%eax,4),%eax - lea per_cpu__xen_vcpu_info(%eax),%eax -#else - movl $per_cpu__xen_vcpu_info, %eax -#endif - - /* check IF state we're restoring */ - testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) - - /* Maybe enable events. Once this happens we could get a - recursive event, so the critical region starts immediately - afterwards. However, if that happens we don't end up - resuming the code, so we don't have to be worried about - being preempted to another CPU. */ - setz XEN_vcpu_info_mask(%eax) -xen_iret_start_crit: - - /* check for unmasked and pending */ - cmpw $0x0001, XEN_vcpu_info_pending(%eax) - - /* If there's something pending, mask events again so we - can jump back into xen_hypervisor_callback */ - sete XEN_vcpu_info_mask(%eax) - - popl %eax - - /* From this point on the registers are restored and the stack - updated, so we don't need to worry about it if we're preempted */ -iret_restore_end: - - /* Jump to hypervisor_callback after fixing up the stack. - Events are masked, so jumping out of the critical - region is OK. */ - je xen_hypervisor_callback - - iret -xen_iret_end_crit: - -hyper_iret: - /* put this out of line since its very rarely used */ - jmp hypercall_page + __HYPERVISOR_iret * 32 - - .globl xen_iret_start_crit, xen_iret_end_crit - -/* - This is called by xen_hypervisor_callback in entry.S when it sees - that the EIP at the time of interrupt was between xen_iret_start_crit - and xen_iret_end_crit. We're passed the EIP in %eax so we can do - a more refined determination of what to do. - - The stack format at this point is: - ---------------- - ss : (ss/esp may be present if we came from usermode) - esp : - eflags } outer exception info - cs } - eip } - ---------------- <- edi (copy dest) - eax : outer eax if it hasn't been restored - ---------------- - eflags } nested exception info - cs } (no ss/esp because we're nested - eip } from the same ring) - orig_eax }<- esi (copy src) - - - - - - - - - - fs } - es } - ds } SAVE_ALL state - eax } - : : - ebx } - ---------------- - return addr <- esp - ---------------- - - In order to deliver the nested exception properly, we need to shift - everything from the return addr up to the error code so it - sits just under the outer exception info. This means that when we - handle the exception, we do it in the context of the outer exception - rather than starting a new one. - - The only caveat is that if the outer eax hasn't been - restored yet (ie, it's still on stack), we need to insert - its value into the SAVE_ALL state before going on, since - it's usermode state which we eventually need to restore. - */ -ENTRY(xen_iret_crit_fixup) - /* offsets +4 for return address */ - - /* - Paranoia: Make sure we're really coming from userspace. - One could imagine a case where userspace jumps into the - critical range address, but just before the CPU delivers a GP, - it decides to deliver an interrupt instead. Unlikely? - Definitely. Easy to avoid? Yes. The Intel documents - explicitly say that the reported EIP for a bad jump is the - jump instruction itself, not the destination, but some virtual - environments get this wrong. - */ - movl PT_CS+4(%esp), %ecx - andl $SEGMENT_RPL_MASK, %ecx - cmpl $USER_RPL, %ecx - je 2f - - lea PT_ORIG_EAX+4(%esp), %esi - lea PT_EFLAGS+4(%esp), %edi - - /* If eip is before iret_restore_end then stack - hasn't been restored yet. */ - cmp $iret_restore_end, %eax - jae 1f - - movl 0+4(%edi),%eax /* copy EAX */ - movl %eax, PT_EAX+4(%esp) - - lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ - - /* set up the copy */ -1: std - mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ - rep movsl - cld - - lea 4(%edi),%esp /* point esp to new frame */ -2: ret - /* - Force an event check by making a hypercall, - but preserve regs before making the call. + * Force an event check by making a hypercall, but preserve regs + * before making the call. */ check_events: +#ifdef CONFIG_X86_32 push %eax push %ecx push %edx - call force_evtchn_callback + call xen_force_evtchn_callback pop %edx pop %ecx pop %eax +#else + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call xen_force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax +#endif ret diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h new file mode 100644 index 00000000000..465276467a4 --- /dev/null +++ b/arch/x86/xen/xen-asm.h @@ -0,0 +1,12 @@ +#ifndef _XEN_XEN_ASM_H +#define _XEN_XEN_ASM_H + +#include <linux/linkage.h> + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +#endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S new file mode 100644 index 00000000000..fd92a64d748 --- /dev/null +++ b/arch/x86/xen/xen-asm_32.S @@ -0,0 +1,237 @@ +/* + * Asm versions of Xen pv-ops, suitable for either direct use or + * inlining. The inline versions are the same as the direct-use + * versions, with the pre- and post-amble chopped off. + * + * This code is encoded for size rather than absolute efficiency, with + * a view to being able to inline as much as possible. + * + * We only bother with direct forms (ie, vcpu in pda) of the + * operations here; the indirect forms are better handled in C, since + * they're generally too large to inline anyway. + */ + +#include <asm/thread_info.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> +#include <asm/asm.h> + +#include <xen/interface/xen.h> + +#include "xen-asm.h" + +/* + * Force an event check by making a hypercall, but preserve regs + * before making the call. + */ +check_events: + push %eax + push %ecx + push %edx + call xen_force_evtchn_callback + pop %edx + pop %ecx + pop %eax + ret + +/* + * We can't use sysexit directly, because we're not running in ring0. + * But we can easily fake it up using iret. Assuming xen_sysexit is + * jumped to with a standard stack frame, we can just strip it back to + * a standard iret frame and use iret. + */ +ENTRY(xen_sysexit) + movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ + orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) + lea PT_EIP(%esp), %esp + + jmp xen_iret +ENDPROC(xen_sysexit) + +/* + * This is run where a normal iret would be run, with the same stack setup: + * 8: eflags + * 4: cs + * esp-> 0: eip + * + * This attempts to make sure that any pending events are dealt with + * on return to usermode, but there is a small window in which an + * event can happen just before entering usermode. If the nested + * interrupt ends up setting one of the TIF_WORK_MASK pending work + * flags, they will not be tested again before returning to + * usermode. This means that a process can end up with pending work, + * which will be unprocessed until the process enters and leaves the + * kernel again, which could be an unbounded amount of time. This + * means that a pending signal or reschedule event could be + * indefinitely delayed. + * + * The fix is to notice a nested interrupt in the critical window, and + * if one occurs, then fold the nested interrupt into the current + * interrupt stack frame, and re-process it iteratively rather than + * recursively. This means that it will exit via the normal path, and + * all pending work will be dealt with appropriately. + * + * Because the nested interrupt handler needs to deal with the current + * stack state in whatever form its in, we keep things simple by only + * using a single register which is pushed/popped on the stack. + */ + +.macro POP_FS +1: + popw %fs +.pushsection .fixup, "ax" +2: movw $0, (%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) +.endm + +ENTRY(xen_iret) + /* test eflags for special cases */ + testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) + jnz hyper_iret + + push %eax + ESP_OFFSET=4 # bytes pushed onto stack + + /* Store vcpu_info pointer for easy access */ +#ifdef CONFIG_SMP + pushw %fs + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs + movl %fs:xen_vcpu, %eax + POP_FS +#else + movl %ss:xen_vcpu, %eax +#endif + + /* check IF state we're restoring */ + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) + + /* + * Maybe enable events. Once this happens we could get a + * recursive event, so the critical region starts immediately + * afterwards. However, if that happens we don't end up + * resuming the code, so we don't have to be worried about + * being preempted to another CPU. + */ + setz %ss:XEN_vcpu_info_mask(%eax) +xen_iret_start_crit: + + /* check for unmasked and pending */ + cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax) + + /* + * If there's something pending, mask events again so we can + * jump back into xen_hypervisor_callback. Otherwise do not + * touch XEN_vcpu_info_mask. + */ + jne 1f + movb $1, %ss:XEN_vcpu_info_mask(%eax) + +1: popl %eax + + /* + * From this point on the registers are restored and the stack + * updated, so we don't need to worry about it if we're + * preempted + */ +iret_restore_end: + + /* + * Jump to hypervisor_callback after fixing up the stack. + * Events are masked, so jumping out of the critical region is + * OK. + */ + je xen_hypervisor_callback + +1: iret +xen_iret_end_crit: + _ASM_EXTABLE(1b, iret_exc) + +hyper_iret: + /* put this out of line since its very rarely used */ + jmp hypercall_page + __HYPERVISOR_iret * 32 + + .globl xen_iret_start_crit, xen_iret_end_crit + +/* + * This is called by xen_hypervisor_callback in entry.S when it sees + * that the EIP at the time of interrupt was between + * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in + * %eax so we can do a more refined determination of what to do. + * + * The stack format at this point is: + * ---------------- + * ss : (ss/esp may be present if we came from usermode) + * esp : + * eflags } outer exception info + * cs } + * eip } + * ---------------- <- edi (copy dest) + * eax : outer eax if it hasn't been restored + * ---------------- + * eflags } nested exception info + * cs } (no ss/esp because we're nested + * eip } from the same ring) + * orig_eax }<- esi (copy src) + * - - - - - - - - + * fs } + * es } + * ds } SAVE_ALL state + * eax } + * : : + * ebx }<- esp + * ---------------- + * + * In order to deliver the nested exception properly, we need to shift + * everything from the return addr up to the error code so it sits + * just under the outer exception info. This means that when we + * handle the exception, we do it in the context of the outer + * exception rather than starting a new one. + * + * The only caveat is that if the outer eax hasn't been restored yet + * (ie, it's still on stack), we need to insert its value into the + * SAVE_ALL state before going on, since it's usermode state which we + * eventually need to restore. + */ +ENTRY(xen_iret_crit_fixup) + /* + * Paranoia: Make sure we're really coming from kernel space. + * One could imagine a case where userspace jumps into the + * critical range address, but just before the CPU delivers a + * GP, it decides to deliver an interrupt instead. Unlikely? + * Definitely. Easy to avoid? Yes. The Intel documents + * explicitly say that the reported EIP for a bad jump is the + * jump instruction itself, not the destination, but some + * virtual environments get this wrong. + */ + movl PT_CS(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx + cmpl $USER_RPL, %ecx + je 2f + + lea PT_ORIG_EAX(%esp), %esi + lea PT_EFLAGS(%esp), %edi + + /* + * If eip is before iret_restore_end then stack + * hasn't been restored yet. + */ + cmp $iret_restore_end, %eax + jae 1f + + movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */ + movl %eax, PT_EAX(%esp) + + lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */ + + /* set up the copy */ +1: std + mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ + rep movsl + cld + + lea 4(%edi), %esp /* point esp to new frame */ +2: jmp xen_do_upcall + diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S new file mode 100644 index 00000000000..53adefda427 --- /dev/null +++ b/arch/x86/xen/xen-asm_64.S @@ -0,0 +1,159 @@ +/* + * Asm versions of Xen pv-ops, suitable for either direct use or + * inlining. The inline versions are the same as the direct-use + * versions, with the pre- and post-amble chopped off. + * + * This code is encoded for size rather than absolute efficiency, with + * a view to being able to inline as much as possible. + * + * We only bother with direct forms (ie, vcpu in pda) of the + * operations here; the indirect forms are better handled in C, since + * they're generally too large to inline anyway. + */ + +#include <asm/errno.h> +#include <asm/percpu.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + +#include <xen/interface/xen.h> + +#include "xen-asm.h" + +ENTRY(xen_adjust_exception_frame) + mov 8+0(%rsp), %rcx + mov 8+8(%rsp), %r11 + ret $16 + +hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 +/* + * Xen64 iret frame: + * + * ss + * rsp + * rflags + * cs + * rip <-- standard iret frame + * + * flags + * + * rcx } + * r11 }<-- pushed by hypercall page + * rsp->rax } + */ +ENTRY(xen_iret) + pushq $0 +1: jmp hypercall_iret +ENDPATCH(xen_iret) +RELOC(xen_iret, 1b+1) + +/* + * sysexit is not used for 64-bit processes, so it's only ever used to + * return to 32-bit compat userspace. + */ +ENTRY(xen_sysexit) + pushq $__USER32_DS + pushq %rcx + pushq $X86_EFLAGS_IF + pushq $__USER32_CS + pushq %rdx + + pushq $0 +1: jmp hypercall_iret +ENDPATCH(xen_sysexit) +RELOC(xen_sysexit, 1b+1) + +ENTRY(xen_sysret64) + /* + * We're already on the usermode stack at this point, but + * still with the kernel gs, so we can easily switch back + */ + movq %rsp, PER_CPU_VAR(old_rsp) + movq PER_CPU_VAR(kernel_stack), %rsp + + pushq $__USER_DS + pushq PER_CPU_VAR(old_rsp) + pushq %r11 + pushq $__USER_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret64) +RELOC(xen_sysret64, 1b+1) + +ENTRY(xen_sysret32) + /* + * We're already on the usermode stack at this point, but + * still with the kernel gs, so we can easily switch back + */ + movq %rsp, PER_CPU_VAR(old_rsp) + movq PER_CPU_VAR(kernel_stack), %rsp + + pushq $__USER32_DS + pushq PER_CPU_VAR(old_rsp) + pushq %r11 + pushq $__USER32_CS + pushq %rcx + + pushq $0 +1: jmp hypercall_iret +ENDPATCH(xen_sysret32) +RELOC(xen_sysret32, 1b+1) + +/* + * Xen handles syscall callbacks much like ordinary exceptions, which + * means we have: + * - kernel gs + * - kernel rsp + * - an iret-like stack frame on the stack (including rcx and r11): + * ss + * rsp + * rflags + * cs + * rip + * r11 + * rsp->rcx + * + * In all the entrypoints, we undo all that to make it look like a + * CPU-generated syscall/sysenter and jump to the normal entrypoint. + */ + +.macro undo_xen_syscall + mov 0*8(%rsp), %rcx + mov 1*8(%rsp), %r11 + mov 5*8(%rsp), %rsp +.endm + +/* Normal 64-bit system call target */ +ENTRY(xen_syscall_target) + undo_xen_syscall + jmp system_call_after_swapgs +ENDPROC(xen_syscall_target) + +#ifdef CONFIG_IA32_EMULATION + +/* 32-bit compat syscall target */ +ENTRY(xen_syscall32_target) + undo_xen_syscall + jmp ia32_cstar_target +ENDPROC(xen_syscall32_target) + +/* 32-bit compat sysenter target */ +ENTRY(xen_sysenter_target) + undo_xen_syscall + jmp ia32_sysenter_target +ENDPROC(xen_sysenter_target) + +#else /* !CONFIG_IA32_EMULATION */ + +ENTRY(xen_syscall32_target) +ENTRY(xen_sysenter_target) + lea 16(%rsp), %rsp /* strip %rcx, %r11 */ + mov $-ENOSYS, %rax + pushq $0 + jmp hypercall_iret +ENDPROC(xen_syscall32_target) +ENDPROC(xen_sysenter_target) + +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587ce73..485b6958554 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -5,36 +5,126 @@ #include <linux/elfnote.h> #include <linux/init.h> + #include <asm/boot.h> +#include <asm/asm.h> +#include <asm/page_types.h> + #include <xen/interface/elfnote.h> +#include <xen/interface/features.h> +#include <asm/xen/interface.h> + +#ifdef CONFIG_XEN_PVH +#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" +/* Note the lack of 'hvm_callback_vector'. Older hypervisor will + * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in + * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. + */ +#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ + (1 << XENFEAT_auto_translated_physmap) | \ + (1 << XENFEAT_supervisor_mode_kernel) | \ + (1 << XENFEAT_hvm_callback_vector)) +/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that + * up regardless whether this CONFIG option is enabled or not, but it + * clarifies what the right flags need to be. + */ +#else +#define PVH_FEATURES_STR "" +#define PVH_FEATURES (0) +#endif __INIT ENTRY(startup_xen) - movl %esi,xen_start_info cld - movl $(init_thread_union+THREAD_SIZE),%esp +#ifdef CONFIG_X86_32 + mov %esi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%esp +#else + mov %rsi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%rsp +#endif jmp xen_start_kernel __FINIT -.pushsection .bss.page_aligned - .align PAGE_SIZE_asm +.pushsection .text + .balign PAGE_SIZE ENTRY(hypercall_page) - .skip 0x1000 +#define NEXT_HYPERCALL(x) \ + ENTRY(xen_hypercall_##x) \ + .skip 32 + +NEXT_HYPERCALL(set_trap_table) +NEXT_HYPERCALL(mmu_update) +NEXT_HYPERCALL(set_gdt) +NEXT_HYPERCALL(stack_switch) +NEXT_HYPERCALL(set_callbacks) +NEXT_HYPERCALL(fpu_taskswitch) +NEXT_HYPERCALL(sched_op_compat) +NEXT_HYPERCALL(platform_op) +NEXT_HYPERCALL(set_debugreg) +NEXT_HYPERCALL(get_debugreg) +NEXT_HYPERCALL(update_descriptor) +NEXT_HYPERCALL(ni) +NEXT_HYPERCALL(memory_op) +NEXT_HYPERCALL(multicall) +NEXT_HYPERCALL(update_va_mapping) +NEXT_HYPERCALL(set_timer_op) +NEXT_HYPERCALL(event_channel_op_compat) +NEXT_HYPERCALL(xen_version) +NEXT_HYPERCALL(console_io) +NEXT_HYPERCALL(physdev_op_compat) +NEXT_HYPERCALL(grant_table_op) +NEXT_HYPERCALL(vm_assist) +NEXT_HYPERCALL(update_va_mapping_otherdomain) +NEXT_HYPERCALL(iret) +NEXT_HYPERCALL(vcpu_op) +NEXT_HYPERCALL(set_segment_base) +NEXT_HYPERCALL(mmuext_op) +NEXT_HYPERCALL(xsm_op) +NEXT_HYPERCALL(nmi_op) +NEXT_HYPERCALL(sched_op) +NEXT_HYPERCALL(callback_op) +NEXT_HYPERCALL(xenoprof_op) +NEXT_HYPERCALL(event_channel_op) +NEXT_HYPERCALL(physdev_op) +NEXT_HYPERCALL(hvm_op) +NEXT_HYPERCALL(sysctl) +NEXT_HYPERCALL(domctl) +NEXT_HYPERCALL(kexec_op) +NEXT_HYPERCALL(tmem_op) /* 38 */ +ENTRY(xen_hypercall_rsvr) + .skip 320 +NEXT_HYPERCALL(mca) /* 48 */ +NEXT_HYPERCALL(arch_1) +NEXT_HYPERCALL(arch_2) +NEXT_HYPERCALL(arch_3) +NEXT_HYPERCALL(arch_4) +NEXT_HYPERCALL(arch_5) +NEXT_HYPERCALL(arch_6) + .balign PAGE_SIZE .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") +#ifdef CONFIG_X86_32 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) #else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) #endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | + (1 << XENFEAT_writable_page_tables) | + (1 << XENFEAT_dom0)) + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, + .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) #endif /*CONFIG_XEN */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b02a909bfd4..97d87659f77 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,66 +2,129 @@ #define XEN_OPS_H #include <linux/init.h> +#include <linux/clocksource.h> +#include <linux/irqreturn.h> +#include <xen/xen-ops.h> /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +extern void *xen_initial_gdt; + +struct trap_info; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); extern struct start_info *xen_start_info; +extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; -char * __init xen_memory_setup(void); -void __init xen_arch_setup(void); -void __init xen_init_IRQ(void); +void xen_setup_mfn_list_list(void); +void xen_setup_shared_info(void); +void xen_build_mfn_list_list(void); +void xen_setup_machphys_mapping(void); +void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); +void xen_reserve_top(void); +extern unsigned long xen_max_p2m_pfn; -void xen_setup_timer(int cpu); -void xen_setup_cpu_clockevents(void); -unsigned long xen_cpu_khz(void); -void __init xen_time_init(void); -unsigned long xen_get_wallclock(void); -int xen_set_wallclock(unsigned long time); -unsigned long long xen_sched_clock(void); +void xen_mm_pin_all(void); +void xen_mm_unpin_all(void); +void xen_set_pat(u64); -bool xen_vcpu_stolen(int vcpu); +char * __init xen_memory_setup(void); +char * xen_auto_xlated_memory_setup(void); +void __init xen_arch_setup(void); +void xen_enable_sysenter(void); +void xen_enable_syscall(void); +void xen_vcpu_restore(void); -void xen_mark_init_mm_pinned(void); +void xen_callback_vector(void); +void xen_hvm_init_shared_info(void); +void xen_unplug_emulated_devices(void); -void __init xen_fill_possible_map(void); +void __init xen_build_dynamic_phys_to_machine(void); +unsigned long __init xen_revector_p2m_tree(void); -void __init xen_setup_vcpu_info_placement(void); -void xen_smp_prepare_boot_cpu(void); -void xen_smp_prepare_cpus(unsigned int max_cpus); -int xen_cpu_up(unsigned int cpu); -void xen_smp_cpus_done(unsigned int max_cpus); +void xen_init_irq_ops(void); +void xen_setup_timer(int cpu); +void xen_setup_runstate_info(int cpu); +void xen_teardown_timer(int cpu); +cycle_t xen_clocksource_read(void); +void xen_setup_cpu_clockevents(void); +void __init xen_init_time_ops(void); +void __init xen_hvm_init_time_ops(void); -void xen_smp_send_stop(void); -void xen_smp_send_reschedule(int cpu); -int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait); -int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait); +irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait); +bool xen_vcpu_stolen(int vcpu); +void xen_setup_vcpu_info_placement(void); + +#ifdef CONFIG_SMP +void xen_smp_init(void); +void __init xen_hvm_smp_init(void); + +extern cpumask_var_t xen_cpu_initialized_map; +#else +static inline void xen_smp_init(void) {} +static inline void xen_hvm_smp_init(void) {} +#endif + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init xen_init_spinlocks(void); +void xen_init_lock_cpu(int cpu); +void xen_uninit_lock_cpu(int cpu); +#else +static inline void xen_init_spinlocks(void) +{ +} +static inline void xen_init_lock_cpu(int cpu) +{ +} +static inline void xen_uninit_lock_cpu(int cpu) +{ +} +#endif + +struct dom0_vga_console_info; + +#ifdef CONFIG_XEN_DOM0 +void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +void __init xen_init_apic(void); +#else +static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, + size_t size) +{ +} +static inline void __init xen_init_apic(void) +{ +} +#endif /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) \ - ret name(__VA_ARGS__); \ - extern char name##_end[]; \ - extern char name##_reloc[] \ + __visible ret name(__VA_ARGS__); \ + extern char name##_end[] __visible; \ + extern char name##_reloc[] __visible DECL_ASM(void, xen_irq_enable_direct, void); DECL_ASM(void, xen_irq_disable_direct, void); DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); -void xen_iret_direct(void); +/* These are not functions, and cannot be called normally */ +__visible void xen_iret(void); +__visible void xen_sysexit(void); +__visible void xen_sysret32(void); +__visible void xen_sysret64(void); +__visible void xen_adjust_exception_frame(void); + +extern int xen_panic_handler_init(void); + +void xen_pvh_secondary_vcpu_init(int cpu); #endif /* XEN_OPS_H */ |
