aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/xen
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig45
-rw-r--r--arch/x86/xen/Makefile26
-rw-r--r--arch/x86/xen/apic.c34
-rw-r--r--arch/x86/xen/debugfs.c21
-rw-r--r--arch/x86/xen/debugfs.h6
-rw-r--r--arch/x86/xen/enlighten.c1817
-rw-r--r--arch/x86/xen/events.c591
-rw-r--r--arch/x86/xen/features.c29
-rw-r--r--arch/x86/xen/grant-table.c225
-rw-r--r--arch/x86/xen/irq.c136
-rw-r--r--arch/x86/xen/manage.c143
-rw-r--r--arch/x86/xen/mmu.c2522
-rw-r--r--arch/x86/xen/mmu.h55
-rw-r--r--arch/x86/xen/multicalls.c116
-rw-r--r--arch/x86/xen/multicalls.h24
-rw-r--r--arch/x86/xen/p2m.c1340
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c109
-rw-r--r--arch/x86/xen/platform-pci-unplug.c217
-rw-r--r--arch/x86/xen/setup.c614
-rw-r--r--arch/x86/xen/smp.c733
-rw-r--r--arch/x86/xen/smp.h11
-rw-r--r--arch/x86/xen/spinlock.c348
-rw-r--r--arch/x86/xen/suspend.c97
-rw-r--r--arch/x86/xen/time.c417
-rw-r--r--arch/x86/xen/trace.c62
-rw-r--r--arch/x86/xen/vga.c74
-rw-r--r--arch/x86/xen/xen-asm.S304
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S237
-rw-r--r--arch/x86/xen/xen-asm_64.S159
-rw-r--r--arch/x86/xen/xen-head.S114
-rw-r--r--arch/x86/xen/xen-ops.h125
32 files changed, 8368 insertions, 2395 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 4d5f2649bee..e88fda867a3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -4,10 +4,49 @@
config XEN
bool "Xen guest support"
- select PARAVIRT
- depends on X86_32
- depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
+ depends on PARAVIRT
+ select PARAVIRT_CLOCK
+ select XEN_HAVE_PVMMU
+ depends on X86_64 || (X86_32 && X86_PAE)
+ depends on X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
+
+config XEN_DOM0
+ def_bool y
+ depends on XEN && PCI_XEN && SWIOTLB_XEN
+ depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
+
+config XEN_PVHVM
+ def_bool y
+ depends on XEN && PCI && X86_LOCAL_APIC
+
+config XEN_MAX_DOMAIN_MEMORY
+ int
+ default 500 if X86_64
+ default 64 if X86_32
+ depends on XEN
+ help
+ This only affects the sizing of some bss arrays, the unused
+ portions of which are freed.
+
+config XEN_SAVE_RESTORE
+ bool
+ depends on XEN
+ select HIBERNATE_CALLBACKS
+ default y
+
+config XEN_DEBUG_FS
+ bool "Enable Xen debug and tuning parameters in debugfs"
+ depends on XEN && DEBUG_FS
+ default n
+ help
+ Enable statistics output and various tuning options in debugfs.
+ Enabling this option may incur a significant performance overhead.
+
+config XEN_PVH
+ bool "Support for running as a PVH guest"
+ depends on X86_64 && XEN && XEN_PVHVM
+ def_bool n
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 343df246bd3..96ab2c09cb6 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,24 @@
-obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
- events.o time.o manage.o xen-asm.o
+ifdef CONFIG_FUNCTION_TRACER
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_spinlock.o = -pg
+CFLAGS_REMOVE_time.o = -pg
+CFLAGS_REMOVE_irq.o = -pg
+endif
-obj-$(CONFIG_SMP) += smp.o
+# Make sure early boot has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_enlighten.o := $(nostackp)
+CFLAGS_mmu.o := $(nostackp)
+
+obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
+ time.o xen-asm.o xen-asm_$(BITS).o \
+ grant-table.o suspend.o platform-pci-unplug.o \
+ p2m.o
+
+obj-$(CONFIG_EVENT_TRACING) += trace.o
+
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
+obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
+obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
new file mode 100644
index 00000000000..7005ced5d1a
--- /dev/null
+++ b/arch/x86/xen/apic.c
@@ -0,0 +1,34 @@
+#include <linux/init.h>
+
+#include <asm/x86_init.h>
+#include <asm/apic.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/interface/physdev.h>
+#include "xen-ops.h"
+
+static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
+{
+ struct physdev_apic apic_op;
+ int ret;
+
+ apic_op.apic_physbase = mpc_ioapic_addr(apic);
+ apic_op.reg = reg;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+ if (!ret)
+ return apic_op.value;
+
+ /* fallback to return an emulated IO_APIC values */
+ if (reg == 0x1)
+ return 0x00170020;
+ else if (reg == 0x0)
+ return apic << 24;
+
+ return 0xfd;
+}
+
+void __init xen_init_apic(void)
+{
+ x86_io_apic_ops.read = xen_io_apic_read;
+}
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 00000000000..c8377fb26cd
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,21 @@
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "debugfs.h"
+
+static struct dentry *d_xen_debug;
+
+struct dentry * __init xen_init_debugfs(void)
+{
+ if (!d_xen_debug) {
+ d_xen_debug = debugfs_create_dir("xen", NULL);
+
+ if (!d_xen_debug)
+ pr_warning("Could not create 'xen' debugfs directory\n");
+ }
+
+ return d_xen_debug;
+}
+
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 00000000000..12ebf3325c7
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,6 @@
+#ifndef _XEN_DEBUGFS_H
+#define _XEN_DEBUGFS_H
+
+struct dentry * __init xen_init_debugfs(void);
+
+#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 27ee26aedf9..ffb101e4573 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
+#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
@@ -20,67 +21,119 @@
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
+#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
-
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/edd.h>
+
+#include <xen/xen.h>
+#include <xen/events.h>
#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
-#include <xen/interface/sched.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
+#include <xen/hvm.h>
+#include <xen/hvc-console.h>
+#include <xen/acpi.h>
#include <asm/paravirt.h>
+#include <asm/apic.h>
#include <asm/page.h>
+#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/msr-index.h>
+#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
+#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
+#include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
+#include <asm/mwait.h>
+#include <asm/pci_x86.h>
+#include <asm/pat.h>
+
+#ifdef CONFIG_ACPI
+#include <linux/acpi.h>
+#include <asm/acpi.h>
+#include <acpi/pdc_intel.h>
+#include <acpi/processor.h>
+#include <xen/interface/platform.h>
+#endif
#include "xen-ops.h"
#include "mmu.h"
+#include "smp.h"
#include "multicalls.h"
EXPORT_SYMBOL_GPL(hypercall_page);
-DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-
/*
- * Note about cr3 (pagetable base) values:
+ * Pointer to the xen_vcpu_info structure or
+ * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
+ * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
+ * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
+ * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
+ * acknowledge pending events.
+ * Also more subtly it is used by the patched version of irq enable/disable
+ * e.g. xen_irq_enable_direct and xen_iret in PV mode.
*
- * xen_cr3 contains the current logical cr3 value; it contains the
- * last set cr3. This may not be the current effective cr3, because
- * its update may be being lazily deferred. However, a vcpu looking
- * at its own cr3 can use this value knowing that it everything will
- * be self-consistent.
+ * The desire to be able to do those mask/unmask operations as a single
+ * instruction by using the per-cpu offset held in %gs is the real reason
+ * vcpu info is in a per-cpu pointer and the original reason for this
+ * hypercall.
*
- * xen_current_cr3 contains the actual vcpu cr3; it is set once the
- * hypercall to set the vcpu cr3 is complete (so it may be a little
- * out of date, but it will never be set early). If one vcpu is
- * looking at another vcpu's cr3 value, it should use this variable.
*/
-DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+
+/*
+ * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
+ * hypercall. This can be used both in PV and PVHVM mode. The structure
+ * overrides the default per_cpu(xen_vcpu, cpu) value.
+ */
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+
+enum xen_domain_type xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
+
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned long machine_to_phys_nr;
+EXPORT_SYMBOL(machine_to_phys_nr);
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
-static /* __initdata */ struct shared_info dummy_shared_info;
+struct shared_info xen_dummy_shared_info;
+
+void *xen_initial_gdt;
+
+RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
/*
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
*/
-struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
/*
* Flag to determine whether vcpu info placement is available on all
@@ -97,65 +150,187 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
*/
static int have_vcpu_info_placement = 1;
-static void __init xen_vcpu_setup(int cpu)
+struct tls_descs {
+ struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed. Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
+static void clamp_max_cpus(void)
+{
+#ifdef CONFIG_SMP
+ if (setup_max_cpus > MAX_VIRT_CPUS)
+ setup_max_cpus = MAX_VIRT_CPUS;
+#endif
+}
+
+static void xen_vcpu_setup(int cpu)
{
struct vcpu_register_vcpu_info info;
int err;
struct vcpu_info *vcpup;
- BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info);
- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
- if (!have_vcpu_info_placement)
- return; /* already tested, not available */
+ /*
+ * This path is called twice on PVHVM - first during bootup via
+ * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
+ * hotplugged: cpu_up -> xen_hvm_cpu_notify.
+ * As we can only do the VCPUOP_register_vcpu_info once lets
+ * not over-write its result.
+ *
+ * For PV it is called during restore (xen_vcpu_restore) and bootup
+ * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
+ * use this function.
+ */
+ if (xen_hvm_domain()) {
+ if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
+ return;
+ }
+ if (cpu < MAX_VIRT_CPUS)
+ per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
- vcpup = &per_cpu(xen_vcpu_info, cpu);
+ if (!have_vcpu_info_placement) {
+ if (cpu >= MAX_VIRT_CPUS)
+ clamp_max_cpus();
+ return;
+ }
- info.mfn = virt_to_mfn(vcpup);
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup);
- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
- cpu, vcpup, info.mfn, info.offset);
-
/* Check to see if the hypervisor will put the vcpu_info
structure where we want it, which allows direct access via
- a percpu-variable. */
+ a percpu-variable.
+ N.B. This hypercall can _only_ be called once per CPU. Subsequent
+ calls will error out with -EINVAL. This is due to the fact that
+ hypervisor has no unregister variant and this hypercall does not
+ allow to over-write info.mfn and info.offset.
+ */
err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
if (err) {
printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
have_vcpu_info_placement = 0;
+ clamp_max_cpus();
} else {
/* This cpu is using the registered vcpu info, even if
later ones fail to. */
per_cpu(xen_vcpu, cpu) = vcpup;
+ }
+}
- printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
- cpu, vcpup);
+/*
+ * On restore, set the vcpu placement up again.
+ * If it fails, then we're in a bad state, since
+ * we can't back out from using it...
+ */
+void xen_vcpu_restore(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bool other_cpu = (cpu != smp_processor_id());
+ bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL);
+
+ if (other_cpu && is_up &&
+ HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
+ BUG();
+
+ xen_setup_runstate_info(cpu);
+
+ if (have_vcpu_info_placement)
+ xen_vcpu_setup(cpu);
+
+ if (other_cpu && is_up &&
+ HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+ BUG();
}
}
static void __init xen_banner(void)
{
- printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
- pv_info.name);
- printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+ unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ struct xen_extraversion extra;
+ HYPERVISOR_xen_version(XENVER_extraversion, &extra);
+
+ pr_info("Booting paravirtualized kernel %son %s\n",
+ xen_feature(XENFEAT_auto_translated_physmap) ?
+ "with PVH extensions " : "", pv_info.name);
+ printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+ version >> 16, version & 0xffff, extra.extraversion,
+ xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
+}
+/* Check if running on Xen version (major, minor) or later */
+bool
+xen_running_on_version_or_later(unsigned int major, unsigned int minor)
+{
+ unsigned int version;
+
+ if (!xen_domain())
+ return false;
+
+ version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
+ ((version >> 16) > major))
+ return true;
+ return false;
}
+#define CPUID_THERM_POWER_LEAF 6
+#define APERFMPERF_PRESENT 0
+
+static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+
+static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
+static __read_mostly unsigned int cpuid_leaf5_ecx_val;
+static __read_mostly unsigned int cpuid_leaf5_edx_val;
+
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
+ unsigned maskebx = ~0;
+ unsigned maskecx = ~0;
unsigned maskedx = ~0;
-
+ unsigned setecx = 0;
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
- if (*ax == 1)
- maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
- (1 << X86_FEATURE_ACPI) | /* disable ACPI */
- (1 << X86_FEATURE_SEP) | /* disable SEP */
- (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+ switch (*ax) {
+ case 1:
+ maskecx = cpuid_leaf1_ecx_mask;
+ setecx = cpuid_leaf1_ecx_set_mask;
+ maskedx = cpuid_leaf1_edx_mask;
+ break;
+
+ case CPUID_MWAIT_LEAF:
+ /* Synthesize the values.. */
+ *ax = 0;
+ *bx = 0;
+ *cx = cpuid_leaf5_ecx_val;
+ *dx = cpuid_leaf5_edx_val;
+ return;
+
+ case CPUID_THERM_POWER_LEAF:
+ /* Disabling APERFMPERF for kernel usage */
+ maskecx = ~(1 << APERFMPERF_PRESENT);
+ break;
+
+ case 0xb:
+ /* Suppress extended topology stuff */
+ maskebx = 0;
+ break;
+ }
asm(XEN_EMULATE_PREFIX "cpuid"
: "=a" (*ax),
@@ -163,111 +338,129 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
"=c" (*cx),
"=d" (*dx)
: "0" (*ax), "2" (*cx));
+
+ *bx &= maskebx;
+ *cx &= maskecx;
+ *cx |= setecx;
*dx &= maskedx;
-}
-static void xen_set_debugreg(int reg, unsigned long val)
-{
- HYPERVISOR_set_debugreg(reg, val);
}
-static unsigned long xen_get_debugreg(int reg)
+static bool __init xen_check_mwait(void)
{
- return HYPERVISOR_get_debugreg(reg);
-}
+#ifdef CONFIG_ACPI
+ struct xen_platform_op op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .u.set_pminfo.id = -1,
+ .u.set_pminfo.type = XEN_PM_PDC,
+ };
+ uint32_t buf[3];
+ unsigned int ax, bx, cx, dx;
+ unsigned int mwait_mask;
+
+ /* We need to determine whether it is OK to expose the MWAIT
+ * capability to the kernel to harvest deeper than C3 states from ACPI
+ * _CST using the processor_harvest_xen.c module. For this to work, we
+ * need to gather the MWAIT_LEAF values (which the cstate.c code
+ * checks against). The hypervisor won't expose the MWAIT flag because
+ * it would break backwards compatibility; so we will find out directly
+ * from the hardware and hypercall.
+ */
+ if (!xen_initial_domain())
+ return false;
-static unsigned long xen_save_fl(void)
-{
- struct vcpu_info *vcpu;
- unsigned long flags;
+ /*
+ * When running under platform earlier than Xen4.2, do not expose
+ * mwait, to avoid the risk of loading native acpi pad driver
+ */
+ if (!xen_running_on_version_or_later(4, 2))
+ return false;
- vcpu = x86_read_percpu(xen_vcpu);
+ ax = 1;
+ cx = 0;
- /* flag has opposite sense of mask */
- flags = !vcpu->evtchn_upcall_mask;
+ native_cpuid(&ax, &bx, &cx, &dx);
- /* convert to IF type flag
- -0 -> 0x00000000
- -1 -> 0xffffffff
- */
- return (-flags) & X86_EFLAGS_IF;
-}
+ mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
+ (1 << (X86_FEATURE_MWAIT % 32));
-static void xen_restore_fl(unsigned long flags)
-{
- struct vcpu_info *vcpu;
+ if ((cx & mwait_mask) != mwait_mask)
+ return false;
- /* convert from IF type flag */
- flags = !(flags & X86_EFLAGS_IF);
+ /* We need to emulate the MWAIT_LEAF and for that we need both
+ * ecx and edx. The hypercall provides only partial information.
+ */
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
- vcpu->evtchn_upcall_mask = flags;
- preempt_enable_no_resched();
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
-
- if (flags == 0) {
- preempt_check_resched();
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- force_evtchn_callback();
+ ax = CPUID_MWAIT_LEAF;
+ bx = 0;
+ cx = 0;
+ dx = 0;
+
+ native_cpuid(&ax, &bx, &cx, &dx);
+
+ /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+ * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
+ */
+ buf[0] = ACPI_PDC_REVISION_ID;
+ buf[1] = 1;
+ buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+
+ set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
+
+ if ((HYPERVISOR_dom0_op(&op) == 0) &&
+ (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+ cpuid_leaf5_ecx_val = cx;
+ cpuid_leaf5_edx_val = dx;
}
+ return true;
+#else
+ return false;
+#endif
}
-
-static void xen_irq_disable(void)
+static void __init xen_init_cpuid_mask(void)
{
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
- preempt_enable_no_resched();
-}
+ unsigned int ax, bx, cx, dx;
+ unsigned int xsave_mask;
-static void xen_irq_enable(void)
-{
- struct vcpu_info *vcpu;
+ cpuid_leaf1_edx_mask =
+ ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
+ (1 << X86_FEATURE_ACC)); /* thermal monitoring */
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
- vcpu->evtchn_upcall_mask = 0;
- preempt_enable_no_resched();
+ if (!xen_initial_domain())
+ cpuid_leaf1_edx_mask &=
+ ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
+
+ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
+ ax = 1;
+ cx = 0;
+ cpuid(1, &ax, &bx, &cx, &dx);
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- force_evtchn_callback();
+ xsave_mask =
+ (1 << (X86_FEATURE_XSAVE % 32)) |
+ (1 << (X86_FEATURE_OSXSAVE % 32));
+
+ /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+ if ((cx & xsave_mask) != xsave_mask)
+ cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
+ if (xen_check_mwait())
+ cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
}
-static void xen_safe_halt(void)
+static void xen_set_debugreg(int reg, unsigned long val)
{
- /* Blocking includes an implicit local_irq_enable(). */
- if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
- BUG();
+ HYPERVISOR_set_debugreg(reg, val);
}
-static void xen_halt(void)
+static unsigned long xen_get_debugreg(int reg)
{
- if (irqs_disabled())
- HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
- else
- xen_safe_halt();
+ return HYPERVISOR_get_debugreg(reg);
}
-static void xen_leave_lazy(void)
+static void xen_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
xen_mc_flush();
+ paravirt_end_context_switch(next);
}
static unsigned long xen_store_tr(void)
@@ -275,11 +468,66 @@ static unsigned long xen_store_tr(void)
return 0;
}
+/*
+ * Set the page permissions for a particular virtual address. If the
+ * address is a vmalloc mapping (or other non-linear mapping), then
+ * find the linear mapping of the page and also set its protections to
+ * match.
+ */
+static void set_aliased_prot(void *v, pgprot_t prot)
+{
+ int level;
+ pte_t *ptep;
+ pte_t pte;
+ unsigned long pfn;
+ struct page *page;
+
+ ptep = lookup_address((unsigned long)v, &level);
+ BUG_ON(ptep == NULL);
+
+ pfn = pte_pfn(*ptep);
+ page = pfn_to_page(pfn);
+
+ pte = pfn_pte(pfn, prot);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
+ BUG();
+
+ if (!PageHighMem(page)) {
+ void *av = __va(PFN_PHYS(pfn));
+
+ if (av != v)
+ if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
+ BUG();
+ } else
+ kmap_flush_unused();
+}
+
+static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+ int i;
+
+ for(i = 0; i < entries; i += entries_per_page)
+ set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
+}
+
+static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+ int i;
+
+ for(i = 0; i < entries; i += entries_per_page)
+ set_aliased_prot(ldt + i, PAGE_KERNEL);
+}
+
static void xen_set_ldt(const void *addr, unsigned entries)
{
struct mmuext_op *op;
struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+ trace_xen_cpu_set_ldt(addr, entries);
+
op = mcs.args;
op->cmd = MMUEXT_SET_LDT;
op->arg1.linear_addr = (unsigned long)addr;
@@ -292,44 +540,142 @@ static void xen_set_ldt(const void *addr, unsigned entries)
static void xen_load_gdt(const struct desc_ptr *dtr)
{
- unsigned long *frames;
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ unsigned long frames[pages];
int f;
- struct multicall_space mcs;
- /* A GDT can be up to 64k in size, which corresponds to 8192
- 8-byte entries, or 16 4k pages.. */
+ /*
+ * A GDT can be up to 64k in size, which corresponds to 8192
+ * 8-byte entries, or 16 4k pages..
+ */
BUG_ON(size > 65536);
BUG_ON(va & ~PAGE_MASK);
- mcs = xen_mc_entry(sizeof(*frames) * pages);
- frames = mcs.args;
-
for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
- frames[f] = virt_to_mfn(va);
+ int level;
+ pte_t *ptep;
+ unsigned long pfn, mfn;
+ void *virt;
+
+ /*
+ * The GDT is per-cpu and is in the percpu data area.
+ * That can be virtually mapped, so we need to do a
+ * page-walk to get the underlying MFN for the
+ * hypercall. The page can also be in the kernel's
+ * linear range, so we need to RO that mapping too.
+ */
+ ptep = lookup_address(va, &level);
+ BUG_ON(ptep == NULL);
+
+ pfn = pte_pfn(*ptep);
+ mfn = pfn_to_mfn(pfn);
+ virt = __va(PFN_PHYS(pfn));
+
+ frames[f] = mfn;
+
make_lowmem_page_readonly((void *)va);
+ make_lowmem_page_readonly(virt);
}
- MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
+ if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ BUG();
+}
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+/*
+ * load_gdt for early boot, when the gdt is only mapped once
+ */
+static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
+{
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ unsigned long frames[pages];
+ int f;
+
+ /*
+ * A GDT can be up to 64k in size, which corresponds to 8192
+ * 8-byte entries, or 16 4k pages..
+ */
+
+ BUG_ON(size > 65536);
+ BUG_ON(va & ~PAGE_MASK);
+
+ for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+ pte_t pte;
+ unsigned long pfn, mfn;
+
+ pfn = virt_to_pfn(va);
+ mfn = pfn_to_mfn(pfn);
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_RO);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+ BUG();
+
+ frames[f] = mfn;
+ }
+
+ if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ BUG();
+}
+
+static inline bool desc_equal(const struct desc_struct *d1,
+ const struct desc_struct *d2)
+{
+ return d1->a == d2->a && d1->b == d2->b;
}
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
- struct multicall_space mc = __xen_mc_entry(0);
+ struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+ struct desc_struct *gdt;
+ xmaddr_t maddr;
+ struct multicall_space mc;
+
+ if (desc_equal(shadow, &t->tls_array[i]))
+ return;
+
+ *shadow = t->tls_array[i];
+
+ gdt = get_cpu_gdt_table(cpu);
+ maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
+ /*
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone
+ * and lazy gs handling is enabled, it means we're in a
+ * context switch, and %gs has just been saved. This means we
+ * can zero it out to prevent faults on exit from the
+ * hypervisor if the next process has no %gs. Either way, it
+ * has been saved, and the new value will get loaded properly.
+ * This will go away as soon as Xen has been modified to not
+ * save/restore %gs for normal hypercalls.
+ *
+ * On x86_64, this hack is not used for %gs, because gs points
+ * to KERNEL_GS_BASE (and uses it for PDA references), so we
+ * must not zero %gs on x86_64
+ *
+ * For x86_64, we need to zero %fs, otherwise we may get an
+ * exception between the new %fs descriptor being loaded and
+ * %fs being effectively cleared at __switch_to().
+ */
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
+ lazy_load_gs(0);
+#else
+ loadsegment(fs, 0);
+#endif
+ }
+
xen_mc_batch();
load_TLS_descriptor(t, cpu, 0);
@@ -337,27 +683,24 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
load_TLS_descriptor(t, cpu, 2);
xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
- /*
- * XXX sleazy hack: If we're being called in a lazy-cpu zone,
- * it means we're in a context switch, and %gs has just been
- * saved. This means we can zero it out to prevent faults on
- * exit from the hypervisor if the next process has no %gs.
- * Either way, it has been saved, and the new value will get
- * loaded properly. This will go away as soon as Xen has been
- * modified to not save/restore %gs for normal hypercalls.
- */
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
- loadsegment(gs, 0);
+#ifdef CONFIG_X86_64
+static void xen_load_gs_index(unsigned int idx)
+{
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+ BUG();
}
+#endif
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
const void *ptr)
{
- unsigned long lp = (unsigned long)&dt[entrynum];
- xmaddr_t mach_lp = virt_to_machine(lp);
+ xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
u64 entry = *(u64 *)ptr;
+ trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
+
preempt_disable();
xen_mc_flush();
@@ -367,24 +710,60 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
preempt_enable();
}
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
- u8 type, dpl;
+ unsigned long addr;
- type = (high >> 8) & 0x1f;
- dpl = (high >> 13) & 3;
-
- if (type != 0xf && type != 0xe)
+ if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
return 0;
info->vector = vector;
- info->address = (high & 0xffff0000) | (low & 0x0000ffff);
- info->cs = low >> 16;
- info->flags = dpl;
+
+ addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+ /*
+ * Look for known traps using IST, and substitute them
+ * appropriately. The debugger ones are the only ones we care
+ * about. Xen will handle faults like double_fault,
+ * so we should never see them. Warn if
+ * there's an unexpected IST-using fault handler.
+ */
+ if (addr == (unsigned long)debug)
+ addr = (unsigned long)xen_debug;
+ else if (addr == (unsigned long)int3)
+ addr = (unsigned long)xen_int3;
+ else if (addr == (unsigned long)stack_segment)
+ addr = (unsigned long)xen_stack_segment;
+ else if (addr == (unsigned long)double_fault) {
+ /* Don't need to handle these */
+ return 0;
+#ifdef CONFIG_X86_MCE
+ } else if (addr == (unsigned long)machine_check) {
+ /*
+ * when xen hypervisor inject vMCE to guest,
+ * use native mce handler to handle it
+ */
+ ;
+#endif
+ } else if (addr == (unsigned long)nmi)
+ /*
+ * Use the native version as well.
+ */
+ ;
+ else {
+ /* Some other trap using IST? */
+ if (WARN_ON(val->ist != 0))
+ return 0;
+ }
+#endif /* CONFIG_X86_64 */
+ info->address = addr;
+
+ info->cs = gate_segment(*val);
+ info->flags = val->dpl;
/* interrupt gates clear IF */
- if (type == 0xe)
- info->flags |= 4;
+ if (val->type == GATE_INTERRUPT)
+ info->flags |= 1 << 2;
return 1;
}
@@ -399,10 +778,12 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
unsigned long p = (unsigned long)&dt[entrynum];
unsigned long start, end;
+ trace_xen_cpu_write_idt_entry(dt, entrynum, g);
+
preempt_disable();
- start = __get_cpu_var(idt_desc).address;
- end = start + __get_cpu_var(idt_desc).size + 1;
+ start = __this_cpu_read(idt_desc.address);
+ end = start + __this_cpu_read(idt_desc.size) + 1;
xen_mc_flush();
@@ -410,11 +791,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
if (p >= start && (p + 8) <= end) {
struct trap_info info[2];
- u32 *desc = (u32 *)g;
info[1].address = 0;
- if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
+ if (cvt_gate_to_trap(entrynum, g, &info[0]))
if (HYPERVISOR_set_trap_table(info))
BUG();
}
@@ -427,13 +807,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
{
unsigned in, out, count;
- count = (desc->size+1) / 8;
+ count = (desc->size+1) / sizeof(gate_desc);
BUG_ON(count > 256);
for (in = out = 0; in < count; in++) {
- const u32 *entry = (u32 *)(desc->address + in * 8);
+ gate_desc *entry = (gate_desc*)(desc->address) + in;
- if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+ if (cvt_gate_to_trap(in, entry, &traps[out]))
out++;
}
traps[out].address = 0;
@@ -454,6 +834,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
+ trace_xen_cpu_load_idt(desc);
+
spin_lock(&lock);
__get_cpu_var(idt_desc) = *desc;
@@ -472,6 +854,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
const void *desc, int type)
{
+ trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
preempt_disable();
switch (type) {
@@ -481,7 +865,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
break;
default: {
- xmaddr_t maddr = virt_to_machine(&dt[entry]);
+ xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
xen_mc_flush();
if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
@@ -493,10 +877,37 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
preempt_enable();
}
+/*
+ * Version of write_gdt_entry for use at early boot-time needed to
+ * update an entry as simply as possible.
+ */
+static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+ const void *desc, int type)
+{
+ trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
+ switch (type) {
+ case DESC_LDT:
+ case DESC_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = virt_to_machine(&dt[entry]);
+
+ if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
+ dt[entry] = *(struct desc_struct *)desc;
+ }
+
+ }
+}
+
static void xen_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+ struct thread_struct *thread)
{
- struct multicall_space mcs = xen_mc_entry(0);
+ struct multicall_space mcs;
+
+ mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
@@ -515,325 +926,202 @@ static void xen_io_delay(void)
}
#ifdef CONFIG_X86_LOCAL_APIC
-static u32 xen_apic_read(unsigned long reg)
+static unsigned long xen_set_apic_id(unsigned int x)
{
- return 0;
-}
-
-static void xen_apic_write(unsigned long reg, u32 val)
-{
- /* Warn to see if there's any stray references */
WARN_ON(1);
+ return x;
}
-#endif
-
-static void xen_flush_tlb(void)
-{
- struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
-
-static void xen_flush_tlb_single(unsigned long addr)
+static unsigned int xen_get_apic_id(unsigned long x)
{
- struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_INVLPG_LOCAL;
- op->arg1.linear_addr = addr & PAGE_MASK;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ return ((x)>>24) & 0xFFu;
}
-
-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
- unsigned long va)
+static u32 xen_apic_read(u32 reg)
{
- struct {
- struct mmuext_op op;
- cpumask_t mask;
- } *args;
- cpumask_t cpumask = *cpus;
- struct multicall_space mcs;
-
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (cpus_empty(cpumask))
- return;
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.pcpu_info.xen_cpuid = 0,
+ };
+ int ret = 0;
+
+ /* Shouldn't need this as APIC is turned off for PV, and we only
+ * get called on the bootup processor. But just in case. */
+ if (!xen_initial_domain() || smp_processor_id())
+ return 0;
- mcs = xen_mc_entry(sizeof(*args));
- args = mcs.args;
- args->mask = cpumask;
- args->op.arg2.vcpumask = &args->mask;
+ if (reg == APIC_LVR)
+ return 0x10;
- if (va == TLB_FLUSH_ALL) {
- args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
- } else {
- args->op.cmd = MMUEXT_INVLPG_MULTI;
- args->op.arg1.linear_addr = va;
- }
+ if (reg != APIC_ID)
+ return 0;
- MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ return 0;
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ return op.u.pcpu_info.apic_id << 24;
}
-static void xen_write_cr2(unsigned long cr2)
+static void xen_apic_write(u32 reg, u32 val)
{
- x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+ /* Warn to see if there's any stray references */
+ WARN_ON(1);
}
-static unsigned long xen_read_cr2(void)
+static u64 xen_apic_icr_read(void)
{
- return x86_read_percpu(xen_vcpu)->arch.cr2;
+ return 0;
}
-static unsigned long xen_read_cr2_direct(void)
+static void xen_apic_icr_write(u32 low, u32 id)
{
- return x86_read_percpu(xen_vcpu_info.arch.cr2);
+ /* Warn to see if there's any stray references */
+ WARN_ON(1);
}
-static void xen_write_cr4(unsigned long cr4)
+static void xen_apic_wait_icr_idle(void)
{
- /* Just ignore cr4 changes; Xen doesn't allow us to do
- anything anyway. */
+ return;
}
-static unsigned long xen_read_cr3(void)
+static u32 xen_safe_apic_wait_icr_idle(void)
{
- return x86_read_percpu(xen_cr3);
+ return 0;
}
-static void set_current_cr3(void *v)
+static void set_xen_basic_apic_ops(void)
{
- x86_write_percpu(xen_current_cr3, (unsigned long)v);
+ apic->read = xen_apic_read;
+ apic->write = xen_apic_write;
+ apic->icr_read = xen_apic_icr_read;
+ apic->icr_write = xen_apic_icr_write;
+ apic->wait_icr_idle = xen_apic_wait_icr_idle;
+ apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
+ apic->set_apic_id = xen_set_apic_id;
+ apic->get_apic_id = xen_get_apic_id;
+
+#ifdef CONFIG_SMP
+ apic->send_IPI_allbutself = xen_send_IPI_allbutself;
+ apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
+ apic->send_IPI_mask = xen_send_IPI_mask;
+ apic->send_IPI_all = xen_send_IPI_all;
+ apic->send_IPI_self = xen_send_IPI_self;
+#endif
}
-static void xen_write_cr3(unsigned long cr3)
+#endif
+
+static void xen_clts(void)
{
- struct mmuext_op *op;
struct multicall_space mcs;
- unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-
- BUG_ON(preemptible());
-
- mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
- /* Update while interrupts are disabled, so its atomic with
- respect to ipis */
- x86_write_percpu(xen_cr3, cr3);
+ mcs = xen_mc_entry(0);
- op = mcs.args;
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = mfn;
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- /* Update xen_update_cr3 once the batch has actually
- been submitted. */
- xen_mc_callback(set_current_cr3, (void *)cr3);
-
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
-}
-
-/* Early in boot, while setting up the initial pagetable, assume
- everything is pinned. */
-static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
-{
- BUG_ON(mem_map); /* should only be used early */
- make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
+ MULTI_fpu_taskswitch(mcs.mc, 0);
-/* Early release_pt assumes that all pts are pinned, since there's
- only init_mm and anything attached to that is pinned. */
-static void xen_release_pt_init(u32 pfn)
-{
- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
}
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
- struct mmuext_op op;
- op.cmd = cmd;
- op.arg1.mfn = pfn_to_mfn(pfn);
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
-}
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
-/* This needs to make sure the new pte page is pinned iff its being
- attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
+static unsigned long xen_read_cr0(void)
{
- struct page *page = pfn_to_page(pfn);
-
- if (PagePinned(virt_to_page(mm->pgd))) {
- SetPagePinned(page);
+ unsigned long cr0 = this_cpu_read(xen_cr0_value);
- if (!PageHighMem(page)) {
- make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
- if (level == PT_PTE)
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- } else
- /* make sure there are no stray mappings of
- this page */
- kmap_flush_unused();
+ if (unlikely(cr0 == 0)) {
+ cr0 = native_read_cr0();
+ this_cpu_write(xen_cr0_value, cr0);
}
-}
-static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
-{
- xen_alloc_ptpage(mm, pfn, PT_PTE);
+ return cr0;
}
-static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
+static void xen_write_cr0(unsigned long cr0)
{
- xen_alloc_ptpage(mm, pfn, PT_PMD);
-}
+ struct multicall_space mcs;
-/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(u32 pfn, unsigned level)
-{
- struct page *page = pfn_to_page(pfn);
+ this_cpu_write(xen_cr0_value, cr0);
- if (PagePinned(page)) {
- if (!PageHighMem(page)) {
- if (level == PT_PTE)
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
- }
- ClearPagePinned(page);
- }
-}
+ /* Only pay attention to cr0.TS; everything else is
+ ignored. */
+ mcs = xen_mc_entry(0);
-static void xen_release_pt(u32 pfn)
-{
- xen_release_ptpage(pfn, PT_PTE);
-}
+ MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
-static void xen_release_pd(u32 pfn)
-{
- xen_release_ptpage(pfn, PT_PMD);
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
}
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+static void xen_write_cr4(unsigned long cr4)
{
- pgprot_t prot = PAGE_KERNEL;
-
- if (PagePinned(page))
- prot = PAGE_KERNEL_RO;
-
- if (0 && PageHighMem(page))
- printk("mapping highpte %lx type %d prot %s\n",
- page_to_pfn(page), type,
- (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+ cr4 &= ~X86_CR4_PGE;
+ cr4 &= ~X86_CR4_PSE;
- return kmap_atomic_prot(page, type, prot);
+ native_write_cr4(cr4);
}
-#endif
-
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
{
- /* If there's an existing pte, then don't allow _PAGE_RW to be set */
- if (pte_val_ma(*ptep) & _PAGE_PRESENT)
- pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
- pte_val_ma(pte));
-
- return pte;
+ return 0;
}
-
-/* Init-time set_pte while constructing initial pagetables, which
- doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+static inline void xen_write_cr8(unsigned long val)
{
- pte = mask_rw_pte(ptep, pte);
-
- xen_set_pte(ptep, pte);
+ BUG_ON(val);
}
-
-static __init void xen_pagetable_setup_start(pgd_t *base)
+#endif
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
- pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+ int ret;
- /* special set_pte for pagetable initialization */
- pv_mmu_ops.set_pte = xen_set_pte_init;
+ ret = 0;
- init_mm.pgd = base;
- /*
- * copy top-level of Xen-supplied pagetable into place. For
- * !PAE we can use this as-is, but for PAE it is a stand-in
- * while we copy the pmd pages.
- */
- memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+ switch (msr) {
+#ifdef CONFIG_X86_64
+ unsigned which;
+ u64 base;
- if (PTRS_PER_PMD > 1) {
- int i;
- /*
- * For PAE, need to allocate new pmds, rather than
- * share Xen's, since Xen doesn't like pmd's being
- * shared between address spaces.
- */
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
- pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ case MSR_FS_BASE: which = SEGBASE_FS; goto set;
+ case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
+ case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
- memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
- PAGE_SIZE);
+ set:
+ base = ((u64)high << 32) | low;
+ if (HYPERVISOR_set_segment_base(which, base) != 0)
+ ret = -EIO;
+ break;
+#endif
- make_lowmem_page_readonly(pmd);
+ case MSR_STAR:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ case MSR_SYSCALL_MASK:
+ case MSR_IA32_SYSENTER_CS:
+ case MSR_IA32_SYSENTER_ESP:
+ case MSR_IA32_SYSENTER_EIP:
+ /* Fast syscall setup is all done in hypercalls, so
+ these are all ignored. Stub them out here to stop
+ Xen console noise. */
+ break;
- set_pgd(&base[i], __pgd(1 + __pa(pmd)));
- } else
- pgd_clear(&base[i]);
- }
- }
+ case MSR_IA32_CR_PAT:
+ if (smp_processor_id() == 0)
+ xen_set_pat(((u64)high << 32) | low);
+ break;
- /* make sure zero_page is mapped RO so we can use it in pagetables */
- make_lowmem_page_readonly(empty_zero_page);
- make_lowmem_page_readonly(base);
- /*
- * Switch to new pagetable. This is done before
- * pagetable_init has done anything so that the new pages
- * added to the table can be prepared properly for Xen.
- */
- xen_write_cr3(__pa(base));
+ default:
+ ret = native_write_msr_safe(msr, low, high);
+ }
- /* Unpin initial Xen pagetable */
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
- PFN_DOWN(__pa(xen_start_info->pt_base)));
+ return ret;
}
-static __init void setup_shared_info(void)
+void xen_setup_shared_info(void)
{
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
- /*
- * Create a mapping for the shared info page.
- * Should be set_fixmap(), but shared_info is a machine
- * address with no corresponding pseudo-phys address.
- */
- set_pte_mfn(addr,
- PFN_DOWN(xen_start_info->shared_info),
- PAGE_KERNEL);
+ set_fixmap(FIX_PARAVIRT_BOOTMAP,
+ xen_start_info->shared_info);
- HYPERVISOR_shared_info = (struct shared_info *)addr;
+ HYPERVISOR_shared_info =
+ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
} else
HYPERVISOR_shared_info =
(struct shared_info *)__va(xen_start_info->shared_info);
@@ -842,37 +1130,12 @@ static __init void setup_shared_info(void)
/* In UP this is as good a place as any to set up shared info */
xen_setup_vcpu_info_placement();
#endif
-}
-
-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
- /* This will work as long as patching hasn't happened yet
- (which it hasn't) */
- pv_mmu_ops.alloc_pt = xen_alloc_pt;
- pv_mmu_ops.alloc_pd = xen_alloc_pd;
- pv_mmu_ops.release_pt = xen_release_pt;
- pv_mmu_ops.release_pd = xen_release_pd;
- pv_mmu_ops.set_pte = xen_set_pte;
-
- setup_shared_info();
-
- /* Actually pin the pagetable down, but we can't set PG_pinned
- yet because the page structures don't exist yet. */
- {
- unsigned level;
-
-#ifdef CONFIG_X86_PAE
- level = MMUEXT_PIN_L3_TABLE;
-#else
- level = MMUEXT_PIN_L2_TABLE;
-#endif
- pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
- }
+ xen_setup_mfn_list_list();
}
-/* This is called once we have the cpu_possible_map */
-void __init xen_setup_vcpu_info_placement(void)
+/* This is called once we have the cpu_possible_mask */
+void xen_setup_vcpu_info_placement(void)
{
int cpu;
@@ -880,16 +1143,14 @@ void __init xen_setup_vcpu_info_placement(void)
xen_vcpu_setup(cpu);
/* xen_vcpu_setup managed to place the vcpu_info within the
- percpu area for all cpus, so make use of it */
- if (have_vcpu_info_placement) {
- printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
- pv_irq_ops.save_fl = xen_save_fl_direct;
- pv_irq_ops.restore_fl = xen_restore_fl_direct;
- pv_irq_ops.irq_disable = xen_irq_disable_direct;
- pv_irq_ops.irq_enable = xen_irq_enable_direct;
+ * percpu area for all cpus, so make use of it. Note that for
+ * PVH we want to use native IRQ mechanism. */
+ if (have_vcpu_info_placement && !xen_pvh_domain()) {
+ pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+ pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+ pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
- pv_cpu_ops.iret = xen_iret_direct;
}
}
@@ -946,63 +1207,70 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret;
}
-static const struct pv_info xen_info __initdata = {
+static const struct pv_info xen_info __initconst = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
+#ifdef CONFIG_X86_64
+ .extra_user_64bit_cs = FLAT_USER_CS64,
+#endif
+
.name = "Xen",
};
-static const struct pv_init_ops xen_init_ops __initdata = {
+static const struct pv_init_ops xen_init_ops __initconst = {
.patch = xen_patch,
-
- .banner = xen_banner,
- .memory_setup = xen_memory_setup,
- .arch_setup = xen_arch_setup,
- .post_allocator_init = xen_mark_init_mm_pinned,
-};
-
-static const struct pv_time_ops xen_time_ops __initdata = {
- .time_init = xen_time_init,
-
- .set_wallclock = xen_set_wallclock,
- .get_wallclock = xen_get_wallclock,
- .get_cpu_khz = xen_cpu_khz,
- .sched_clock = xen_sched_clock,
};
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.cpuid = xen_cpuid,
.set_debugreg = xen_set_debugreg,
.get_debugreg = xen_get_debugreg,
- .clts = native_clts,
+ .clts = xen_clts,
- .read_cr0 = native_read_cr0,
- .write_cr0 = native_write_cr0,
+ .read_cr0 = xen_read_cr0,
+ .write_cr0 = xen_write_cr0,
.read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4,
+#ifdef CONFIG_X86_64
+ .read_cr8 = xen_read_cr8,
+ .write_cr8 = xen_write_cr8,
+#endif
+
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
- .write_msr = native_write_msr_safe,
+ .write_msr = xen_write_msr_safe,
+
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
- .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
- .irq_enable_syscall_ret = NULL, /* never called */
+ .read_tscp = native_read_tscp,
+
+ .iret = xen_iret,
+ .irq_enable_sysexit = xen_sysexit,
+#ifdef CONFIG_X86_64
+ .usergs_sysret32 = xen_sysret32,
+ .usergs_sysret64 = xen_sysret64,
+#endif
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.load_gdt = xen_load_gdt,
.load_idt = xen_load_idt,
.load_tls = xen_load_tls,
+#ifdef CONFIG_X86_64
+ .load_gs_index = xen_load_gs_index,
+#endif
+
+ .alloc_ldt = xen_alloc_ldt,
+ .free_ldt = xen_free_ldt,
- .store_gdt = native_store_gdt,
.store_idt = native_store_idt,
.store_tr = xen_store_tr,
@@ -1014,112 +1282,24 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.set_iopl_mask = xen_set_iopl_mask,
.io_delay = xen_io_delay,
- .lazy_mode = {
- .enter = paravirt_enter_lazy_cpu,
- .leave = xen_leave_lazy,
- },
-};
+ /* Xen takes care of %gs when switching to usermode for us */
+ .swapgs = paravirt_nop,
-static const struct pv_irq_ops xen_irq_ops __initdata = {
- .init_IRQ = xen_init_IRQ,
- .save_fl = xen_save_fl,
- .restore_fl = xen_restore_fl,
- .irq_disable = xen_irq_disable,
- .irq_enable = xen_irq_enable,
- .safe_halt = xen_safe_halt,
- .halt = xen_halt,
+ .start_context_switch = paravirt_start_context_switch,
+ .end_context_switch = xen_end_context_switch,
};
-static const struct pv_apic_ops xen_apic_ops __initdata = {
+static const struct pv_apic_ops xen_apic_ops __initconst = {
#ifdef CONFIG_X86_LOCAL_APIC
- .apic_write = xen_apic_write,
- .apic_write_atomic = xen_apic_write,
- .apic_read = xen_apic_read,
- .setup_boot_clock = paravirt_nop,
- .setup_secondary_clock = paravirt_nop,
.startup_ipi_hook = paravirt_nop,
#endif
};
-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
- .pagetable_setup_start = xen_pagetable_setup_start,
- .pagetable_setup_done = xen_pagetable_setup_done,
-
- .read_cr2 = xen_read_cr2,
- .write_cr2 = xen_write_cr2,
-
- .read_cr3 = xen_read_cr3,
- .write_cr3 = xen_write_cr3,
-
- .flush_tlb_user = xen_flush_tlb,
- .flush_tlb_kernel = xen_flush_tlb,
- .flush_tlb_single = xen_flush_tlb_single,
- .flush_tlb_others = xen_flush_tlb_others,
-
- .pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
-
- .alloc_pt = xen_alloc_pt_init,
- .release_pt = xen_release_pt_init,
- .alloc_pd = xen_alloc_pt_init,
- .alloc_pd_clone = paravirt_nop,
- .release_pd = xen_release_pt_init,
-
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
- .set_pte = NULL, /* see xen_pagetable_setup_* */
- .set_pte_at = xen_set_pte_at,
- .set_pmd = xen_set_pmd,
-
- .pte_val = xen_pte_val,
- .pgd_val = xen_pgd_val,
-
- .make_pte = xen_make_pte,
- .make_pgd = xen_make_pgd,
-
-#ifdef CONFIG_X86_PAE
- .set_pte_atomic = xen_set_pte_atomic,
- .set_pte_present = xen_set_pte_at,
- .set_pud = xen_set_pud,
- .pte_clear = xen_pte_clear,
- .pmd_clear = xen_pmd_clear,
-
- .make_pmd = xen_make_pmd,
- .pmd_val = xen_pmd_val,
-#endif /* PAE */
-
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
- .exit_mmap = xen_exit_mmap,
-
- .lazy_mode = {
- .enter = paravirt_enter_lazy_mmu,
- .leave = xen_leave_lazy,
- },
-};
-
-#ifdef CONFIG_SMP
-static const struct smp_ops xen_smp_ops __initdata = {
- .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
- .smp_prepare_cpus = xen_smp_prepare_cpus,
- .cpu_up = xen_cpu_up,
- .smp_cpus_done = xen_smp_cpus_done,
-
- .smp_send_stop = xen_smp_send_stop,
- .smp_send_reschedule = xen_smp_send_reschedule,
- .smp_call_function_mask = xen_smp_call_function_mask,
-};
-#endif /* CONFIG_SMP */
-
static void xen_reboot(int reason)
{
-#ifdef CONFIG_SMP
- smp_send_stop();
-#endif
+ struct sched_shutdown r = { .reason = reason };
- if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
+ if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
}
@@ -1138,96 +1318,539 @@ static void xen_machine_halt(void)
xen_reboot(SHUTDOWN_poweroff);
}
+static void xen_machine_power_off(void)
+{
+ if (pm_power_off)
+ pm_power_off();
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
static void xen_crash_shutdown(struct pt_regs *regs)
{
xen_reboot(SHUTDOWN_crash);
}
-static const struct machine_ops __initdata xen_machine_ops = {
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ xen_reboot(SHUTDOWN_crash);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xen_panic_block = {
+ .notifier_call= xen_panic_event,
+ .priority = INT_MIN
+};
+
+int xen_panic_handler_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+ return 0;
+}
+
+static const struct machine_ops xen_machine_ops __initconst = {
.restart = xen_restart,
.halt = xen_machine_halt,
- .power_off = xen_machine_halt,
+ .power_off = xen_machine_power_off,
.shutdown = xen_machine_halt,
.crash_shutdown = xen_crash_shutdown,
.emergency_restart = xen_emergency_restart,
};
+static void __init xen_boot_params_init_edd(void)
+{
+#if IS_ENABLED(CONFIG_EDD)
+ struct xen_platform_op op;
+ struct edd_info *edd_info;
+ u32 *mbr_signature;
+ unsigned nr;
+ int ret;
+
+ edd_info = boot_params.eddbuf;
+ mbr_signature = boot_params.edd_mbr_sig_buffer;
+
+ op.cmd = XENPF_firmware_info;
+
+ op.u.firmware_info.type = XEN_FW_DISK_INFO;
+ for (nr = 0; nr < EDDMAXNR; nr++) {
+ struct edd_info *info = edd_info + nr;
+
+ op.u.firmware_info.index = nr;
+ info->params.length = sizeof(info->params);
+ set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
+ &info->params);
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ break;
+
+#define C(x) info->x = op.u.firmware_info.u.disk_info.x
+ C(device);
+ C(version);
+ C(interface_support);
+ C(legacy_max_cylinder);
+ C(legacy_max_head);
+ C(legacy_sectors_per_track);
+#undef C
+ }
+ boot_params.eddbuf_entries = nr;
+
+ op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
+ for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
+ op.u.firmware_info.index = nr;
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ break;
+ mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
+ }
+ boot_params.edd_mbr_sig_buf_entries = nr;
+#endif
+}
-static void __init xen_reserve_top(void)
+/*
+ * Set up the GDT and segment registers for -fstack-protector. Until
+ * we do this, we have to be careful not to call any stack-protected
+ * function, which is most of the kernel.
+ *
+ * Note, that it is __ref because the only caller of this after init
+ * is PVH which is not going to use xen_load_gdt_boot or other
+ * __init functions.
+ */
+static void __ref xen_setup_gdt(int cpu)
{
- unsigned long top = HYPERVISOR_VIRT_START;
- struct xen_platform_parameters pp;
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef CONFIG_X86_64
+ unsigned long dummy;
+
+ load_percpu_segment(cpu); /* We need to access per-cpu area */
+ switch_to_new_gdt(cpu); /* GDT and GS set */
+
+ /* We are switching of the Xen provided GDT to our HVM mode
+ * GDT. The new GDT has __KERNEL_CS with CS.L = 1
+ * and we are jumping to reload it.
+ */
+ asm volatile ("pushq %0\n"
+ "leaq 1f(%%rip),%0\n"
+ "pushq %0\n"
+ "lretq\n"
+ "1:\n"
+ : "=&r" (dummy) : "0" (__KERNEL_CS));
- if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
- top = pp.virt_start;
+ /*
+ * While not needed, we also set the %es, %ds, and %fs
+ * to zero. We don't care about %ss as it is NULL.
+ * Strictly speaking this is not needed as Xen zeros those
+ * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
+ *
+ * Linux zeros them in cpu_init() and in secondary_startup_64
+ * (for BSP).
+ */
+ loadsegment(es, 0);
+ loadsegment(ds, 0);
+ loadsegment(fs, 0);
+#else
+ /* PVH: TODO Implement. */
+ BUG();
+#endif
+ return; /* PVH does not need any PV GDT ops. */
+ }
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
+ pv_cpu_ops.load_gdt = xen_load_gdt_boot;
- reserve_top_address(-top + 2 * PAGE_SIZE);
+ setup_stack_canary_segment(0);
+ switch_to_new_gdt(0);
+
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
+ pv_cpu_ops.load_gdt = xen_load_gdt;
+}
+
+/*
+ * A PV guest starts with default flags that are not set for PVH, set them
+ * here asap.
+ */
+static void xen_pvh_set_cr_flags(int cpu)
+{
+
+ /* Some of these are setup in 'secondary_startup_64'. The others:
+ * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
+ * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
+ write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
+
+ if (!cpu)
+ return;
+ /*
+ * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
+ * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
+ */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ if (cpu_has_pge)
+ set_in_cr4(X86_CR4_PGE);
+}
+
+/*
+ * Note, that it is ref - because the only caller of this after init
+ * is PVH which is not going to use xen_load_gdt_boot or other
+ * __init functions.
+ */
+void __ref xen_pvh_secondary_vcpu_init(int cpu)
+{
+ xen_setup_gdt(cpu);
+ xen_pvh_set_cr_flags(cpu);
+}
+
+static void __init xen_pvh_early_guest_init(void)
+{
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (!xen_feature(XENFEAT_hvm_callback_vector))
+ return;
+
+ xen_have_vector_callback = 1;
+ xen_pvh_set_cr_flags(0);
+
+#ifdef CONFIG_X86_32
+ BUG(); /* PVH: Implement proper support. */
+#endif
}
/* First C function to be called on Xen boot */
-asmlinkage void __init xen_start_kernel(void)
+asmlinkage __visible void __init xen_start_kernel(void)
{
- pgd_t *pgd;
+ struct physdev_set_iopl set_iopl;
+ int rc;
if (!xen_start_info)
return;
- BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
+ xen_domain_type = XEN_PV_DOMAIN;
+
+ xen_setup_features();
+ xen_pvh_early_guest_init();
+ xen_setup_machphys_mapping();
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
- pv_time_ops = xen_time_ops;
- pv_cpu_ops = xen_cpu_ops;
- pv_irq_ops = xen_irq_ops;
pv_apic_ops = xen_apic_ops;
- pv_mmu_ops = xen_mmu_ops;
+ if (!xen_pvh_domain())
+ pv_cpu_ops = xen_cpu_ops;
- machine_ops = xen_machine_ops;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
+ else
+ x86_init.resources.memory_setup = xen_memory_setup;
+ x86_init.oem.arch_setup = xen_arch_setup;
+ x86_init.oem.banner = xen_banner;
-#ifdef CONFIG_SMP
- smp_ops = xen_smp_ops;
+ xen_init_time_ops();
+
+ /*
+ * Set up some pagetable state before starting to set any ptes.
+ */
+
+ xen_init_mmu_ops();
+
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+#if 0
+ if (!xen_initial_domain())
#endif
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
- xen_setup_features();
+ __supported_pte_mask |= _PAGE_IOMAP;
+
+ /*
+ * Prevent page tables from being allocated in highmem, even
+ * if CONFIG_HIGHPTE is enabled.
+ */
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
+ /* Work out if we support NX */
+ x86_configure_nx();
/* Get mfn list */
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+ xen_build_dynamic_phys_to_machine();
- pgd = (pgd_t *)xen_start_info->pt_base;
+ /*
+ * Set up kernel GDT and segment registers, mainly so that
+ * -fstack-protector code can be executed.
+ */
+ xen_setup_gdt(0);
- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+ xen_init_irq_ops();
+ xen_init_cpuid_mask();
- init_mm.pgd = pgd; /* use the Xen pagetables to start */
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * set up the basic apic ops.
+ */
+ set_xen_basic_apic_ops();
+#endif
- /* keep using Xen gdt for now; no urgent need to change it */
+ if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
+ pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
+ pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
+ }
- x86_write_percpu(xen_cr3, __pa(pgd));
- x86_write_percpu(xen_current_cr3, __pa(pgd));
+ machine_ops = xen_machine_ops;
+ /*
+ * The only reliable way to retain the initial address of the
+ * percpu gdt_page is to remember it here, so we can go and
+ * mark it RW later, when the initial percpu area is freed.
+ */
+ xen_initial_gdt = &per_cpu(gdt_page, 0);
+
+ xen_smp_init();
+
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * The pages we from Xen are not related to machine pages, so
+ * any NUMA information the kernel tries to get from ACPI will
+ * be meaningless. Prevent it from trying.
+ */
+ acpi_numa = -1;
+#endif
+#ifdef CONFIG_X86_PAT
+ /*
+ * For right now disable the PAT. We should remove this once
+ * git commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1
+ * (xen/pat: Disable PAT support for now) is reverted.
+ */
+ pat_enabled = 0;
+#endif
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+ local_irq_disable();
+ early_boot_irqs_disabled = true;
+
+ xen_raw_console_write("mapping kernel into physical memory\n");
+ xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+
+ /* Allocate and initialize top and mid mfn levels for p2m structure */
+ xen_build_mfn_list_list();
+
+ /* keep using Xen gdt for now; no urgent need to change it */
+
+#ifdef CONFIG_X86_32
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
-
+#else
+ pv_info.kernel_rpl = 0;
+#endif
/* set the limit of our address space */
xen_reserve_top();
+ /* PVH: runs at default kernel iopl of 0 */
+ if (!xen_pvh_domain()) {
+ /*
+ * We used to do this in xen_arch_setup, but that is too late
+ * on AMD were early_cpu_init (run before ->arch_setup()) calls
+ * early_amd_init which pokes 0xcf8 port.
+ */
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ xen_raw_printk("physdev_op failed %d\n", rc);
+ }
+
+#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
- new_cpu_data.hard_math = 1;
+ set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
+ new_cpu_data.wp_works_ok = 1;
new_cpu_data.x86_capability[0] = cpuid_edx(1);
+#endif
/* Poke various useful things into boot_params */
boot_params.hdr.type_of_loader = (9 << 4) | 0;
boot_params.hdr.ramdisk_image = xen_start_info->mod_start
? __pa(xen_start_info->mod_start) : 0;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+ boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
+
+ if (!xen_initial_domain()) {
+ add_preferred_console("xenboot", 0, NULL);
+ add_preferred_console("tty", 0, NULL);
+ add_preferred_console("hvc", 0, NULL);
+ if (pci_xen)
+ x86_init.pci.arch_init = pci_xen_init;
+ } else {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+ struct xen_platform_op op = {
+ .cmd = XENPF_firmware_info,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
+ };
+
+ xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+
+ if (HYPERVISOR_dom0_op(&op) == 0)
+ boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
+
+ xen_init_apic();
+
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+
+ xen_acpi_sleep_register();
+
+ /* Avoid searching for BIOS MP tables */
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
+ xen_boot_params_init_edd();
+ }
+#ifdef CONFIG_PCI
+ /* PCI BIOS service won't work from a PV guest. */
+ pci_probe &= ~PCI_PROBE_BIOS;
+#endif
+ xen_raw_console_write("about to get started...\n");
+
+ xen_setup_runstate_info(0);
/* Start the world */
- start_kernel();
+#ifdef CONFIG_X86_32
+ i386_start_kernel();
+#else
+ x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+#endif
+}
+
+void __ref xen_hvm_init_shared_info(void)
+{
+ int cpu;
+ struct xen_add_to_physmap xatp;
+ static struct shared_info *shared_info_page = 0;
+
+ if (!shared_info_page)
+ shared_info_page = (struct shared_info *)
+ extend_brk(PAGE_SIZE, PAGE_SIZE);
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ BUG();
+
+ HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+ /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+ * page, we use it in the event channel upcall and in some pvclock
+ * related functions. We don't need the vcpu_info placement
+ * optimizations because we don't use any pv_mmu or pv_irq op on
+ * HVM.
+ * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+ * online but xen_hvm_init_shared_info is run at resume time too and
+ * in that case multiple vcpus might be online. */
+ for_each_online_cpu(cpu) {
+ /* Leave it to be NULL. */
+ if (cpu >= MAX_VIRT_CPUS)
+ continue;
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ }
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void __init init_hvm_pv_info(void)
+{
+ int major, minor;
+ uint32_t eax, ebx, ecx, edx, pages, msr, base;
+ u64 pfn;
+
+ base = xen_cpuid_base();
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ major = eax >> 16;
+ minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+ cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ xen_setup_features();
+
+ pv_info.name = "Xen HVM";
+
+ xen_domain_type = XEN_HVM_DOMAIN;
}
+
+static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ switch (action) {
+ case CPU_UP_PREPARE:
+ xen_vcpu_setup(cpu);
+ if (xen_have_vector_callback) {
+ if (xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_setup_timer(cpu);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block xen_hvm_cpu_notifier = {
+ .notifier_call = xen_hvm_cpu_notify,
+};
+
+static void __init xen_hvm_guest_init(void)
+{
+ init_hvm_pv_info();
+
+ xen_hvm_init_shared_info();
+
+ xen_panic_handler_init();
+
+ if (xen_feature(XENFEAT_hvm_callback_vector))
+ xen_have_vector_callback = 1;
+ xen_hvm_smp_init();
+ register_cpu_notifier(&xen_hvm_cpu_notifier);
+ xen_unplug_emulated_devices();
+ x86_init.irqs.intr_init = xen_init_IRQ;
+ xen_hvm_init_time_ops();
+ xen_hvm_init_mmu_ops();
+}
+
+static uint32_t __init xen_hvm_platform(void)
+{
+ if (xen_pv_domain())
+ return 0;
+
+ return xen_cpuid_base();
+}
+
+bool xen_hvm_need_lapic(void)
+{
+ if (xen_pv_domain())
+ return false;
+ if (!xen_hvm_domain())
+ return false;
+ if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
+const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
+ .name = "Xen HVM",
+ .detect = xen_hvm_platform,
+ .init_platform = xen_hvm_guest_init,
+ .x2apic_available = xen_x2apic_para_available,
+};
+EXPORT_SYMBOL(x86_hyper_xen_hvm);
+#endif
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
deleted file mode 100644
index dcf613e1758..00000000000
--- a/arch/x86/xen/events.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Xen event channels
- *
- * Xen models interrupts with abstract event channels. Because each
- * domain gets 1024 event channels, but NR_IRQ is not that large, we
- * must dynamically map irqs<->event channels. The event channels
- * interface with the rest of the kernel by defining a xen interrupt
- * chip. When an event is recieved, it is mapped to an irq and sent
- * through the normal interrupt processing path.
- *
- * There are four kinds of events which can be mapped to an event
- * channel:
- *
- * 1. Inter-domain notifications. This includes all the virtual
- * device events, since they're driven by front-ends in another domain
- * (typically dom0).
- * 2. VIRQs, typically used for timers. These are per-cpu events.
- * 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/linkage.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/ptrace.h>
-#include <asm/irq.h>
-#include <asm/sync_bitops.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-
-#include <xen/events.h>
-#include <xen/interface/xen.h>
-#include <xen/interface/event_channel.h>
-
-#include "xen-ops.h"
-
-/*
- * This lock protects updates to the following mapping and reference-count
- * arrays. The lock does not need to be acquired to read the mapping tables.
- */
-static DEFINE_SPINLOCK(irq_mapping_update_lock);
-
-/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
-
-/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
-
-/* Packed IRQ information: binding type, sub-type index, and event channel. */
-struct packed_irq
-{
- unsigned short evtchn;
- unsigned char index;
- unsigned char type;
-};
-
-static struct packed_irq irq_info[NR_IRQS];
-
-/* Binding types. */
-enum {
- IRQT_UNBOUND,
- IRQT_PIRQ,
- IRQT_VIRQ,
- IRQT_IPI,
- IRQT_EVTCHN
-};
-
-/* Convenient shorthand for packed representation of an unbound IRQ. */
-#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
-
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
- [0 ... NR_EVENT_CHANNELS-1] = -1
-};
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
-static u8 cpu_evtchn[NR_EVENT_CHANNELS];
-
-/* Reference counts for bindings to IRQs. */
-static int irq_bindcount[NR_IRQS];
-
-/* Xen will never allocate port zero for any purpose. */
-#define VALID_EVTCHN(chn) ((chn) != 0)
-
-/*
- * Force a proper event-channel callback from Xen after clearing the
- * callback mask. We do this in a very simple manner, by making a call
- * down into Xen. The pending flag will be checked by Xen on return.
- */
-void force_evtchn_callback(void)
-{
- (void)HYPERVISOR_xen_version(0, NULL);
-}
-EXPORT_SYMBOL_GPL(force_evtchn_callback);
-
-static struct irq_chip xen_dynamic_chip;
-
-/* Constructor for packed IRQ information. */
-static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
-{
- return (struct packed_irq) { evtchn, index, type };
-}
-
-/*
- * Accessors for packed IRQ information.
- */
-static inline unsigned int evtchn_from_irq(int irq)
-{
- return irq_info[irq].evtchn;
-}
-
-static inline unsigned int index_from_irq(int irq)
-{
- return irq_info[irq].index;
-}
-
-static inline unsigned int type_from_irq(int irq)
-{
- return irq_info[irq].type;
-}
-
-static inline unsigned long active_evtchns(unsigned int cpu,
- struct shared_info *sh,
- unsigned int idx)
-{
- return (sh->evtchn_pending[idx] &
- cpu_evtchn_mask[cpu][idx] &
- ~sh->evtchn_mask[idx]);
-}
-
-static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
-{
- int irq = evtchn_to_irq[chn];
-
- BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
- irq_desc[irq].affinity = cpumask_of_cpu(cpu);
-#endif
-
- __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
- __set_bit(chn, cpu_evtchn_mask[cpu]);
-
- cpu_evtchn[chn] = cpu;
-}
-
-static void init_evtchn_cpu_bindings(void)
-{
-#ifdef CONFIG_SMP
- int i;
- /* By default all event channels notify CPU#0. */
- for (i = 0; i < NR_IRQS; i++)
- irq_desc[i].affinity = cpumask_of_cpu(0);
-#endif
-
- memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
- memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
-}
-
-static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
-{
- return cpu_evtchn[evtchn];
-}
-
-static inline void clear_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_clear_bit(port, &s->evtchn_pending[0]);
-}
-
-static inline void set_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_set_bit(port, &s->evtchn_pending[0]);
-}
-
-
-/**
- * notify_remote_via_irq - send event to remote end of event channel via irq
- * @irq: irq of event channel to send event to
- *
- * Unlike notify_remote_via_evtchn(), this is safe to use across
- * save/restore. Notifications on a broken connection are silently
- * dropped.
- */
-void notify_remote_via_irq(int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- notify_remote_via_evtchn(evtchn);
-}
-EXPORT_SYMBOL_GPL(notify_remote_via_irq);
-
-static void mask_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_set_bit(port, &s->evtchn_mask[0]);
-}
-
-static void unmask_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- unsigned int cpu = get_cpu();
-
- BUG_ON(!irqs_disabled());
-
- /* Slow path (hypercall) if this is a non-local port. */
- if (unlikely(cpu != cpu_from_evtchn(port))) {
- struct evtchn_unmask unmask = { .port = port };
- (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
- } else {
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-
- sync_clear_bit(port, &s->evtchn_mask[0]);
-
- /*
- * The following is basically the equivalent of
- * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
- * the interrupt edge' if the channel is masked.
- */
- if (sync_test_bit(port, &s->evtchn_pending[0]) &&
- !sync_test_and_set_bit(port / BITS_PER_LONG,
- &vcpu_info->evtchn_pending_sel))
- vcpu_info->evtchn_upcall_pending = 1;
- }
-
- put_cpu();
-}
-
-static int find_unbound_irq(void)
-{
- int irq;
-
- /* Only allocate from dynirq range */
- for (irq = 0; irq < NR_IRQS; irq++)
- if (irq_bindcount[irq] == 0)
- break;
-
- if (irq == NR_IRQS)
- panic("No available IRQ to bind to: increase NR_IRQS!\n");
-
- return irq;
-}
-
-int bind_evtchn_to_irq(unsigned int evtchn)
-{
- int irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = evtchn_to_irq[evtchn];
-
- if (irq == -1) {
- irq = find_unbound_irq();
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "event");
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
- }
-
- irq_bindcount[irq]++;
-
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
-
-static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
-{
- struct evtchn_bind_ipi bind_ipi;
- int evtchn, irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = per_cpu(ipi_to_irq, cpu)[ipi];
- if (irq == -1) {
- irq = find_unbound_irq();
- if (irq < 0)
- goto out;
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "ipi");
-
- bind_ipi.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
- &bind_ipi) != 0)
- BUG();
- evtchn = bind_ipi.port;
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
-
- per_cpu(ipi_to_irq, cpu)[ipi] = irq;
-
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-
- irq_bindcount[irq]++;
-
- out:
- spin_unlock(&irq_mapping_update_lock);
- return irq;
-}
-
-
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
-{
- struct evtchn_bind_virq bind_virq;
- int evtchn, irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = per_cpu(virq_to_irq, cpu)[virq];
-
- if (irq == -1) {
- bind_virq.virq = virq;
- bind_virq.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
- &bind_virq) != 0)
- BUG();
- evtchn = bind_virq.port;
-
- irq = find_unbound_irq();
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "virq");
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
-
- per_cpu(virq_to_irq, cpu)[virq] = irq;
-
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-
- irq_bindcount[irq]++;
-
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-
-static void unbind_from_irq(unsigned int irq)
-{
- struct evtchn_close close;
- int evtchn = evtchn_from_irq(irq);
-
- spin_lock(&irq_mapping_update_lock);
-
- if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
- close.port = evtchn;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
- BUG();
-
- switch (type_from_irq(irq)) {
- case IRQT_VIRQ:
- per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
- [index_from_irq(irq)] = -1;
- break;
- default:
- break;
- }
-
- /* Closed ports are implicitly re-bound to VCPU0. */
- bind_evtchn_to_cpu(evtchn, 0);
-
- evtchn_to_irq[evtchn] = -1;
- irq_info[irq] = IRQ_UNBOUND;
-
- dynamic_irq_init(irq);
- }
-
- spin_unlock(&irq_mapping_update_lock);
-}
-
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
- irq_handler_t handler,
- unsigned long irqflags,
- const char *devname, void *dev_id)
-{
- unsigned int irq;
- int retval;
-
- irq = bind_evtchn_to_irq(evtchn);
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-
-int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags, const char *devname, void *dev_id)
-{
- unsigned int irq;
- int retval;
-
- irq = bind_virq_to_irq(virq, cpu);
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
-
-int bind_ipi_to_irqhandler(enum ipi_vector ipi,
- unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags,
- const char *devname,
- void *dev_id)
-{
- int irq, retval;
-
- irq = bind_ipi_to_irq(ipi, cpu);
- if (irq < 0)
- return irq;
-
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-
-void unbind_from_irqhandler(unsigned int irq, void *dev_id)
-{
- free_irq(irq, dev_id);
- unbind_from_irq(irq);
-}
-EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
-
-void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
-{
- int irq = per_cpu(ipi_to_irq, cpu)[vector];
- BUG_ON(irq < 0);
- notify_remote_via_irq(irq);
-}
-
-
-/*
- * Search the CPUs pending events bitmasks. For each one found, map
- * the event number to an irq, and feed it into do_IRQ() for
- * handling.
- *
- * Xen uses a two-level bitmap to speed searching. The first level is
- * a bitset of words which contain pending event bits. The second
- * level is a bitset of pending events themselves.
- */
-void xen_evtchn_do_upcall(struct pt_regs *regs)
-{
- int cpu = get_cpu();
- struct shared_info *s = HYPERVISOR_shared_info;
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- unsigned long pending_words;
-
- vcpu_info->evtchn_upcall_pending = 0;
-
- /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
- pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
- while (pending_words != 0) {
- unsigned long pending_bits;
- int word_idx = __ffs(pending_words);
- pending_words &= ~(1UL << word_idx);
-
- while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
-
- if (irq != -1) {
- regs->orig_ax = ~irq;
- do_IRQ(regs);
- }
- }
- }
-
- put_cpu();
-}
-
-/* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
-{
- struct evtchn_bind_vcpu bind_vcpu;
- int evtchn = evtchn_from_irq(irq);
-
- if (!VALID_EVTCHN(evtchn))
- return;
-
- /* Send future instances of this interrupt to other vcpu. */
- bind_vcpu.port = evtchn;
- bind_vcpu.vcpu = tcpu;
-
- /*
- * If this fails, it usually just indicates that we're dealing with a
- * virq or IPI channel, which don't actually need to be rebound. Ignore
- * it, but don't do the xenlinux-level rebind in that case.
- */
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
- bind_evtchn_to_cpu(evtchn, tcpu);
-}
-
-
-static void set_affinity_irq(unsigned irq, cpumask_t dest)
-{
- unsigned tcpu = first_cpu(dest);
- rebind_irq_to_cpu(irq, tcpu);
-}
-
-static void enable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- unmask_evtchn(evtchn);
-}
-
-static void disable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- mask_evtchn(evtchn);
-}
-
-static void ack_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- move_native_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
-}
-
-static int retrigger_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
- int ret = 0;
-
- if (VALID_EVTCHN(evtchn)) {
- set_evtchn(evtchn);
- ret = 1;
- }
-
- return ret;
-}
-
-static struct irq_chip xen_dynamic_chip __read_mostly = {
- .name = "xen-dyn",
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
- .ack = ack_dynirq,
- .set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
-};
-
-void __init xen_init_IRQ(void)
-{
- int i;
-
- init_evtchn_cpu_bindings();
-
- /* No event channels are 'live' right now. */
- for (i = 0; i < NR_EVENT_CHANNELS; i++)
- mask_evtchn(i);
-
- /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
- for (i = 0; i < NR_IRQS; i++)
- irq_bindcount[i] = 0;
-
- irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c
deleted file mode 100644
index 0707714e40d..00000000000
--- a/arch/x86/xen/features.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/******************************************************************************
- * features.c
- *
- * Xen feature flags.
- *
- * Copyright (c) 2006, Ian Campbell, XenSource Inc.
- */
-#include <linux/types.h>
-#include <linux/cache.h>
-#include <linux/module.h>
-#include <asm/xen/hypervisor.h>
-#include <xen/features.h>
-
-u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
-EXPORT_SYMBOL_GPL(xen_features);
-
-void xen_setup_features(void)
-{
- struct xen_feature_info fi;
- int i, j;
-
- for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
- fi.submap_idx = i;
- if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
- break;
- for (j = 0; j < 32; j++)
- xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
- }
-}
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
new file mode 100644
index 00000000000..ebfa9b2c871
--- /dev/null
+++ b/arch/x86/xen/grant-table.c
@@ -0,0 +1,225 @@
+/******************************************************************************
+ * grant_table.c
+ * x86 specific part
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan. Split out x86 specific part.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <xen/interface/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <xen/xen.h>
+
+#include <asm/pgtable.h>
+
+static struct gnttab_vm_area {
+ struct vm_struct *area;
+ pte_t **ptes;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ void **__shared)
+{
+ void *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_shared_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
+{
+ grant_status_t *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_status_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
+{
+ pte_t **ptes;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == gnttab_status_vm_area.area->addr)
+ ptes = gnttab_status_vm_area.ptes;
+ else
+ ptes = gnttab_shared_vm_area.ptes;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, ptes[i], __pte(0));
+ addr += PAGE_SIZE;
+ }
+}
+
+static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
+{
+ area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+ if (area->ptes == NULL)
+ return -ENOMEM;
+
+ area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
+ if (area->area == NULL) {
+ kfree(area->ptes);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
+{
+ free_vm_area(area->area);
+ kfree(area->ptes);
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
+{
+ int ret;
+
+ if (!xen_pv_domain())
+ return 0;
+
+ ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Always allocate the space for the status frames in case
+ * we're migrated to a host with V2 support.
+ */
+ ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+ err:
+ arch_gnttab_vfree(&gnttab_shared_vm_area);
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_XEN_PVH
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <linux/slab.h>
+static int __init xlated_setup_gnttab_pages(void)
+{
+ struct page **pages;
+ xen_pfn_t *pfns;
+ int rc;
+ unsigned int i;
+ unsigned long nr_grant_frames = gnttab_max_grant_frames();
+
+ BUG_ON(nr_grant_frames == 0);
+ pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL);
+ if (!pfns) {
+ kfree(pages);
+ return -ENOMEM;
+ }
+ rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
+ if (rc) {
+ pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
+ nr_grant_frames, rc);
+ kfree(pages);
+ kfree(pfns);
+ return rc;
+ }
+ for (i = 0; i < nr_grant_frames; i++)
+ pfns[i] = page_to_pfn(pages[i]);
+
+ rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
+ &xen_auto_xlat_grant_frames.vaddr);
+
+ if (rc) {
+ pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
+ nr_grant_frames, rc);
+ free_xenballooned_pages(nr_grant_frames, pages);
+ kfree(pages);
+ kfree(pfns);
+ return rc;
+ }
+ kfree(pages);
+
+ xen_auto_xlat_grant_frames.pfn = pfns;
+ xen_auto_xlat_grant_frames.count = nr_grant_frames;
+
+ return 0;
+}
+
+static int __init xen_pvh_gnttab_setup(void)
+{
+ if (!xen_pvh_domain())
+ return -ENODEV;
+
+ return xlated_setup_gnttab_pages();
+}
+/* Call it _before_ __gnttab_init as we need to initialize the
+ * xen_auto_xlat_grant_frames first. */
+core_initcall(xen_pvh_gnttab_setup);
+#endif
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 00000000000..a1207cb6472
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,136 @@
+#include <linux/hardirq.h>
+
+#include <asm/x86_init.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+#include <xen/features.h>
+#include <xen/events.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void xen_force_evtchn_callback(void)
+{
+ (void)HYPERVISOR_xen_version(0, NULL);
+}
+
+asmlinkage __visible unsigned long xen_save_fl(void)
+{
+ struct vcpu_info *vcpu;
+ unsigned long flags;
+
+ vcpu = this_cpu_read(xen_vcpu);
+
+ /* flag has opposite sense of mask */
+ flags = !vcpu->evtchn_upcall_mask;
+
+ /* convert to IF type flag
+ -0 -> 0x00000000
+ -1 -> 0xffffffff
+ */
+ return (-flags) & X86_EFLAGS_IF;
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
+
+__visible void xen_restore_fl(unsigned long flags)
+{
+ struct vcpu_info *vcpu;
+
+ /* convert from IF type flag */
+ flags = !(flags & X86_EFLAGS_IF);
+
+ /* See xen_irq_enable() for why preemption must be disabled. */
+ preempt_disable();
+ vcpu = this_cpu_read(xen_vcpu);
+ vcpu->evtchn_upcall_mask = flags;
+
+ if (flags == 0) {
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ xen_force_evtchn_callback();
+ preempt_enable();
+ } else
+ preempt_enable_no_resched();
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
+
+asmlinkage __visible void xen_irq_disable(void)
+{
+ /* There's a one instruction preempt window here. We need to
+ make sure we're don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
+ preempt_enable_no_resched();
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
+
+asmlinkage __visible void xen_irq_enable(void)
+{
+ struct vcpu_info *vcpu;
+
+ /*
+ * We may be preempted as soon as vcpu->evtchn_upcall_mask is
+ * cleared, so disable preemption to ensure we check for
+ * events on the VCPU we are still running on.
+ */
+ preempt_disable();
+
+ vcpu = this_cpu_read(xen_vcpu);
+ vcpu->evtchn_upcall_mask = 0;
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ xen_force_evtchn_callback();
+
+ preempt_enable();
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
+
+static void xen_safe_halt(void)
+{
+ /* Blocking includes an implicit local_irq_enable(). */
+ if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
+ BUG();
+}
+
+static void xen_halt(void)
+{
+ if (irqs_disabled())
+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+ else
+ xen_safe_halt();
+}
+
+static const struct pv_irq_ops xen_irq_ops __initconst = {
+ .save_fl = PV_CALLEE_SAVE(xen_save_fl),
+ .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
+ .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
+ .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+#ifdef CONFIG_X86_64
+ .adjust_exception_frame = xen_adjust_exception_frame,
+#endif
+};
+
+void __init xen_init_irq_ops(void)
+{
+ /* For PVH we use default pv_irq_ops settings. */
+ if (!xen_feature(XENFEAT_hvm_callback_vector))
+ pv_irq_ops = xen_irq_ops;
+ x86_init.irqs.intr_init = xen_init_IRQ;
+}
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
deleted file mode 100644
index aa7af9e6abc..00000000000
--- a/arch/x86/xen/manage.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Handle extern requests for shutdown, reboot and sysrq
- */
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/reboot.h>
-#include <linux/sysrq.h>
-
-#include <xen/xenbus.h>
-
-#define SHUTDOWN_INVALID -1
-#define SHUTDOWN_POWEROFF 0
-#define SHUTDOWN_SUSPEND 2
-/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
- * report a crash, not be instructed to crash!
- * HALT is the same as POWEROFF, as far as we're concerned. The tools use
- * the distinction when we return the reason code to them.
- */
-#define SHUTDOWN_HALT 4
-
-/* Ignore multiple shutdown requests. */
-static int shutting_down = SHUTDOWN_INVALID;
-
-static void shutdown_handler(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
-{
- char *str;
- struct xenbus_transaction xbt;
- int err;
-
- if (shutting_down != SHUTDOWN_INVALID)
- return;
-
- again:
- err = xenbus_transaction_start(&xbt);
- if (err)
- return;
-
- str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
- /* Ignore read errors and empty reads. */
- if (XENBUS_IS_ERR_READ(str)) {
- xenbus_transaction_end(xbt, 1);
- return;
- }
-
- xenbus_write(xbt, "control", "shutdown", "");
-
- err = xenbus_transaction_end(xbt, 0);
- if (err == -EAGAIN) {
- kfree(str);
- goto again;
- }
-
- if (strcmp(str, "poweroff") == 0 ||
- strcmp(str, "halt") == 0)
- orderly_poweroff(false);
- else if (strcmp(str, "reboot") == 0)
- ctrl_alt_del();
- else {
- printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
- shutting_down = SHUTDOWN_INVALID;
- }
-
- kfree(str);
-}
-
-static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
- unsigned int len)
-{
- char sysrq_key = '\0';
- struct xenbus_transaction xbt;
- int err;
-
- again:
- err = xenbus_transaction_start(&xbt);
- if (err)
- return;
- if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
- printk(KERN_ERR "Unable to read sysrq code in "
- "control/sysrq\n");
- xenbus_transaction_end(xbt, 1);
- return;
- }
-
- if (sysrq_key != '\0')
- xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
-
- err = xenbus_transaction_end(xbt, 0);
- if (err == -EAGAIN)
- goto again;
-
- if (sysrq_key != '\0')
- handle_sysrq(sysrq_key, NULL);
-}
-
-static struct xenbus_watch shutdown_watch = {
- .node = "control/shutdown",
- .callback = shutdown_handler
-};
-
-static struct xenbus_watch sysrq_watch = {
- .node = "control/sysrq",
- .callback = sysrq_handler
-};
-
-static int setup_shutdown_watcher(void)
-{
- int err;
-
- err = register_xenbus_watch(&shutdown_watch);
- if (err) {
- printk(KERN_ERR "Failed to set shutdown watcher\n");
- return err;
- }
-
- err = register_xenbus_watch(&sysrq_watch);
- if (err) {
- printk(KERN_ERR "Failed to set sysrq watcher\n");
- return err;
- }
-
- return 0;
-}
-
-static int shutdown_event(struct notifier_block *notifier,
- unsigned long event,
- void *data)
-{
- setup_shutdown_watcher();
- return NOTIFY_DONE;
-}
-
-static int __init setup_shutdown_event(void)
-{
- static struct notifier_block xenstore_notifier = {
- .notifier_call = shutdown_event
- };
- register_xenstore_notifier(&xenstore_notifier);
-
- return 0;
-}
-
-subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 2a054ef2a3d..e8a1201c329 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,32 +40,118 @@
*/
#include <linux/sched.h>
#include <linux/highmem.h>
+#include <linux/debugfs.h>
#include <linux/bug.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+
+#include <trace/events/xen.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
#include <asm/mmu_context.h>
+#include <asm/setup.h>
#include <asm/paravirt.h>
+#include <asm/e820.h>
+#include <asm/linkage.h>
+#include <asm/page.h>
+#include <asm/init.h>
+#include <asm/pat.h>
+#include <asm/smp.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/version.h>
+#include <xen/interface/memory.h>
+#include <xen/hvc-console.h>
#include "multicalls.h"
#include "mmu.h"
+#include "debugfs.h"
+
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and balloon lists.
+ */
+DEFINE_SPINLOCK(xen_reservation_lock);
+
+#ifdef CONFIG_X86_32
+/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
+static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
+#endif
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3. This may not be the current effective cr3, because
+ * its update may be being lazily deferred. However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early). If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+
+/*
+ * Just beyond the highest usermode address. STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
+ xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
+
+ return PFN_DOWN(maddr.maddr);
+}
+
+xmaddr_t arbitrary_virt_to_machine(void *vaddr)
+{
+ unsigned long address = (unsigned long)vaddr;
unsigned int level;
- pte_t *pte = lookup_address(address, &level);
- unsigned offset = address & PAGE_MASK;
+ pte_t *pte;
+ unsigned offset;
- BUG_ON(pte == NULL);
+ /*
+ * if the PFN is in the linear mapped vaddr range, we can just use
+ * the (quick) virt_to_machine() p2m lookup
+ */
+ if (virt_addr_valid(vaddr))
+ return virt_to_machine(vaddr);
- return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+ /* otherwise we have to do a (slower) full page-table walk */
+
+ pte = lookup_address(address, &level);
+ BUG_ON(pte == NULL);
+ offset = address & ~PAGE_MASK;
+ return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
+EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
void make_lowmem_page_readonly(void *vaddr)
{
@@ -74,7 +160,8 @@ void make_lowmem_page_readonly(void *vaddr)
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_wrprotect(*pte);
@@ -89,7 +176,8 @@ void make_lowmem_page_readwrite(void *vaddr)
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_mkwrite(*pte);
@@ -98,307 +186,625 @@ void make_lowmem_page_readwrite(void *vaddr)
}
-void xen_set_pmd(pmd_t *ptr, pmd_t val)
+static bool xen_page_pinned(void *ptr)
+{
+ struct page *page = virt_to_page(ptr);
+
+ return PagePinned(page);
+}
+
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
struct multicall_space mcs;
struct mmu_update *u;
- preempt_disable();
+ trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
mcs = xen_mc_entry(sizeof(*u));
u = mcs.args;
- u->ptr = virt_to_machine(ptr).maddr;
- u->val = pmd_val_ma(val);
- MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+ /* ptep might be kmapped when using 32-bit HIGHPTE */
+ u->ptr = virt_to_machine(ptep).maddr;
+ u->val = pte_val_ma(pteval);
+
+ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+EXPORT_SYMBOL_GPL(xen_set_domain_pte);
+
+static void xen_extend_mmu_update(const struct mmu_update *update)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
+
+ if (mcs.mc != NULL) {
+ mcs.mc->args[1]++;
+ } else {
+ mcs = __xen_mc_entry(sizeof(*u));
+ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+ }
+
+ u = mcs.args;
+ *u = *update;
+}
+
+static void xen_extend_mmuext_op(const struct mmuext_op *op)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *u;
+
+ mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
+
+ if (mcs.mc != NULL) {
+ mcs.mc->args[1]++;
+ } else {
+ mcs = __xen_mc_entry(sizeof(*u));
+ MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+ }
+
+ u = mcs.args;
+ *u = *op;
+}
+
+static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
+{
+ struct mmu_update u;
+
+ preempt_disable();
+
+ xen_mc_batch();
+
+ /* ptr may be ioremapped for 64-bit pagetable setup */
+ u.ptr = arbitrary_virt_to_machine(ptr).maddr;
+ u.val = pmd_val_ma(val);
+ xen_extend_mmu_update(&u);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
+static void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+ trace_xen_mmu_set_pmd(ptr, val);
+
+ /* If page is not pinned, we can just update the entry
+ directly */
+ if (!xen_page_pinned(ptr)) {
+ *ptr = val;
+ return;
+ }
+
+ xen_set_pmd_hyper(ptr, val);
+}
+
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
*/
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
+}
- pgd = swapper_pg_dir + pgd_index(vaddr);
- if (pgd_none(*pgd)) {
- BUG();
- return;
- }
- pud = pud_offset(pgd, vaddr);
- if (pud_none(*pud)) {
- BUG();
- return;
- }
- pmd = pmd_offset(pud, vaddr);
- if (pmd_none(*pmd)) {
- BUG();
- return;
+static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
+{
+ struct mmu_update u;
+
+ if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+ return false;
+
+ xen_mc_batch();
+
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ xen_extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ return true;
+}
+
+static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ if (!xen_batched_set_pte(ptep, pteval)) {
+ /*
+ * Could call native_set_pte() here and trap and
+ * emulate the PTE write but with 32-bit guests this
+ * needs two traps (one for each of the two 32-bit
+ * words in the PTE) so do one hypercall directly
+ * instead.
+ */
+ struct mmu_update u;
+
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
}
- pte = pte_offset_kernel(pmd, vaddr);
- /* <mfn,flags> stored as-is, to permit clearing entries */
- xen_set_pte(pte, mfn_pte(mfn, flags));
+}
- /*
- * It's enough to flush this one mapping.
- * (PGE mappings get flushed as well)
- */
- __flush_tlb_one(vaddr);
+static void xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ trace_xen_mmu_set_pte(ptep, pteval);
+ __xen_set_pte(ptep, pteval);
}
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
- if (mm == current->mm || mm == &init_mm) {
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- struct multicall_space mcs;
- mcs = xen_mc_entry(0);
+ trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
+ __xen_set_pte(ptep, pteval);
+}
- MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
- return;
- } else
- if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
- return;
- }
- xen_set_pte(ptep, pteval);
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ /* Just return the pte as-is. We preserve the bits on commit */
+ trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
+ return *ptep;
}
-#ifdef CONFIG_X86_PAE
-void xen_set_pud(pud_t *ptr, pud_t val)
+void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
{
- struct multicall_space mcs;
- struct mmu_update *u;
+ struct mmu_update u;
- preempt_disable();
+ trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
+ xen_mc_batch();
- mcs = xen_mc_entry(sizeof(*u));
- u = mcs.args;
- u->ptr = virt_to_machine(ptr).maddr;
- u->val = pud_val_ma(val);
- MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+ u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+ u.val = pte_val_ma(pte);
+ xen_extend_mmu_update(&u);
xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
- preempt_enable();
+/* Assume pteval_t is equivalent to all the other *val_t types. */
+static pteval_t pte_mfn_to_pfn(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ unsigned long pfn = mfn_to_pfn(mfn);
+
+ pteval_t flags = val & PTE_FLAGS_MASK;
+ if (unlikely(pfn == ~0))
+ val = flags & ~_PAGE_PRESENT;
+ else
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
}
-void xen_set_pte(pte_t *ptep, pte_t pte)
+static pteval_t pte_pfn_to_mfn(pteval_t val)
{
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & PTE_FLAGS_MASK;
+ unsigned long mfn;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ mfn = get_phys_to_machine(pfn);
+ else
+ mfn = pfn;
+ /*
+ * If there's no mfn for the pfn, then just create an
+ * empty non-present pte. Unfortunately this loses
+ * information about the original pfn, so
+ * pte_mfn_to_pfn is asymmetric.
+ */
+ if (unlikely(mfn == INVALID_P2M_ENTRY)) {
+ mfn = 0;
+ flags = 0;
+ } else {
+ /*
+ * Paramount to do this test _after_ the
+ * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
+ * IDENTITY_FRAME_BIT resolves to true.
+ */
+ mfn &= ~FOREIGN_FRAME_BIT;
+ if (mfn & IDENTITY_FRAME_BIT) {
+ mfn &= ~IDENTITY_FRAME_BIT;
+ flags |= _PAGE_IOMAP;
+ }
+ }
+ val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
}
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+static pteval_t iomap_pte(pteval_t val)
{
- set_64bit((u64 *)ptep, pte_val_ma(pte));
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & PTE_FLAGS_MASK;
+
+ /* We assume the pte frame number is a MFN, so
+ just use it as-is. */
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
}
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+__visible pteval_t xen_pte_val(pte_t pte)
{
- ptep->pte_low = 0;
- smp_wmb(); /* make sure low gets written first */
- ptep->pte_high = 0;
+ pteval_t pteval = pte.pte;
+#if 0
+ /* If this is a WC pte, convert back from Xen WC to Linux WC */
+ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
+ WARN_ON(!pat_enabled);
+ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
+ }
+#endif
+ if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
+ return pteval;
+
+ return pte_mfn_to_pfn(pteval);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
-void xen_pmd_clear(pmd_t *pmdp)
+__visible pgdval_t xen_pgd_val(pgd_t pgd)
{
- xen_set_pmd(pmdp, __pmd(0));
+ return pte_mfn_to_pfn(pgd.pgd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
-unsigned long long xen_pte_val(pte_t pte)
+/*
+ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
+ * are reserved for now, to correspond to the Intel-reserved PAT
+ * types.
+ *
+ * We expect Linux's PAT set as follows:
+ *
+ * Idx PTE flags Linux Xen Default
+ * 0 WB WB WB
+ * 1 PWT WC WT WT
+ * 2 PCD UC- UC- UC-
+ * 3 PCD PWT UC UC UC
+ * 4 PAT WB WC WB
+ * 5 PAT PWT WC WP WT
+ * 6 PAT PCD UC- rsv UC-
+ * 7 PAT PCD PWT UC rsv UC
+ */
+
+void xen_set_pat(u64 pat)
{
- unsigned long long ret = 0;
+ /* We expect Linux to use a PAT setting of
+ * UC UC- WC WB (ignoring the PAT flag) */
+ WARN_ON(pat != 0x0007010600070106ull);
+}
- if (pte.pte_low) {
- ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+__visible pte_t xen_make_pte(pteval_t pte)
+{
+ phys_addr_t addr = (pte & PTE_PFN_MASK);
+#if 0
+ /* If Linux is trying to set a WC pte, then map to the Xen WC.
+ * If _PAGE_PAT is set, then it probably means it is really
+ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
+ * things work out OK...
+ *
+ * (We should never see kernel mappings with _PAGE_PSE set,
+ * but we could see hugetlbfs mappings, I think.).
+ */
+ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
+ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
+ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
+ }
+#endif
+ /*
+ * Unprivileged domains are allowed to do IOMAPpings for
+ * PCI passthrough, but not map ISA space. The ISA
+ * mappings are just dummy local mappings to keep other
+ * parts of the kernel happy.
+ */
+ if (unlikely(pte & _PAGE_IOMAP) &&
+ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+ pte = iomap_pte(pte);
+ } else {
+ pte &= ~_PAGE_IOMAP;
+ pte = pte_pfn_to_mfn(pte);
}
- return ret;
+ return native_make_pte(pte);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
-unsigned long long xen_pmd_val(pmd_t pmd)
+__visible pgd_t xen_make_pgd(pgdval_t pgd)
{
- unsigned long long ret = pmd.pmd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
+ pgd = pte_pfn_to_mfn(pgd);
+ return native_make_pgd(pgd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
-unsigned long long xen_pgd_val(pgd_t pgd)
+__visible pmdval_t xen_pmd_val(pmd_t pmd)
{
- unsigned long long ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
+ return pte_mfn_to_pfn(pmd.pmd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
-pte_t xen_make_pte(unsigned long long pte)
+static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
- if (pte & _PAGE_PRESENT) {
- pte = phys_to_machine(XPADDR(pte)).maddr;
- pte &= ~(_PAGE_PCD | _PAGE_PWT);
+ struct mmu_update u;
+
+ preempt_disable();
+
+ xen_mc_batch();
+
+ /* ptr may be ioremapped for 64-bit pagetable setup */
+ u.ptr = arbitrary_virt_to_machine(ptr).maddr;
+ u.val = pud_val_ma(val);
+ xen_extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_set_pud(pud_t *ptr, pud_t val)
+{
+ trace_xen_mmu_set_pud(ptr, val);
+
+ /* If page is not pinned, we can just update the entry
+ directly */
+ if (!xen_page_pinned(ptr)) {
+ *ptr = val;
+ return;
}
- return (pte_t){ .pte = pte };
+ xen_set_pud_hyper(ptr, val);
}
-pmd_t xen_make_pmd(unsigned long long pmd)
+#ifdef CONFIG_X86_PAE
+static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- if (pmd & 1)
- pmd = phys_to_machine(XPADDR(pmd)).maddr;
+ trace_xen_mmu_set_pte_atomic(ptep, pte);
+ set_64bit((u64 *)ptep, native_pte_val(pte));
+}
- return (pmd_t){ pmd };
+static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ trace_xen_mmu_pte_clear(mm, addr, ptep);
+ if (!xen_batched_set_pte(ptep, native_make_pte(0)))
+ native_pte_clear(mm, addr, ptep);
}
-pgd_t xen_make_pgd(unsigned long long pgd)
+static void xen_pmd_clear(pmd_t *pmdp)
{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
+ trace_xen_mmu_pmd_clear(pmdp);
+ set_pmd(pmdp, __pmd(0));
+}
+#endif /* CONFIG_X86_PAE */
- return (pgd_t){ pgd };
+__visible pmd_t xen_make_pmd(pmdval_t pmd)
+{
+ pmd = pte_pfn_to_mfn(pmd);
+ return native_make_pmd(pmd);
}
-#else /* !PAE */
-void xen_set_pte(pte_t *ptep, pte_t pte)
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
+
+#if PAGETABLE_LEVELS == 4
+__visible pudval_t xen_pud_val(pud_t pud)
{
- *ptep = pte;
+ return pte_mfn_to_pfn(pud.pud);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
-unsigned long xen_pte_val(pte_t pte)
+__visible pud_t xen_make_pud(pudval_t pud)
{
- unsigned long ret = pte.pte_low;
+ pud = pte_pfn_to_mfn(pud);
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr;
+ return native_make_pud(pud);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
- return ret;
+static pgd_t *xen_get_user_pgd(pgd_t *pgd)
+{
+ pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+ unsigned offset = pgd - pgd_page;
+ pgd_t *user_ptr = NULL;
+
+ if (offset < pgd_index(USER_LIMIT)) {
+ struct page *page = virt_to_page(pgd_page);
+ user_ptr = (pgd_t *)page->private;
+ if (user_ptr)
+ user_ptr += offset;
+ }
+
+ return user_ptr;
}
-unsigned long xen_pgd_val(pgd_t pgd)
+static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
- unsigned long ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
+ struct mmu_update u;
+
+ u.ptr = virt_to_machine(ptr).maddr;
+ u.val = pgd_val_ma(val);
+ xen_extend_mmu_update(&u);
}
-pte_t xen_make_pte(unsigned long pte)
+/*
+ * Raw hypercall-based set_pgd, intended for in early boot before
+ * there's a page structure. This implies:
+ * 1. The only existing pagetable is the kernel's
+ * 2. It is always pinned
+ * 3. It has no user pagetable attached to it
+ */
+static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
- if (pte & _PAGE_PRESENT) {
- pte = phys_to_machine(XPADDR(pte)).maddr;
- pte &= ~(_PAGE_PCD | _PAGE_PWT);
- }
+ preempt_disable();
+
+ xen_mc_batch();
- return (pte_t){ pte };
+ __xen_set_pgd_hyper(ptr, val);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
}
-pgd_t xen_make_pgd(unsigned long pgd)
+static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
+ pgd_t *user_ptr = xen_get_user_pgd(ptr);
- return (pgd_t){ pgd };
+ trace_xen_mmu_set_pgd(ptr, user_ptr, val);
+
+ /* If page is not pinned, we can just update the entry
+ directly */
+ if (!xen_page_pinned(ptr)) {
+ *ptr = val;
+ if (user_ptr) {
+ WARN_ON(xen_page_pinned(user_ptr));
+ *user_ptr = val;
+ }
+ return;
+ }
+
+ /* If it's pinned, then we can at least batch the kernel and
+ user updates together. */
+ xen_mc_batch();
+
+ __xen_set_pgd_hyper(ptr, val);
+ if (user_ptr)
+ __xen_set_pgd_hyper(user_ptr, val);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
}
-#endif /* CONFIG_X86_PAE */
+#endif /* PAGETABLE_LEVELS == 4 */
/*
- (Yet another) pagetable walker. This one is intended for pinning a
- pagetable. This means that it walks a pagetable and calls the
- callback function on each page it finds making up the page table,
- at every level. It walks the entire pagetable, but it only bothers
- pinning pte pages which are below pte_limit. In the normal case
- this will be TASK_SIZE, but at boot we need to pin up to
- FIXADDR_TOP. But the important bit is that we don't pin beyond
- there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
- unsigned long limit)
-{
- pgd_t *pgd = pgd_base;
+ * (Yet another) pagetable walker. This one is intended for pinning a
+ * pagetable. This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level. It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit. In the normal case this
+ * will be STACK_TOP_MAX, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
+{
int flush = 0;
- unsigned long addr = 0;
- unsigned long pgd_next;
+ unsigned hole_low, hole_high;
+ unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
+ unsigned pgdidx, pudidx, pmdidx;
- BUG_ON(limit > FIXADDR_TOP);
+ /* The limit is the last byte to be touched */
+ limit--;
+ BUG_ON(limit >= FIXADDR_TOP);
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
- for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+ /*
+ * 64-bit has a great big hole in the middle of the address
+ * space, which contains the Xen mappings. On 32-bit these
+ * will end up making a zero-sized hole and so is a no-op.
+ */
+ hole_low = pgd_index(USER_LIMIT);
+ hole_high = pgd_index(PAGE_OFFSET);
+
+ pgdidx_limit = pgd_index(limit);
+#if PTRS_PER_PUD > 1
+ pudidx_limit = pud_index(limit);
+#else
+ pudidx_limit = 0;
+#endif
+#if PTRS_PER_PMD > 1
+ pmdidx_limit = pmd_index(limit);
+#else
+ pmdidx_limit = 0;
+#endif
+
+ for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
pud_t *pud;
- unsigned long pud_limit, pud_next;
- pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+ if (pgdidx >= hole_low && pgdidx < hole_high)
+ continue;
- if (!pgd_val(*pgd))
+ if (!pgd_val(pgd[pgdidx]))
continue;
- pud = pud_offset(pgd, 0);
+ pud = pud_offset(&pgd[pgdidx], 0);
if (PTRS_PER_PUD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pud), PT_PUD);
+ flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
- for (; addr != pud_limit; pud++, addr = pud_next) {
+ for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
pmd_t *pmd;
- unsigned long pmd_limit;
- pud_next = pud_addr_end(addr, pud_limit);
-
- if (pud_next < limit)
- pmd_limit = pud_next;
- else
- pmd_limit = limit;
+ if (pgdidx == pgdidx_limit &&
+ pudidx > pudidx_limit)
+ goto out;
- if (pud_none(*pud))
+ if (pud_none(pud[pudidx]))
continue;
- pmd = pmd_offset(pud, 0);
+ pmd = pmd_offset(&pud[pudidx], 0);
if (PTRS_PER_PMD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pmd), PT_PMD);
+ flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
- for (; addr != pmd_limit; pmd++) {
- addr += (PAGE_SIZE * PTRS_PER_PTE);
- if ((pmd_limit-1) < (addr-1)) {
- addr = pmd_limit;
- break;
- }
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
+ struct page *pte;
- if (pmd_none(*pmd))
+ if (pgdidx == pgdidx_limit &&
+ pudidx == pudidx_limit &&
+ pmdidx > pmdidx_limit)
+ goto out;
+
+ if (pmd_none(pmd[pmdidx]))
continue;
- flush |= (*func)(pmd_page(*pmd), PT_PTE);
+ pte = pmd_page(pmd[pmdidx]);
+ flush |= (*func)(mm, pte, PT_PTE);
}
}
}
- flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
+out:
+ /* Do the top level last, so that the callbacks can use it as
+ a cue to do final things like tlb flushes. */
+ flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
return flush;
}
-static spinlock_t *lock_pte(struct page *page)
+static int xen_pgd_walk(struct mm_struct *mm,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
+{
+ return __xen_pgd_walk(mm, mm->pgd, func, limit);
+}
+
+/* If we're using split pte locks, then take the page's lock and
+ return a pointer to it. Otherwise return NULL. */
+static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
spinlock_t *ptl = NULL;
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
- ptl = __pte_lockptr(page);
- spin_lock(ptl);
+#if USE_SPLIT_PTE_PTLOCKS
+ ptl = ptlock_ptr(page);
+ spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
return ptl;
}
-static void do_unlock(void *v)
+static void xen_pte_unlock(void *v)
{
spinlock_t *ptl = v;
spin_unlock(ptl);
@@ -406,19 +812,18 @@ static void do_unlock(void *v)
static void xen_do_pin(unsigned level, unsigned long pfn)
{
- struct mmuext_op *op;
- struct multicall_space mcs;
+ struct mmuext_op op;
- mcs = __xen_mc_entry(sizeof(*op));
- op = mcs.args;
- op->cmd = level;
- op->arg1.mfn = pfn_to_mfn(pfn);
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ op.cmd = level;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+
+ xen_extend_mmuext_op(&op);
}
-static int pin_page(struct page *page, enum pt_level level)
+static int xen_pin_page(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
- unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+ unsigned pgfl = TestSetPagePinned(page);
int flush;
if (pgfl)
@@ -435,21 +840,40 @@ static int pin_page(struct page *page, enum pt_level level)
flush = 0;
+ /*
+ * We need to hold the pagetable lock between the time
+ * we make the pagetable RO and when we actually pin
+ * it. If we don't, then other users may come in and
+ * attempt to update the pagetable by writing it,
+ * which will fail because the memory is RO but not
+ * pinned, so Xen won't do the trap'n'emulate.
+ *
+ * If we're using split pte locks, we can't hold the
+ * entire pagetable's worth of locks during the
+ * traverse, because we may wrap the preempt count (8
+ * bits). The solution is to mark RO and pin each PTE
+ * page while holding the lock. This means the number
+ * of locks we end up holding is never more than a
+ * batch size (~32 entries, at present).
+ *
+ * If we're not using split pte locks, we needn't pin
+ * the PTE pages independently, because we're
+ * protected by the overall pagetable lock.
+ */
ptl = NULL;
if (level == PT_PTE)
- ptl = lock_pte(page);
+ ptl = xen_pte_lock(page, mm);
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL_RO),
level == PT_PGD ? UVMF_TLB_FLUSH : 0);
- if (level == PT_PTE)
+ if (ptl) {
xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
- if (ptl) {
/* Queue a deferred unlock for when this batch
is completed. */
- xen_mc_callback(do_unlock, ptl);
+ xen_mc_callback(xen_pte_unlock, ptl);
}
}
@@ -459,47 +883,96 @@ static int pin_page(struct page *page, enum pt_level level)
/* This is called just after a mm has been created, but it has not
been used yet. We need to make sure that its pagetable is all
read-only, and can be pinned. */
-void xen_pgd_pin(pgd_t *pgd)
+static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
- unsigned level;
+ trace_xen_mmu_pgd_pin(mm, pgd);
xen_mc_batch();
- if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
- /* re-enable interrupts for kmap_flush_unused */
+ if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
+ /* re-enable interrupts for flushing */
xen_mc_issue(0);
+
kmap_flush_unused();
+
xen_mc_batch();
}
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+ if (user_pgd) {
+ xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
+ xen_do_pin(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
+ }
+ }
+#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
- level = MMUEXT_PIN_L3_TABLE;
-#else
- level = MMUEXT_PIN_L2_TABLE;
+ /* Need to make sure unshared kernel PMD is pinnable */
+ xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
+ PT_PMD);
#endif
+ xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
+ xen_mc_issue(0);
+}
- xen_do_pin(level, PFN_DOWN(__pa(pgd)));
+static void xen_pgd_pin(struct mm_struct *mm)
+{
+ __xen_pgd_pin(mm, mm->pgd);
+}
- xen_mc_issue(0);
+/*
+ * On save, we need to pin all pagetables to make sure they get their
+ * mfns turned into pfns. Search the list for any unpinned pgds and pin
+ * them (unpinned pgds are not currently in use, probably because the
+ * process is under construction or destruction).
+ *
+ * Expected to be called in stop_machine() ("equivalent to taking
+ * every spinlock in the system"), so the locking doesn't really
+ * matter all that much.
+ */
+void xen_mm_pin_all(void)
+{
+ struct page *page;
+
+ spin_lock(&pgd_lock);
+
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!PagePinned(page)) {
+ __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
+ SetPageSavePinned(page);
+ }
+ }
+
+ spin_unlock(&pgd_lock);
}
-/* The init_mm pagetable is really pinned as soon as its created, but
- that's before we have page structures to store the bits. So do all
- the book-keeping now. */
-static __init int mark_pinned(struct page *page, enum pt_level level)
+/*
+ * The init_mm pagetable is really pinned as soon as its created, but
+ * that's before we have page structures to store the bits. So do all
+ * the book-keeping now.
+ */
+static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
SetPagePinned(page);
return 0;
}
-void __init xen_mark_init_mm_pinned(void)
+static void __init xen_mark_init_mm_pinned(void)
{
- pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+ xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
-static int unpin_page(struct page *page, enum pt_level level)
+static int xen_unpin_page(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
- unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
+ unsigned pgfl = TestClearPagePinned(page);
if (pgfl && !PageHighMem(page)) {
void *pt = lowmem_page_address(page);
@@ -507,10 +980,18 @@ static int unpin_page(struct page *page, enum pt_level level)
spinlock_t *ptl = NULL;
struct multicall_space mcs;
+ /*
+ * Do the converse to pin_page. If we're using split
+ * pte locks, we must be holding the lock for while
+ * the pte page is unpinned but still RO to prevent
+ * concurrent updates from seeing it in this
+ * partially-pinned state.
+ */
if (level == PT_PTE) {
- ptl = lock_pte(page);
+ ptl = xen_pte_lock(page, mm);
- xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+ if (ptl)
+ xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
}
mcs = __xen_mc_entry(0);
@@ -521,7 +1002,7 @@ static int unpin_page(struct page *page, enum pt_level level)
if (ptl) {
/* unlock when batch completed */
- xen_mc_callback(do_unlock, ptl);
+ xen_mc_callback(xen_pte_unlock, ptl);
}
}
@@ -529,28 +1010,74 @@ static int unpin_page(struct page *page, enum pt_level level)
}
/* Release a pagetables pages back as normal RW */
-static void xen_pgd_unpin(pgd_t *pgd)
+static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
+ trace_xen_mmu_pgd_unpin(mm, pgd);
+
xen_mc_batch();
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
- pgd_walk(pgd, unpin_page, TASK_SIZE);
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd) {
+ xen_do_pin(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
+ xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
+ }
+ }
+#endif
+
+#ifdef CONFIG_X86_PAE
+ /* Need to make sure unshared kernel PMD is unpinned */
+ xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
+ PT_PMD);
+#endif
+
+ __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
xen_mc_issue(0);
}
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+static void xen_pgd_unpin(struct mm_struct *mm)
+{
+ __xen_pgd_unpin(mm, mm->pgd);
+}
+
+/*
+ * On resume, undo any pinning done at save, so that the rest of the
+ * kernel doesn't see any unexpected pinned pagetables.
+ */
+void xen_mm_unpin_all(void)
+{
+ struct page *page;
+
+ spin_lock(&pgd_lock);
+
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (PageSavePinned(page)) {
+ BUG_ON(!PagePinned(page));
+ __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
+ ClearPageSavePinned(page);
+ }
+ }
+
+ spin_unlock(&pgd_lock);
+}
+
+static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
spin_lock(&next->page_table_lock);
- xen_pgd_pin(next->pgd);
+ xen_pgd_pin(next);
spin_unlock(&next->page_table_lock);
}
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
- xen_pgd_pin(mm->pgd);
+ xen_pgd_pin(mm);
spin_unlock(&mm->page_table_lock);
}
@@ -561,21 +1088,22 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
static void drop_other_mm_ref(void *info)
{
struct mm_struct *mm = info;
+ struct mm_struct *active_mm;
+
+ active_mm = this_cpu_read(cpu_tlbstate.active_mm);
- if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+ if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+ if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
load_cr3(swapper_pg_dir);
- arch_flush_lazy_cpu_mode();
- }
}
-static void drop_mm_ref(struct mm_struct *mm)
+static void xen_drop_mm_ref(struct mm_struct *mm)
{
- cpumask_t mask;
+ cpumask_var_t mask;
unsigned cpu;
if (current->active_mm == mm) {
@@ -583,11 +1111,19 @@ static void drop_mm_ref(struct mm_struct *mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
- arch_flush_lazy_cpu_mode();
}
/* Get the "official" set of cpus referring to our pagetable. */
- mask = mm->cpu_vm_mask;
+ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+ for_each_online_cpu(cpu) {
+ if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
+ && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+ continue;
+ smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+ }
+ return;
+ }
+ cpumask_copy(mask, mm_cpumask(mm));
/* It's possible that a vcpu may have a stale reference to our
cr3, because its in lazy mode, and it hasn't yet flushed
@@ -596,14 +1132,15 @@ static void drop_mm_ref(struct mm_struct *mm)
if needed. */
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
- cpu_set(cpu, mask);
+ cpumask_set_cpu(cpu, mask);
}
- if (!cpus_empty(mask))
- xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
+ if (!cpumask_empty(mask))
+ smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+ free_cpumask_var(mask);
}
#else
-static void drop_mm_ref(struct mm_struct *mm)
+static void xen_drop_mm_ref(struct mm_struct *mm)
{
if (current->active_mm == mm)
load_cr3(swapper_pg_dir);
@@ -624,17 +1161,1544 @@ static void drop_mm_ref(struct mm_struct *mm)
* pagetable because of lazy tlb flushing. This means we need need to
* switch all CPUs off this pagetable before we can unpin it.
*/
-void xen_exit_mmap(struct mm_struct *mm)
+static void xen_exit_mmap(struct mm_struct *mm)
{
get_cpu(); /* make sure we don't move around */
- drop_mm_ref(mm);
+ xen_drop_mm_ref(mm);
put_cpu();
spin_lock(&mm->page_table_lock);
/* pgd may not be pinned in the error exit path of execve */
- if (PagePinned(virt_to_page(mm->pgd)))
- xen_pgd_unpin(mm->pgd);
+ if (xen_page_pinned(mm->pgd))
+ xen_pgd_unpin(mm);
spin_unlock(&mm->page_table_lock);
}
+
+static void xen_post_allocator_init(void);
+
+#ifdef CONFIG_X86_64
+static void __init xen_cleanhighmap(unsigned long vaddr,
+ unsigned long vaddr_end)
+{
+ unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
+ pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
+
+ /* NOTE: The loop is more greedy than the cleanup_highmap variant.
+ * We include the PMD passed in on _both_ boundaries. */
+ for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
+ pmd++, vaddr += PMD_SIZE) {
+ if (pmd_none(*pmd))
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > kernel_end)
+ set_pmd(pmd, __pmd(0));
+ }
+ /* In case we did something silly, we should crash in this function
+ * instead of somewhere later and be confusing. */
+ xen_mc_flush();
+}
+static void __init xen_pagetable_p2m_copy(void)
+{
+ unsigned long size;
+ unsigned long addr;
+ unsigned long new_mfn_list;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+
+ new_mfn_list = xen_revector_p2m_tree();
+ /* No memory or already called. */
+ if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
+ return;
+
+ /* using __ka address and sticking INVALID_P2M_ENTRY! */
+ memset((void *)xen_start_info->mfn_list, 0xff, size);
+
+ /* We should be in __ka space. */
+ BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
+ addr = xen_start_info->mfn_list;
+ /* We roundup to the PMD, which means that if anybody at this stage is
+ * using the __ka address of xen_start_info or xen_start_info->shared_info
+ * they are in going to crash. Fortunatly we have already revectored
+ * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+ size = roundup(size, PMD_SIZE);
+ xen_cleanhighmap(addr, addr + size);
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ memblock_free(__pa(xen_start_info->mfn_list), size);
+ /* And revector! Bye bye old array */
+ xen_start_info->mfn_list = new_mfn_list;
+
+ /* At this stage, cleanup_highmap has already cleaned __ka space
+ * from _brk_limit way up to the max_pfn_mapped (which is the end of
+ * the ramdisk). We continue on, erasing PMD entries that point to page
+ * tables - do note that they are accessible at this stage via __va.
+ * For good measure we also round up to the PMD - which means that if
+ * anybody is using __ka address to the initial boot-stack - and try
+ * to use it - they are going to crash. The xen_start_info has been
+ * taken care of already in xen_setup_kernel_pagetable. */
+ addr = xen_start_info->pt_base;
+ size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
+
+ xen_cleanhighmap(addr, addr + size);
+ xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
+#ifdef DEBUG
+ /* This is superflous and is not neccessary, but you know what
+ * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
+ * anything at this stage. */
+ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
+#endif
+}
+#endif
+
+static void __init xen_pagetable_init(void)
+{
+ paging_init();
+ xen_setup_shared_info();
+#ifdef CONFIG_X86_64
+ xen_pagetable_p2m_copy();
+#endif
+ xen_post_allocator_init();
+}
+static void xen_write_cr2(unsigned long cr2)
+{
+ this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+ return this_cpu_read(xen_vcpu)->arch.cr2;
+}
+
+unsigned long xen_read_cr2_direct(void)
+{
+ return this_cpu_read(xen_vcpu_info.arch.cr2);
+}
+
+void xen_flush_tlb_all(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb_all(0);
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_ALL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+static void xen_flush_tlb(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb(0);
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb_single(addr);
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = MMUEXT_INVLPG_LOCAL;
+ op->arg1.linear_addr = addr & PAGE_MASK;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ struct {
+ struct mmuext_op op;
+#ifdef CONFIG_SMP
+ DECLARE_BITMAP(mask, num_processors);
+#else
+ DECLARE_BITMAP(mask, NR_CPUS);
+#endif
+ } *args;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
+
+ if (cpumask_empty(cpus))
+ return; /* nothing to do */
+
+ mcs = xen_mc_entry(sizeof(*args));
+ args = mcs.args;
+ args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+ /* Remove us, and any offline CPUS. */
+ cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+
+ args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
+ args->op.cmd = MMUEXT_INVLPG_MULTI;
+ args->op.arg1.linear_addr = start;
+ }
+
+ MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+ return this_cpu_read(xen_cr3);
+}
+
+static void set_current_cr3(void *v)
+{
+ this_cpu_write(xen_current_cr3, (unsigned long)v);
+}
+
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+ struct mmuext_op op;
+ unsigned long mfn;
+
+ trace_xen_mmu_write_cr3(kernel, cr3);
+
+ if (cr3)
+ mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ else
+ mfn = 0;
+
+ WARN_ON(mfn == 0 && kernel);
+
+ op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+ op.arg1.mfn = mfn;
+
+ xen_extend_mmuext_op(&op);
+
+ if (kernel) {
+ this_cpu_write(xen_cr3, cr3);
+
+ /* Update xen_current_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
+ }
+}
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ this_cpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+ if (user_pgd)
+ __xen_write_cr3(false, __pa(user_pgd));
+ else
+ __xen_write_cr3(false, 0);
+ }
+#endif
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * At the start of the day - when Xen launches a guest, it has already
+ * built pagetables for the guest. We diligently look over them
+ * in xen_setup_kernel_pagetable and graft as appropiate them in the
+ * init_level4_pgt and its friends. Then when we are happy we load
+ * the new init_level4_pgt - and continue on.
+ *
+ * The generic code starts (start_kernel) and 'init_mem_mapping' sets
+ * up the rest of the pagetables. When it has completed it loads the cr3.
+ * N.B. that baremetal would start at 'start_kernel' (and the early
+ * #PF handler would create bootstrap pagetables) - so we are running
+ * with the same assumptions as what to do when write_cr3 is executed
+ * at this point.
+ *
+ * Since there are no user-page tables at all, we have two variants
+ * of xen_write_cr3 - the early bootup (this one), and the late one
+ * (xen_write_cr3). The reason we have to do that is that in 64-bit
+ * the Linux kernel and user-space are both in ring 3 while the
+ * hypervisor is in ring 0.
+ */
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ this_cpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+}
+#endif
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = mm->pgd;
+ int ret = 0;
+
+ BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+ {
+ struct page *page = virt_to_page(pgd);
+ pgd_t *user_pgd;
+
+ BUG_ON(page->private != 0);
+
+ ret = -ENOMEM;
+
+ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ page->private = (unsigned long)user_pgd;
+
+ if (user_pgd != NULL) {
+ user_pgd[pgd_index(VSYSCALL_ADDR)] =
+ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+ ret = 0;
+ }
+
+ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+ }
+#endif
+
+ return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd)
+ free_page((unsigned long)user_pgd);
+#endif
+}
+
+#ifdef CONFIG_X86_32
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+ if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
+
+ return pte;
+}
+#else /* CONFIG_X86_64 */
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ return pte;
+}
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive. At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+ if (pte_mfn(pte) != INVALID_P2M_ENTRY)
+ pte = mask_rw_pte(ptep, pte);
+ else
+ pte = __pte_ma(0);
+
+ native_set_pte(ptep, pte);
+}
+
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
+/* Early in boot, while setting up the initial pagetable, assume
+ everything is pinned. */
+static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+
+/* Used for pmd and pud */
+static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* Early release_pte assumes that all pts are pinned, since there's
+ only init_mm and anything attached to that is pinned. */
+static void __init xen_release_pte_init(unsigned long pfn)
+{
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void __init xen_release_pmd_init(unsigned long pfn)
+{
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *op;
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = cmd;
+ op->arg1.mfn = pfn_to_mfn(pfn);
+
+ MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+}
+
+static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
+{
+ struct multicall_space mcs;
+ unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
+
+ mcs = __xen_mc_entry(0);
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
+ pfn_pte(pfn, prot), 0);
+}
+
+/* This needs to make sure the new pte page is pinned iff its being
+ attached to a pinned pagetable. */
+static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
+ unsigned level)
+{
+ bool pinned = PagePinned(virt_to_page(mm->pgd));
+
+ trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
+
+ if (pinned) {
+ struct page *page = pfn_to_page(pfn);
+
+ SetPagePinned(page);
+
+ if (!PageHighMem(page)) {
+ xen_mc_batch();
+
+ __set_pfn_prot(pfn, PAGE_KERNEL_RO);
+
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ } else {
+ /* make sure there are no stray mappings of
+ this page */
+ kmap_flush_unused();
+ }
+ }
+}
+
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+/* This should never happen until we're OK to use struct page */
+static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+ struct page *page = pfn_to_page(pfn);
+ bool pinned = PagePinned(page);
+
+ trace_xen_mmu_release_ptpage(pfn, level, pinned);
+
+ if (pinned) {
+ if (!PageHighMem(page)) {
+ xen_mc_batch();
+
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+
+ __set_pfn_prot(pfn, PAGE_KERNEL);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ }
+ ClearPagePinned(page);
+ }
+}
+
+static void xen_release_pte(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pmd(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PMD);
+}
+
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+ unsigned long top = HYPERVISOR_VIRT_START;
+ struct xen_platform_parameters pp;
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+ top = pp.virt_start;
+
+ reserve_top_address(-top);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up.
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+ return (void *)(paddr + __START_KERNEL_map);
+#else
+ return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+ phys_addr_t paddr;
+
+ maddr &= PTE_PFN_MASK;
+ paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+ return __ka(m2p(maddr));
+}
+
+/* Set the page permissions on an identity-mapped pages */
+static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
+{
+ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+ pte_t pte = pfn_pte(pfn, prot);
+
+ /* For PVH no need to set R/O or R/W to pin them or unpin them. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
+ BUG();
+}
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+ return set_page_prot_flags(addr, prot, UVMF_NONE);
+}
+#ifdef CONFIG_X86_32
+static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+ unsigned pmdidx, pteidx;
+ unsigned ident_pte;
+ unsigned long pfn;
+
+ level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
+ PAGE_SIZE);
+
+ ident_pte = 0;
+ pfn = 0;
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+ pte_t *pte_page;
+
+ /* Reuse or allocate a page of ptes */
+ if (pmd_present(pmd[pmdidx]))
+ pte_page = m2v(pmd[pmdidx].pmd);
+ else {
+ /* Check for free pte pages */
+ if (ident_pte == LEVEL1_IDENT_ENTRIES)
+ break;
+
+ pte_page = &level1_ident_pgt[ident_pte];
+ ident_pte += PTRS_PER_PTE;
+
+ pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+ }
+
+ /* Install mappings */
+ for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+ pte_t pte;
+
+#ifdef CONFIG_X86_32
+ if (pfn > max_pfn_mapped)
+ max_pfn_mapped = pfn;
+#endif
+
+ if (!pte_none(pte_page[pteidx]))
+ continue;
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+ pte_page[pteidx] = pte;
+ }
+ }
+
+ for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+ set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+ set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+#endif
+void __init xen_setup_machphys_mapping(void)
+{
+ struct xen_machphys_mapping mapping;
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr = mapping.max_mfn + 1;
+ } else {
+ machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
+ }
+#ifdef CONFIG_X86_32
+ WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
+ < machine_to_phys_mapping);
+#endif
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+ pte_t *pte = v;
+ int i;
+
+ /* All levels are converted the same way, so just treat them
+ as ptes. */
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ pte[i] = xen_make_pte(pte[i].pte);
+}
+static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
+ unsigned long addr)
+{
+ if (*pt_base == PFN_DOWN(__pa(addr))) {
+ set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
+ clear_page((void *)addr);
+ (*pt_base)++;
+ }
+ if (*pt_end == PFN_DOWN(__pa(addr))) {
+ set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
+ clear_page((void *)addr);
+ (*pt_end)--;
+ }
+}
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables. We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working. We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ * NOTE: for PVH, the page tables are native.
+ */
+void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+ pud_t *l3;
+ pmd_t *l2;
+ unsigned long addr[3];
+ unsigned long pt_base, pt_end;
+ unsigned i;
+
+ /* max_pfn_mapped is the last pfn mapped in the initial memory
+ * mappings. Considering that on Xen after the kernel mappings we
+ * have the mappings of some pages that don't exist in pfn space, we
+ * set max_pfn_mapped to the last real pfn mapped. */
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+
+ pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
+ pt_end = pt_base + xen_start_info->nr_pt_frames;
+
+ /* Zap identity mapping */
+ init_level4_pgt[0] = __pgd(0);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ /* L4[272] -> level3_ident_pgt
+ * L4[511] -> level3_kernel_pgt */
+ convert_pfn_mfn(init_level4_pgt);
+
+ /* L3_i[0] -> level2_ident_pgt */
+ convert_pfn_mfn(level3_ident_pgt);
+ /* L3_k[510] -> level2_kernel_pgt
+ * L3_i[511] -> level2_fixmap_pgt */
+ convert_pfn_mfn(level3_kernel_pgt);
+ }
+ /* We get [511][511] and have Xen's version of level2_kernel_pgt */
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+ addr[0] = (unsigned long)pgd;
+ addr[1] = (unsigned long)l3;
+ addr[2] = (unsigned long)l2;
+ /* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
+ * Both L4[272][0] and L4[511][511] have entries that point to the same
+ * L2 (PMD) tables. Meaning that if you modify it in __va space
+ * it will be also modified in the __ka space! (But if you just
+ * modify the PMD table to point to other PTE's or none, then you
+ * are OK - which is what cleanup_highmap does) */
+ copy_page(level2_ident_pgt, l2);
+ /* Graft it onto L4[511][511] */
+ copy_page(level2_kernel_pgt, l2);
+
+ /* Get [511][510] and graft that in level2_fixmap_pgt */
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+ copy_page(level2_fixmap_pgt, l2);
+ /* Note that we don't do anything with level1_fixmap_pgt which
+ * we don't need. */
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Make pagetable pieces RO */
+ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ /*
+ * At this stage there can be no user pgd, and no page
+ * structure to attach it to, so make sure we just set kernel
+ * pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(init_level4_pgt));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+ } else
+ native_write_cr3(__pa(init_level4_pgt));
+
+ /* We can't that easily rip out L3 and L2, as the Xen pagetables are
+ * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
+ * the initial domain. For guests using the toolstack, they are in:
+ * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
+ * rip out the [L4] (pgd), but for guests we shave off three pages.
+ */
+ for (i = 0; i < ARRAY_SIZE(addr); i++)
+ check_pt_base(&pt_base, &pt_end, addr[i]);
+
+ /* Our (by three pages) smaller Xen pagetable that we are using */
+ memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+ /* Revector the xen_start_info */
+ xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
+}
+#else /* !CONFIG_X86_64 */
+static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
+
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+ unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
+
+ BUG_ON(read_cr3() != __pa(initial_page_table));
+ BUG_ON(cr3 != __pa(swapper_pg_dir));
+
+ /*
+ * We are switching to swapper_pg_dir for the first time (from
+ * initial_page_table) and therefore need to mark that page
+ * read-only and then pin it.
+ *
+ * Xen disallows sharing of kernel PMDs for PAE
+ * guests. Therefore we must copy the kernel PMD from
+ * initial_page_table into a new kernel PMD to be used in
+ * swapper_pg_dir.
+ */
+ swapper_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+ copy_page(swapper_kernel_pmd, initial_kernel_pmd);
+ swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
+ set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
+
+ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ xen_write_cr3(cr3);
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ set_page_prot(initial_page_table, PAGE_KERNEL);
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
+
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+}
+
+void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+ pmd_t *kernel_pmd;
+
+ initial_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+ xen_start_info->nr_pt_frames * PAGE_SIZE +
+ 512*1024);
+
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+ copy_page(initial_kernel_pmd, kernel_pmd);
+
+ xen_map_identity_early(initial_kernel_pmd, max_pfn);
+
+ copy_page(initial_page_table, pgd);
+ initial_page_table[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
+
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
+ set_page_prot(initial_page_table, PAGE_KERNEL_RO);
+ set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ xen_write_cr3(__pa(initial_page_table));
+
+ memblock_reserve(__pa(xen_start_info->pt_base),
+ xen_start_info->nr_pt_frames * PAGE_SIZE);
+}
+#endif /* CONFIG_X86_64 */
+
+static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
+
+static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+{
+ pte_t pte;
+
+ phys >>= PAGE_SHIFT;
+
+ switch (idx) {
+ case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+ case FIX_RO_IDT:
+#ifdef CONFIG_X86_32
+ case FIX_WP_TEST:
+# ifdef CONFIG_HIGHMEM
+ case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+ case VSYSCALL_PAGE:
+#endif
+ case FIX_TEXT_POKE0:
+ case FIX_TEXT_POKE1:
+ /* All local page mappings */
+ pte = pfn_pte(phys, prot);
+ break;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ case FIX_APIC_BASE: /* maps dummy local APIC */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
+ /*
+ * We just don't map the IO APIC - all access is via
+ * hypercalls. Keep the address in the pte for reference.
+ */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+ case FIX_PARAVIRT_BOOTMAP:
+ /* This is an MFN, but it isn't an IO mapping from the
+ IO domain */
+ pte = mfn_pte(phys, prot);
+ break;
+
+ default:
+ /* By default, set_fixmap is used for hardware mappings */
+ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+ break;
+ }
+
+ __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+ /* Replicate changes to map the vsyscall page into the user
+ pagetable vsyscall mapping. */
+ if (idx == VSYSCALL_PAGE) {
+ unsigned long vaddr = __fix_to_virt(idx);
+ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+ }
+#endif
+}
+
+static void __init xen_post_allocator_init(void)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ pv_mmu_ops.set_pte = xen_set_pte;
+ pv_mmu_ops.set_pmd = xen_set_pmd;
+ pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ pv_mmu_ops.alloc_pte = xen_alloc_pte;
+ pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+ pv_mmu_ops.release_pte = xen_release_pte;
+ pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.alloc_pud = xen_alloc_pud;
+ pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+
+#ifdef CONFIG_X86_64
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+ xen_mark_init_mm_pinned();
+}
+
+static void xen_leave_lazy_mmu(void)
+{
+ preempt_disable();
+ xen_mc_flush();
+ paravirt_leave_lazy_mmu();
+ preempt_enable();
+}
+
+static const struct pv_mmu_ops xen_mmu_ops __initconst = {
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3_init,
+
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_single = xen_flush_tlb_single,
+ .flush_tlb_others = xen_flush_tlb_others,
+
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,
+
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
+
+ .alloc_pte = xen_alloc_pte_init,
+ .release_pte = xen_release_pte_init,
+ .alloc_pmd = xen_alloc_pmd_init,
+ .release_pmd = xen_release_pmd_init,
+
+ .set_pte = xen_set_pte_init,
+ .set_pte_at = xen_set_pte_at,
+ .set_pmd = xen_set_pmd_hyper,
+
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+ .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+ .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+
+ .make_pte = PV_CALLEE_SAVE(xen_make_pte),
+ .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = xen_set_pte_atomic,
+ .pte_clear = xen_pte_clear,
+ .pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+ .set_pud = xen_set_pud_hyper,
+
+ .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+ .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+
+#if PAGETABLE_LEVELS == 4
+ .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+ .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+ .set_pgd = xen_set_pgd_hyper,
+
+ .alloc_pud = xen_alloc_pmd_init,
+ .release_pud = xen_release_pmd_init,
+#endif /* PAGETABLE_LEVELS == 4 */
+
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
+
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_mmu,
+ .leave = xen_leave_lazy_mmu,
+ .flush = paravirt_flush_lazy_mmu,
+ },
+
+ .set_fixmap = xen_set_fixmap,
+};
+
+void __init xen_init_mmu_ops(void)
+{
+ x86_init.paging.pagetable_init = xen_pagetable_init;
+
+ /* Optimization - we can use the HVM one but it has no idea which
+ * VCPUs are descheduled - which means that it will needlessly IPI
+ * them. Xen knows so let it do the job.
+ */
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+ return;
+ }
+ pv_mmu_ops = xen_mmu_ops;
+
+ memset(dummy_mapping, 0xff, PAGE_SIZE);
+}
+
+/* Protected by xen_reservation_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
+
+#define VOID_PTE (mfn_pte(0, __pgprot(0)))
+static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
+ unsigned long *in_frames,
+ unsigned long *out_frames)
+{
+ int i;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
+ mcs = __xen_mc_entry(0);
+
+ if (in_frames)
+ in_frames[i] = virt_to_mfn(vaddr);
+
+ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
+ __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+
+ if (out_frames)
+ out_frames[i] = virt_to_pfn(vaddr);
+ }
+ xen_mc_issue(0);
+}
+
+/*
+ * Update the pfn-to-mfn mappings for a virtual address range, either to
+ * point to an array of mfns, or contiguously from a single starting
+ * mfn.
+ */
+static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
+ unsigned long *mfns,
+ unsigned long first_mfn)
+{
+ unsigned i, limit;
+ unsigned long mfn;
+
+ xen_mc_batch();
+
+ limit = 1u << order;
+ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
+ struct multicall_space mcs;
+ unsigned flags;
+
+ mcs = __xen_mc_entry(0);
+ if (mfns)
+ mfn = mfns[i];
+ else
+ mfn = first_mfn + i;
+
+ if (i < (limit - 1))
+ flags = 0;
+ else {
+ if (order == 0)
+ flags = UVMF_INVLPG | UVMF_ALL;
+ else
+ flags = UVMF_TLB_FLUSH | UVMF_ALL;
+ }
+
+ MULTI_update_va_mapping(mcs.mc, vaddr,
+ mfn_pte(mfn, PAGE_KERNEL), flags);
+
+ set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ }
+
+ xen_mc_issue(0);
+}
+
+/*
+ * Perform the hypercall to exchange a region of our pfns to point to
+ * memory with the required contiguous alignment. Takes the pfns as
+ * input, and populates mfns as output.
+ *
+ * Returns a success code indicating whether the hypervisor was able to
+ * satisfy the request or not.
+ */
+static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
+ unsigned long *pfns_in,
+ unsigned long extents_out,
+ unsigned int order_out,
+ unsigned long *mfns_out,
+ unsigned int address_bits)
+{
+ long rc;
+ int success;
+
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = extents_in,
+ .extent_order = order_in,
+ .extent_start = pfns_in,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = extents_out,
+ .extent_order = order_out,
+ .extent_start = mfns_out,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ BUG_ON(extents_in << order_in != extents_out << order_out);
+
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == extents_in);
+
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+
+ return success;
+}
+
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+ unsigned int address_bits,
+ dma_addr_t *dma_handle)
+{
+ unsigned long *in_frames = discontig_frames, out_frame;
+ unsigned long flags;
+ int success;
+ unsigned long vstart = (unsigned long)phys_to_virt(pstart);
+
+ /*
+ * Currently an auto-translated guest will not perform I/O, nor will
+ * it require PAE page directories below 4GB. Therefore any calls to
+ * this function are redundant and can be ignored.
+ */
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Zap current PTEs, remembering MFNs. */
+ xen_zap_pfn_range(vstart, order, in_frames, NULL);
+
+ /* 2. Get a new contiguous memory extent. */
+ out_frame = virt_to_pfn(vstart);
+ success = xen_exchange_memory(1UL << order, 0, in_frames,
+ 1, order, &out_frame,
+ address_bits);
+
+ /* 3. Map the new extent in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
+ else
+ xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ *dma_handle = virt_to_machine(vstart).maddr;
+ return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
+
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
+{
+ unsigned long *out_frames = discontig_frames, in_frame;
+ unsigned long flags;
+ int success;
+ unsigned long vstart;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return;
+
+ vstart = (unsigned long)phys_to_virt(pstart);
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Find start MFN of contiguous extent. */
+ in_frame = virt_to_mfn(vstart);
+
+ /* 2. Zap current PTEs. */
+ xen_zap_pfn_range(vstart, order, NULL, out_frames);
+
+ /* 3. Do the exchange for non-contiguous MFNs. */
+ success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
+ 0, out_frames, 0);
+
+ /* 4. Map new pages in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
+ else
+ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+#ifdef CONFIG_XEN_PVHVM
+#ifdef CONFIG_PROC_VMCORE
+/*
+ * This function is used in two contexts:
+ * - the kdump kernel has to check whether a pfn of the crashed kernel
+ * was a ballooned page. vmcore is using this function to decide
+ * whether to access a pfn of the crashed kernel.
+ * - the kexec kernel has to check whether a pfn was ballooned by the
+ * previous kernel. If the pfn is ballooned, handle it properly.
+ * Returns 0 if the pfn is not backed by a RAM page, the caller may
+ * handle the pfn special in this case.
+ */
+static int xen_oldmem_pfn_is_ram(unsigned long pfn)
+{
+ struct xen_hvm_get_mem_type a = {
+ .domid = DOMID_SELF,
+ .pfn = pfn,
+ };
+ int ram;
+
+ if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
+ return -ENXIO;
+
+ switch (a.mem_type) {
+ case HVMMEM_mmio_dm:
+ ram = 0;
+ break;
+ case HVMMEM_ram_rw:
+ case HVMMEM_ram_ro:
+ default:
+ ram = 1;
+ break;
+ }
+
+ return ram;
+}
+#endif
+
+static void xen_hvm_exit_mmap(struct mm_struct *mm)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc;
+
+ a.domid = DOMID_SELF;
+ a.gpa = __pa(mm->pgd);
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ WARN_ON_ONCE(rc < 0);
+}
+
+static int is_pagetable_dying_supported(void)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc = 0;
+
+ a.domid = DOMID_SELF;
+ a.gpa = 0x00;
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ if (rc < 0) {
+ printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
+ return 0;
+ }
+ return 1;
+}
+
+void __init xen_hvm_init_mmu_ops(void)
+{
+ if (is_pagetable_dying_supported())
+ pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+#ifdef CONFIG_PROC_VMCORE
+ register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
+#endif
+}
+#endif
+
+#ifdef CONFIG_XEN_PVH
+/*
+ * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
+ * space creating new guest on pvh dom0 and needing to map domU pages.
+ */
+static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
+ unsigned int domid)
+{
+ int rc, err = 0;
+ xen_pfn_t gpfn = lpfn;
+ xen_ulong_t idx = fgfn;
+
+ struct xen_add_to_physmap_range xatp = {
+ .domid = DOMID_SELF,
+ .foreign_domid = domid,
+ .size = 1,
+ .space = XENMAPSPACE_gmfn_foreign,
+ };
+ set_xen_guest_handle(xatp.idxs, &idx);
+ set_xen_guest_handle(xatp.gpfns, &gpfn);
+ set_xen_guest_handle(xatp.errs, &err);
+
+ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+ if (rc < 0)
+ return rc;
+ return err;
+}
+
+static int xlate_remove_from_p2m(unsigned long spfn, int count)
+{
+ struct xen_remove_from_physmap xrp;
+ int i, rc;
+
+ for (i = 0; i < count; i++) {
+ xrp.domid = DOMID_SELF;
+ xrp.gpfn = spfn+i;
+ rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+ if (rc)
+ break;
+ }
+ return rc;
+}
+
+struct xlate_remap_data {
+ unsigned long fgfn; /* foreign domain's gfn */
+ pgprot_t prot;
+ domid_t domid;
+ int index;
+ struct page **pages;
+};
+
+static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+ void *data)
+{
+ int rc;
+ struct xlate_remap_data *remap = data;
+ unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
+ pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
+
+ rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
+ if (rc)
+ return rc;
+ native_set_pte(ptep, pteval);
+
+ return 0;
+}
+
+static int xlate_remap_gfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long mfn,
+ int nr, pgprot_t prot, unsigned domid,
+ struct page **pages)
+{
+ int err;
+ struct xlate_remap_data pvhdata;
+
+ BUG_ON(!pages);
+
+ pvhdata.fgfn = mfn;
+ pvhdata.prot = prot;
+ pvhdata.domid = domid;
+ pvhdata.index = 0;
+ pvhdata.pages = pages;
+ err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
+ xlate_map_pte_fn, &pvhdata);
+ flush_tlb_all();
+ return err;
+}
+#endif
+
+#define REMAP_BATCH_SIZE 16
+
+struct remap_data {
+ unsigned long mfn;
+ pgprot_t prot;
+ struct mmu_update *mmu_update;
+};
+
+static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct remap_data *rmd = data;
+ pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
+
+ rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
+ rmd->mmu_update->val = pte_val_ma(pte);
+ rmd->mmu_update++;
+
+ return 0;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t mfn, int nr,
+ pgprot_t prot, unsigned domid,
+ struct page **pages)
+
+{
+ struct remap_data rmd;
+ struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+ int batch;
+ unsigned long range;
+ int err = 0;
+
+ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef CONFIG_XEN_PVH
+ /* We need to update the local page tables and the xen HAP */
+ return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
+ domid, pages);
+#else
+ return -EINVAL;
+#endif
+ }
+
+ rmd.mfn = mfn;
+ rmd.prot = prot;
+
+ while (nr) {
+ batch = min(REMAP_BATCH_SIZE, nr);
+ range = (unsigned long)batch << PAGE_SHIFT;
+
+ rmd.mmu_update = mmu_update;
+ err = apply_to_page_range(vma->vm_mm, addr, range,
+ remap_area_mfn_pte_fn, &rmd);
+ if (err)
+ goto out;
+
+ err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
+ if (err < 0)
+ goto out;
+
+ nr -= batch;
+ addr += range;
+ }
+
+ err = 0;
+out:
+
+ xen_flush_tlb_all();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+
+/* Returns: 0 success */
+int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+ int numpgs, struct page **pages)
+{
+ if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+#ifdef CONFIG_XEN_PVH
+ while (numpgs--) {
+ /*
+ * The mmu has already cleaned up the process mmu
+ * resources at this point (lookup_address will return
+ * NULL).
+ */
+ unsigned long pfn = page_to_pfn(pages[numpgs]);
+
+ xlate_remove_from_p2m(pfn, 1);
+ }
+ /*
+ * We don't need to flush tlbs because as part of
+ * xlate_remove_from_p2m, the hypervisor will do tlb flushes
+ * after removing the p2m entries from the EPT/NPT
+ */
+ return 0;
+#else
+ return -EINVAL;
+#endif
+}
+EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index b5e189b1519..73809bb951b 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,58 +10,17 @@ enum pt_level {
PT_PTE
};
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-void xen_set_pte(pte_t *ptep, pte_t pteval);
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval);
-void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-void xen_exit_mmap(struct mm_struct *mm);
-
-void xen_pgd_pin(pgd_t *pgd);
-//void xen_pgd_unpin(pgd_t *pgd);
-
-#ifdef CONFIG_X86_PAE
-unsigned long long xen_pte_val(pte_t);
-unsigned long long xen_pmd_val(pmd_t);
-unsigned long long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long long);
-pmd_t xen_make_pmd(unsigned long long);
-pgd_t xen_make_pgd(unsigned long long);
-
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval);
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
-void xen_set_pud(pud_t *ptr, pud_t val);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
-
-
-#else
-unsigned long xen_pte_val(pte_t);
-unsigned long xen_pmd_val(pmd_t);
-unsigned long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long);
-pmd_t xen_make_pmd(unsigned long);
-pgd_t xen_make_pgd(unsigned long);
-#endif
+unsigned long xen_read_cr2_direct(void);
+extern void xen_init_mmu_ops(void);
+extern void xen_hvm_init_mmu_ops(void);
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 5e6f36f6d87..0d82003e76a 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,27 +21,32 @@
*/
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/debugfs.h>
#include <asm/xen/hypercall.h>
#include "multicalls.h"
-
-#define MC_DEBUG 1
+#include "debugfs.h"
#define MC_BATCH 32
-#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
+
+#define MC_DEBUG 0
+
+#define MC_ARGS (MC_BATCH * 16)
+
struct mc_buffer {
+ unsigned mcidx, argidx, cbidx;
struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
struct multicall_entry debug[MC_BATCH];
+ void *caller[MC_BATCH];
#endif
- u64 args[MC_ARGS];
+ unsigned char args[MC_ARGS];
struct callback {
void (*fn)(void *);
void *data;
} callbacks[MC_BATCH];
- unsigned mcidx, argidx, cbidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -50,6 +55,7 @@ DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct multicall_entry *mc;
int ret = 0;
unsigned long flags;
int i;
@@ -60,7 +66,26 @@ void xen_mc_flush(void)
something in the middle */
local_irq_save(flags);
- if (b->mcidx) {
+ trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
+
+ switch (b->mcidx) {
+ case 0:
+ /* no-op */
+ BUG_ON(b->argidx != 0);
+ break;
+
+ case 1:
+ /* Singleton multicall - bypass multicall machinery
+ and just do the call directly. */
+ mc = &b->entries[0];
+
+ mc->result = privcmd_call(mc->op,
+ mc->args[0], mc->args[1], mc->args[2],
+ mc->args[3], mc->args[4]);
+ ret = mc->result < 0;
+ break;
+
+ default:
#if MC_DEBUG
memcpy(b->debug, b->entries,
b->mcidx * sizeof(struct multicall_entry));
@@ -76,51 +101,92 @@ void xen_mc_flush(void)
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
- for(i = 0; i < b->mcidx; i++) {
- printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+ dump_stack();
+ for (i = 0; i < b->mcidx; i++) {
+ printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
- b->entries[i].result);
+ b->entries[i].result,
+ b->caller[i]);
}
}
#endif
+ }
- b->mcidx = 0;
- b->argidx = 0;
- } else
- BUG_ON(b->argidx != 0);
-
- local_irq_restore(flags);
+ b->mcidx = 0;
+ b->argidx = 0;
- for(i = 0; i < b->cbidx; i++) {
+ for (i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
(*cb->fn)(cb->data);
}
b->cbidx = 0;
- BUG_ON(ret);
+ local_irq_restore(flags);
+
+ WARN_ON(ret);
}
struct multicall_space __xen_mc_entry(size_t args)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_space ret;
- unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+ unsigned argidx = roundup(b->argidx, sizeof(u64));
+
+ trace_xen_mc_entry_alloc(args);
BUG_ON(preemptible());
- BUG_ON(argspace > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
- if (b->mcidx == MC_BATCH ||
- (b->argidx + argspace) > MC_ARGS)
+ if (unlikely(b->mcidx == MC_BATCH ||
+ (argidx + args) >= MC_ARGS)) {
+ trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
+ XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
xen_mc_flush();
+ argidx = roundup(b->argidx, sizeof(u64));
+ }
ret.mc = &b->entries[b->mcidx];
+#if MC_DEBUG
+ b->caller[b->mcidx] = __builtin_return_address(0);
+#endif
b->mcidx++;
+ ret.args = &b->args[argidx];
+ b->argidx = argidx + args;
+
+ BUG_ON(b->argidx >= MC_ARGS);
+ return ret;
+}
+
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct multicall_space ret = { NULL, NULL };
+
+ BUG_ON(preemptible());
+ BUG_ON(b->argidx >= MC_ARGS);
+
+ if (unlikely(b->mcidx == 0 ||
+ b->entries[b->mcidx - 1].op != op)) {
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
+ goto out;
+ }
+
+ if (unlikely((b->argidx + size) >= MC_ARGS)) {
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
+ goto out;
+ }
+
+ ret.mc = &b->entries[b->mcidx - 1];
ret.args = &b->args[b->argidx];
- b->argidx += argspace;
+ b->argidx += size;
+
+ BUG_ON(b->argidx >= MC_ARGS);
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
+out:
return ret;
}
@@ -129,8 +195,12 @@ void xen_mc_callback(void (*fn)(void *), void *data)
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct callback *cb;
- if (b->cbidx == MC_BATCH)
+ if (b->cbidx == MC_BATCH) {
+ trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
xen_mc_flush();
+ }
+
+ trace_xen_mc_callback(fn, data);
cb = &b->callbacks[b->cbidx++];
cb->fn = fn;
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 8bae996d99a..9c2e74f9096 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -1,6 +1,8 @@
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
+#include <trace/events/xen.h>
+
#include "xen-ops.h"
/* Multicalls */
@@ -19,8 +21,12 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
paired with xen_mc_issue() */
static inline void xen_mc_batch(void)
{
+ unsigned long flags;
+
/* need to disable interrupts until this entry is complete */
- local_irq_save(__get_cpu_var(xen_mc_irq_flags));
+ local_irq_save(flags);
+ trace_xen_mc_batch(paravirt_get_lazy_mode());
+ __this_cpu_write(xen_mc_irq_flags, flags);
}
static inline struct multicall_space xen_mc_entry(size_t args)
@@ -35,14 +41,28 @@ void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
+ trace_xen_mc_issue(mode);
+
if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
- local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+ local_irq_restore(this_cpu_read(xen_mc_irq_flags));
}
/* Set up a callback to be called when the current batch is flushed */
void xen_mc_callback(void (*fn)(void *), void *data);
+/*
+ * Try to extend the arguments of the previous multicall command. The
+ * previous command's op must match. If it does, then it attempts to
+ * extend the argument space allocated to the multicall entry by
+ * arg_size bytes.
+ *
+ * The returned multicall_space will return with mc pointing to the
+ * command on success, or NULL on failure, and args pointing to the
+ * newly allocated space.
+ */
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
+
#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 00000000000..9bb3d82ffec
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,1340 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top p2m_top_mfn
+ * / \ / \
+ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
+ * / \ / \ / /
+ * p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ *
+ * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
+ *
+ * However not all entries are filled with MFNs. Specifically for all other
+ * leaf entries, or for the top root, or middle one, for which there is a void
+ * entry, we assume it is "missing". So (for example)
+ * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
+ *
+ * We also have the possibility of setting 1-1 mappings on certain regions, so
+ * that:
+ * pfn_to_mfn(0xc0000)=0xc0000
+ *
+ * The benefit of this is, that we can assume for non-RAM regions (think
+ * PCI BARs, or ACPI spaces), we can create mappings easily because we
+ * get the PFN value to match the MFN.
+ *
+ * For this to work efficiently we have one new page p2m_identity and
+ * allocate (via reserved_brk) any other pages we need to cover the sides
+ * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
+ * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
+ * no other fancy value).
+ *
+ * On lookup we spot that the entry points to p2m_identity and return the
+ * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
+ * If the entry points to an allocated page, we just proceed as before and
+ * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
+ * appropriate functions (pfn_to_mfn).
+ *
+ * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
+ * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
+ * non-identity pfn. To protect ourselves against we elect to set (and get) the
+ * IDENTITY_FRAME_BIT on all identity mapped PFNs.
+ *
+ * This simplistic diagram is used to explain the more subtle piece of code.
+ * There is also a digram of the P2M at the end that can help.
+ * Imagine your E820 looking as so:
+ *
+ * 1GB 2GB 4GB
+ * /-------------------+---------\/----\ /----------\ /---+-----\
+ * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
+ * \-------------------+---------/\----/ \----------/ \---+-----/
+ * ^- 1029MB ^- 2001MB
+ *
+ * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
+ * 2048MB = 524288 (0x80000)]
+ *
+ * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
+ * is actually not present (would have to kick the balloon driver to put it in).
+ *
+ * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
+ * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
+ * of the PFN and the end PFN (263424 and 512256 respectively). The first step
+ * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
+ * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
+ * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
+ * required to split any existing p2m_mid_missing middle pages.
+ *
+ * With the E820 example above, 263424 is not 1GB aligned so we allocate a
+ * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
+ * Each entry in the allocate page is "missing" (points to p2m_missing).
+ *
+ * Next stage is to determine if we need to do a more granular boundary check
+ * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
+ * We check if the start pfn and end pfn violate that boundary check, and if
+ * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
+ * granularity of setting which PFNs are missing and which ones are identity.
+ * In our example 263424 and 512256 both fail the check so we reserve_brk two
+ * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
+ * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
+ *
+ * At this point we would at minimum reserve_brk one page, but could be up to
+ * three. Each call to set_phys_range_identity has at maximum a three page
+ * cost. If we were to query the P2M at this stage, all those entries from
+ * start PFN through end PFN (so 1029MB -> 2001MB) would return
+ * INVALID_P2M_ENTRY ("missing").
+ *
+ * The next step is to walk from the start pfn to the end pfn setting
+ * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
+ * If we find that the middle entry is pointing to p2m_missing we can swap it
+ * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
+ * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
+ * At this point we do not need to worry about boundary aligment (so no need to
+ * reserve_brk a middle page, figure out which PFNs are "missing" and which
+ * ones are identity), as that has been done earlier. If we find that the
+ * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
+ * that page (which covers 512 PFNs) and set the appropriate PFN with
+ * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
+ * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
+ * IDENTITY_FRAME_BIT set.
+ *
+ * All other regions that are void (or not filled) either point to p2m_missing
+ * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
+ * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
+ * contain the INVALID_P2M_ENTRY value and are considered "missing."
+ *
+ * Finally, the region beyond the end of of the E820 (4 GB in this example)
+ * is set to be identity (in case there are MMIO regions placed here).
+ *
+ * This is what the p2m ends up looking (for the E820 above) with this
+ * fabulous drawing:
+ *
+ * p2m /--------------\
+ * /-----\ | &mfn_list[0],| /-----------------\
+ * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
+ * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
+ * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
+ * |-----| \ | [p2m_identity]+\\ | .... |
+ * | 2 |--\ \-------------------->| ... | \\ \----------------/
+ * |-----| \ \---------------/ \\
+ * | 3 |-\ \ \\ p2m_identity [1]
+ * |-----| \ \-------------------->/---------------\ /-----------------\
+ * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
+ * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
+ * | | | .... | \-----------------/
+ * | | +-[x], ~0, ~0.. +\
+ * | | \---------------/ \
+ * | | \-> /---------------\
+ * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
+ * | /-----------------\ /------------\ | IDENTITY[@256]|
+ * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
+ * | | [p2m_missing] +---->| ..., ~0 | \---------------/
+ * | | ... | \------------/
+ * | \-----------------/
+ * |
+ * | p2m_mid_identity
+ * | /-----------------\
+ * \-->| [p2m_identity] +---->[1]
+ * | [p2m_identity] +---->[1]
+ * | ... |
+ * \-----------------/
+ *
+ * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+
+#include <asm/cache.h>
+#include <asm/setup.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/grant_table.h>
+
+#include "multicalls.h"
+#include "xen-ops.h"
+
+static void __init m2p_override_init(void);
+
+unsigned long xen_max_p2m_pfn __read_mostly;
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+
+/* We might hit two boundary violations at the start and end, at max each
+ * boundary violation will require three middle nodes. */
+RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
+
+/* When we populate back during bootup, the amount of pages can vary. The
+ * max we have is seen is 395979, but that does not mean it can't be more.
+ * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
+ * it can re-use Xen provided mfn_list array, so we only need to allocate at
+ * most three P2M top nodes. */
+RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+ return pfn % P2M_PER_PAGE;
+}
+
+static void p2m_top_init(unsigned long ***top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid, unsigned long *leaf)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = leaf;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(leaf);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ * tree should alreay be completely allocated.
+ */
+void __ref xen_build_mfn_list_list(void)
+{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
+
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise, mfn's all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
+ }
+
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned long **mid;
+ unsigned long *mid_mfn_p;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+
+ /* Don't bother allocating any mfn mid levels if
+ * they're just missing, just update the stored mfn,
+ * since all could have changed over a migrate.
+ */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ /*
+ * XXX boot-time only! We should never find
+ * missing parts of the mfn tree after
+ * runtime. extend_brk() will BUG if we call
+ * it too late.
+ */
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ }
+}
+
+void xen_setup_mfn_list_list(void)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+ unsigned long *mfn_list;
+ unsigned long max_pfn;
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ mfn_list = (unsigned long *)xen_start_info->mfn_list;
+ max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+ xen_max_p2m_pfn = max_pfn;
+
+ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_missing);
+ p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_identity);
+
+ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_missing, p2m_missing);
+ p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_identity, p2m_identity);
+
+ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_init(p2m_top);
+
+ /*
+ * The domain builder gives us a pre-constructed p2m array in
+ * mfn_list for all the pages initially given to us, so we just
+ * need to graft that into our tree structure.
+ */
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(mid, p2m_missing);
+
+ p2m_top[topidx] = mid;
+ }
+
+ /*
+ * As long as the mfn_list has enough entries to completely
+ * fill a p2m page, pointing into the array is ok. But if
+ * not the entries beyond the last pfn will be undefined.
+ */
+ if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
+ unsigned long p2midx;
+
+ p2midx = max_pfn % P2M_PER_PAGE;
+ for ( ; p2midx < P2M_PER_PAGE; p2midx++)
+ mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
+ }
+ p2m_top[topidx][mididx] = &mfn_list[pfn];
+ }
+
+ m2p_override_init();
+}
+#ifdef CONFIG_X86_64
+#include <linux/bootmem.h>
+unsigned long __init xen_revector_p2m_tree(void)
+{
+ unsigned long va_start;
+ unsigned long va_end;
+ unsigned long pfn;
+ unsigned long pfn_free = 0;
+ unsigned long *mfn_list = NULL;
+ unsigned long size;
+
+ va_start = xen_start_info->mfn_list;
+ /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long),
+ * so make sure it is rounded up to that */
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ va_end = va_start + size;
+
+ /* If we were revectored already, don't do it again. */
+ if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET)
+ return 0;
+
+ mfn_list = alloc_bootmem_align(size, PAGE_SIZE);
+ if (!mfn_list) {
+ pr_warn("Could not allocate space for a new P2M tree!\n");
+ return xen_start_info->mfn_list;
+ }
+ /* Fill it out with INVALID_P2M_ENTRY value */
+ memset(mfn_list, 0xFF, size);
+
+ for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx;
+ unsigned long *mid_p;
+
+ if (!p2m_top[topidx])
+ continue;
+
+ if (p2m_top[topidx] == p2m_mid_missing)
+ continue;
+
+ mididx = p2m_mid_index(pfn);
+ mid_p = p2m_top[topidx][mididx];
+ if (!mid_p)
+ continue;
+ if ((mid_p == p2m_missing) || (mid_p == p2m_identity))
+ continue;
+
+ if ((unsigned long)mid_p == INVALID_P2M_ENTRY)
+ continue;
+
+ /* The old va. Rebase it on mfn_list */
+ if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) {
+ unsigned long *new;
+
+ if (pfn_free > (size / sizeof(unsigned long))) {
+ WARN(1, "Only allocated for %ld pages, but we want %ld!\n",
+ size / sizeof(unsigned long), pfn_free);
+ return 0;
+ }
+ new = &mfn_list[pfn_free];
+
+ copy_page(new, mid_p);
+ p2m_top[topidx][mididx] = &mfn_list[pfn_free];
+ p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]);
+
+ pfn_free += P2M_PER_PAGE;
+
+ }
+ /* This should be the leafs allocated for identity from _brk. */
+ }
+ return (unsigned long)mfn_list;
+
+}
+#else
+unsigned long __init xen_revector_p2m_tree(void)
+{
+ return 0;
+}
+#endif
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN))
+ return IDENTITY_FRAME(pfn);
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /*
+ * The INVALID_P2M_ENTRY is filled in both p2m_*identity
+ * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
+ * would be wrong.
+ */
+ if (p2m_top[topidx][mididx] == p2m_identity)
+ return IDENTITY_FRAME(pfn);
+
+ return p2m_top[topidx][mididx][idx];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
+
+static void *alloc_p2m_page(void)
+{
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+static void free_p2m_page(void *p)
+{
+ free_page((unsigned long)p);
+}
+
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx;
+ unsigned long ***top_p, **mid;
+ unsigned long *top_mfn_p, *mid_mfn;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ top_p = &p2m_top[topidx];
+ mid = *top_p;
+
+ if (mid == p2m_mid_missing) {
+ /* Mid level is missing, allocate a new one */
+ mid = alloc_p2m_page();
+ if (!mid)
+ return false;
+
+ p2m_mid_init(mid, p2m_missing);
+
+ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+ free_p2m_page(mid);
+ }
+
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = p2m_top_mfn_p[topidx];
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
+
+ p2m_mid_mfn_init(mid_mfn, p2m_missing);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_identity ||
+ p2m_top[topidx][mididx] == p2m_missing) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+ unsigned long *p2m_orig = p2m_top[topidx][mididx];
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return false;
+
+ p2m_init(p2m);
+
+ if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
+ free_p2m_page(p2m);
+ else
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ }
+
+ return true;
+}
+
+static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
+{
+ unsigned topidx, mididx, idx;
+ unsigned long *p2m;
+ unsigned long *mid_mfn_p;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /* Pfff.. No boundary cross-over, lets get out. */
+ if (!idx && check_boundary)
+ return false;
+
+ WARN(p2m_top[topidx][mididx] == p2m_identity,
+ "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
+ topidx, mididx);
+
+ /*
+ * Could be done by xen_build_dynamic_phys_to_machine..
+ */
+ if (p2m_top[topidx][mididx] != p2m_missing)
+ return false;
+
+ /* Boundary cross-over for the edges: */
+ p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+ p2m_init(p2m);
+
+ p2m_top[topidx][mididx] = p2m;
+
+ /* For save/restore we need to MFN of the P2M saved */
+
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
+ "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
+ topidx, mididx);
+ mid_mfn_p[mididx] = virt_to_mfn(p2m);
+
+ return true;
+}
+
+static bool __init early_alloc_p2m_middle(unsigned long pfn)
+{
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned long *mid_mfn_p;
+ unsigned long **mid;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ if (mid == p2m_mid_missing) {
+ mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+ p2m_mid_init(mid, p2m_missing);
+
+ p2m_top[topidx] = mid;
+
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ }
+ /* And the save/restore P2M tables.. */
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ /* Note: we don't set mid_mfn_p[midix] here,
+ * look in early_alloc_p2m() */
+ }
+ return true;
+}
+
+/*
+ * Skim over the P2M tree looking at pages that are either filled with
+ * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
+ * replace the P2M leaf with a p2m_missing or p2m_identity.
+ * Stick the old page in the new P2M tree location.
+ */
+bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn)
+{
+ unsigned topidx;
+ unsigned mididx;
+ unsigned ident_pfns;
+ unsigned inv_pfns;
+ unsigned long *p2m;
+ unsigned long *mid_mfn_p;
+ unsigned idx;
+ unsigned long pfn;
+
+ /* We only look when this entails a P2M middle layer */
+ if (p2m_index(set_pfn))
+ return false;
+
+ for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
+ topidx = p2m_top_index(pfn);
+
+ if (!p2m_top[topidx])
+ continue;
+
+ if (p2m_top[topidx] == p2m_mid_missing)
+ continue;
+
+ mididx = p2m_mid_index(pfn);
+ p2m = p2m_top[topidx][mididx];
+ if (!p2m)
+ continue;
+
+ if ((p2m == p2m_missing) || (p2m == p2m_identity))
+ continue;
+
+ if ((unsigned long)p2m == INVALID_P2M_ENTRY)
+ continue;
+
+ ident_pfns = 0;
+ inv_pfns = 0;
+ for (idx = 0; idx < P2M_PER_PAGE; idx++) {
+ /* IDENTITY_PFNs are 1:1 */
+ if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
+ ident_pfns++;
+ else if (p2m[idx] == INVALID_P2M_ENTRY)
+ inv_pfns++;
+ else
+ break;
+ }
+ if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
+ goto found;
+ }
+ return false;
+found:
+ /* Found one, replace old with p2m_identity or p2m_missing */
+ p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
+ /* And the other for save/restore.. */
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ /* NOTE: Even if it is a p2m_identity it should still be point to
+ * a page filled with INVALID_P2M_ENTRY entries. */
+ mid_mfn_p[mididx] = virt_to_mfn(p2m_missing);
+
+ /* Reset where we want to stick the old page in. */
+ topidx = p2m_top_index(set_pfn);
+ mididx = p2m_mid_index(set_pfn);
+
+ /* This shouldn't happen */
+ if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
+ early_alloc_p2m_middle(set_pfn);
+
+ if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
+ return false;
+
+ p2m_init(p2m);
+ p2m_top[topidx][mididx] = p2m;
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ mid_mfn_p[mididx] = virt_to_mfn(p2m);
+
+ return true;
+}
+bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (!early_alloc_p2m_middle(pfn))
+ return false;
+
+ if (early_can_reuse_p2m_middle(pfn, mfn))
+ return __set_phys_to_machine(pfn, mfn);
+
+ if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
+ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ return false;
+ }
+
+ return true;
+}
+
+static void __init early_split_p2m(unsigned long pfn)
+{
+ unsigned long mididx, idx;
+
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /*
+ * Allocate new middle and leaf pages if this pfn lies in the
+ * middle of one.
+ */
+ if (mididx || idx)
+ early_alloc_p2m_middle(pfn);
+ if (idx)
+ early_alloc_p2m(pfn, false);
+}
+
+unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e)
+{
+ unsigned long pfn;
+
+ if (unlikely(pfn_s >= MAX_P2M_PFN))
+ return 0;
+
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+ return pfn_e - pfn_s;
+
+ if (pfn_s > pfn_e)
+ return 0;
+
+ if (pfn_e > MAX_P2M_PFN)
+ pfn_e = MAX_P2M_PFN;
+
+ early_split_p2m(pfn_s);
+ early_split_p2m(pfn_e);
+
+ for (pfn = pfn_s; pfn < pfn_e;) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+
+ if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
+ break;
+ pfn++;
+
+ /*
+ * If the PFN was set to a middle or leaf identity
+ * page the remainder must also be identity, so skip
+ * ahead to the next middle or leaf entry.
+ */
+ if (p2m_top[topidx] == p2m_mid_identity)
+ pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
+ else if (p2m_top[topidx][mididx] == p2m_identity)
+ pfn = ALIGN(pfn, P2M_PER_PAGE);
+ }
+
+ if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+ "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
+ (pfn_e - pfn_s) - (pfn - pfn_s)))
+ printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+
+ return pfn - pfn_s;
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ unsigned topidx, mididx, idx;
+
+ /* don't track P2M changes in autotranslate guests */
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+ return true;
+
+ if (unlikely(pfn >= MAX_P2M_PFN)) {
+ BUG_ON(mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /* For sparse holes were the p2m leaf has real PFN along with
+ * PCI holes, stick in the PFN as the MFN value.
+ *
+ * set_phys_range_identity() will have allocated new middle
+ * and leaf pages as required so an existing p2m_mid_missing
+ * or p2m_missing mean that whole range will be identity so
+ * these can be switched to p2m_mid_identity or p2m_identity.
+ */
+ if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+ if (p2m_top[topidx] == p2m_mid_identity)
+ return true;
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
+ p2m_mid_identity) != p2m_mid_missing);
+ return true;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_identity)
+ return true;
+
+ /* Swap over from MISSING to IDENTITY if needed. */
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
+ p2m_identity) != p2m_missing);
+ return true;
+ }
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_missing)
+ return mfn == INVALID_P2M_ENTRY;
+
+ p2m_top[topidx][mididx][idx] = mfn;
+
+ return true;
+}
+
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (!alloc_p2m(pfn))
+ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ return false;
+ }
+
+ return true;
+}
+
+#define M2P_OVERRIDE_HASH_SHIFT 10
+#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
+
+static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static DEFINE_SPINLOCK(m2p_override_lock);
+
+static void __init m2p_override_init(void)
+{
+ unsigned i;
+
+ m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+ sizeof(unsigned long));
+
+ for (i = 0; i < M2P_OVERRIDE_HASH; i++)
+ INIT_LIST_HEAD(&m2p_overrides[i]);
+}
+
+static unsigned long mfn_hash(unsigned long mfn)
+{
+ return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
+}
+
+int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret = 0;
+ bool lazy = false;
+ pte_t *pte;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (kmap_ops &&
+ !in_interrupt() &&
+ paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+ arch_enter_lazy_mmu_mode();
+ lazy = true;
+ }
+
+ for (i = 0; i < count; i++) {
+ unsigned long mfn, pfn;
+
+ /* Do not add to override if the map failed. */
+ if (map_ops[i].status)
+ continue;
+
+ if (map_ops[i].flags & GNTMAP_contains_pte) {
+ pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ mfn = pte_mfn(*pte);
+ } else {
+ mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
+ }
+ pfn = page_to_pfn(pages[i]);
+
+ WARN_ON(PagePrivate(pages[i]));
+ SetPagePrivate(pages[i]);
+ set_page_private(pages[i], mfn);
+ pages[i]->index = pfn_to_mfn(pfn);
+
+ if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (kmap_ops) {
+ ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
+ if (ret)
+ goto out;
+ }
+ }
+
+out:
+ if (lazy)
+ arch_leave_lazy_mmu_mode();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
+
+/* Add an MFN override for a particular page */
+int m2p_add_override(unsigned long mfn, struct page *page,
+ struct gnttab_map_grant_ref *kmap_op)
+{
+ unsigned long flags;
+ unsigned long pfn;
+ unsigned long uninitialized_var(address);
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_add_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ if (kmap_op != NULL) {
+ if (!PageHighMem(page)) {
+ struct multicall_space mcs =
+ xen_mc_entry(sizeof(*kmap_op));
+
+ MULTI_grant_table_op(mcs.mc,
+ GNTTABOP_map_grant_ref, kmap_op, 1);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ }
+ }
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
+ * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
+ * pfn so that the following mfn_to_pfn(mfn) calls will return the
+ * pfn from the m2p_override (the backend pfn) instead.
+ * We need to do this because the pages shared by the frontend
+ * (xen-blkfront) can be already locked (lock_page, called by
+ * do_read_cache_page); when the userspace backend tries to use them
+ * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
+ * do_blockdev_direct_IO is going to try to lock the same pages
+ * again resulting in a deadlock.
+ * As a side effect get_user_pages_fast might not be safe on the
+ * frontend pages while they are being shared with the backend,
+ * because mfn_to_pfn (that ends up being called by GUPF) will
+ * return the backend pfn rather than the frontend pfn. */
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) == mfn)
+ set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(m2p_add_override);
+
+int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret = 0;
+ bool lazy = false;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (kmap_ops &&
+ !in_interrupt() &&
+ paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+ arch_enter_lazy_mmu_mode();
+ lazy = true;
+ }
+
+ for (i = 0; i < count; i++) {
+ unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i]));
+ unsigned long pfn = page_to_pfn(pages[i]);
+
+ if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ set_page_private(pages[i], INVALID_P2M_ENTRY);
+ WARN_ON(!PagePrivate(pages[i]));
+ ClearPagePrivate(pages[i]);
+ set_phys_to_machine(pfn, pages[i]->index);
+
+ if (kmap_ops)
+ ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
+ if (ret)
+ goto out;
+ }
+
+out:
+ if (lazy)
+ arch_leave_lazy_mmu_mode();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
+
+int m2p_remove_override(struct page *page,
+ struct gnttab_map_grant_ref *kmap_op,
+ unsigned long mfn)
+{
+ unsigned long flags;
+ unsigned long pfn;
+ unsigned long uninitialized_var(address);
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_remove_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_del(&page->lru);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ if (kmap_op != NULL) {
+ if (!PageHighMem(page)) {
+ struct multicall_space mcs;
+ struct gnttab_unmap_and_replace *unmap_op;
+ struct page *scratch_page = get_balloon_scratch_page();
+ unsigned long scratch_page_address = (unsigned long)
+ __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
+
+ /*
+ * It might be that we queued all the m2p grant table
+ * hypercalls in a multicall, then m2p_remove_override
+ * get called before the multicall has actually been
+ * issued. In this case handle is going to -1 because
+ * it hasn't been modified yet.
+ */
+ if (kmap_op->handle == -1)
+ xen_mc_flush();
+ /*
+ * Now if kmap_op->handle is negative it means that the
+ * hypercall actually returned an error.
+ */
+ if (kmap_op->handle == GNTST_general_error) {
+ printk(KERN_WARNING "m2p_remove_override: "
+ "pfn %lx mfn %lx, failed to modify kernel mappings",
+ pfn, mfn);
+ put_balloon_scratch_page();
+ return -1;
+ }
+
+ xen_mc_batch();
+
+ mcs = __xen_mc_entry(
+ sizeof(struct gnttab_unmap_and_replace));
+ unmap_op = mcs.args;
+ unmap_op->host_addr = kmap_op->host_addr;
+ unmap_op->new_addr = scratch_page_address;
+ unmap_op->handle = kmap_op->handle;
+
+ MULTI_grant_table_op(mcs.mc,
+ GNTTABOP_unmap_and_replace, unmap_op, 1);
+
+ mcs = __xen_mc_entry(0);
+ MULTI_update_va_mapping(mcs.mc, scratch_page_address,
+ pfn_pte(page_to_pfn(scratch_page),
+ PAGE_KERNEL_RO), 0);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ kmap_op->host_addr = 0;
+ put_balloon_scratch_page();
+ }
+ }
+
+ /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
+ * somewhere in this domain, even before being added to the
+ * m2p_override (see comment above in m2p_add_override).
+ * If there are no other entries in the m2p_override corresponding
+ * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
+ * the original pfn (the one shared by the frontend): the backend
+ * cannot do any IO on this page anymore because it has been
+ * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
+ * the original pfn causes mfn_to_pfn(mfn) to return the frontend
+ * pfn again. */
+ mfn &= ~FOREIGN_FRAME_BIT;
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
+ m2p_find_override(mfn) == NULL)
+ set_phys_to_machine(pfn, mfn);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(m2p_remove_override);
+
+struct page *m2p_find_override(unsigned long mfn)
+{
+ unsigned long flags;
+ struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
+ struct page *p, *ret;
+
+ ret = NULL;
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+
+ list_for_each_entry(p, bucket, lru) {
+ if (page_private(p) == mfn) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return ret;
+}
+
+unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
+{
+ struct page *p = m2p_find_override(mfn);
+ unsigned long ret = pfn;
+
+ if (p)
+ ret = page_to_pfn(p);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+#include <linux/debugfs.h>
+#include "debugfs.h"
+static int p2m_dump_show(struct seq_file *m, void *v)
+{
+ static const char * const level_name[] = { "top", "middle",
+ "entry", "abnormal", "error"};
+#define TYPE_IDENTITY 0
+#define TYPE_MISSING 1
+#define TYPE_PFN 2
+#define TYPE_UNKNOWN 3
+ static const char * const type_name[] = {
+ [TYPE_IDENTITY] = "identity",
+ [TYPE_MISSING] = "missing",
+ [TYPE_PFN] = "pfn",
+ [TYPE_UNKNOWN] = "abnormal"};
+ unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
+ unsigned int uninitialized_var(prev_level);
+ unsigned int uninitialized_var(prev_type);
+
+ if (!p2m_top)
+ return 0;
+
+ for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned idx = p2m_index(pfn);
+ unsigned lvl, type;
+
+ lvl = 4;
+ type = TYPE_UNKNOWN;
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ lvl = 0; type = TYPE_MISSING;
+ } else if (p2m_top[topidx] == NULL) {
+ lvl = 0; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx] == NULL) {
+ lvl = 1; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx] == p2m_identity) {
+ lvl = 1; type = TYPE_IDENTITY;
+ } else if (p2m_top[topidx][mididx] == p2m_missing) {
+ lvl = 1; type = TYPE_MISSING;
+ } else if (p2m_top[topidx][mididx][idx] == 0) {
+ lvl = 2; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
+ lvl = 2; type = TYPE_IDENTITY;
+ } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
+ lvl = 2; type = TYPE_MISSING;
+ } else if (p2m_top[topidx][mididx][idx] == pfn) {
+ lvl = 2; type = TYPE_PFN;
+ } else if (p2m_top[topidx][mididx][idx] != pfn) {
+ lvl = 2; type = TYPE_PFN;
+ }
+ if (pfn == 0) {
+ prev_level = lvl;
+ prev_type = type;
+ }
+ if (pfn == MAX_DOMAIN_PAGES-1) {
+ lvl = 3;
+ type = TYPE_UNKNOWN;
+ }
+ if (prev_type != type) {
+ seq_printf(m, " [0x%lx->0x%lx] %s\n",
+ prev_pfn_type, pfn, type_name[prev_type]);
+ prev_pfn_type = pfn;
+ prev_type = type;
+ }
+ if (prev_level != lvl) {
+ seq_printf(m, " [0x%lx->0x%lx] level %s\n",
+ prev_pfn_level, pfn, level_name[prev_level]);
+ prev_pfn_level = pfn;
+ prev_level = lvl;
+ }
+ }
+ return 0;
+#undef TYPE_IDENTITY
+#undef TYPE_MISSING
+#undef TYPE_PFN
+#undef TYPE_UNKNOWN
+}
+
+static int p2m_dump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, p2m_dump_show, NULL);
+}
+
+static const struct file_operations p2m_dump_fops = {
+ .open = p2m_dump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *d_mmu_debug;
+
+static int __init xen_p2m_debugfs(void)
+{
+ struct dentry *d_xen = xen_init_debugfs();
+
+ if (d_xen == NULL)
+ return -ENOMEM;
+
+ d_mmu_debug = debugfs_create_dir("mmu", d_xen);
+
+ debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
+ return 0;
+}
+fs_initcall(xen_p2m_debugfs);
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 00000000000..0e98e5d241d
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,109 @@
+/* Glue code to lib/swiotlb-xen.c */
+
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <xen/swiotlb-xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+#include <asm/iommu_table.h>
+
+
+#include <asm/xen/swiotlb-xen.h>
+#ifdef CONFIG_X86_64
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#endif
+#include <linux/export.h>
+
+int xen_swiotlb __read_mostly;
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+ .mapping_error = xen_swiotlb_dma_mapping_error,
+ .alloc = xen_swiotlb_alloc_coherent,
+ .free = xen_swiotlb_free_coherent,
+ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = xen_swiotlb_sync_single_for_device,
+ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
+ .map_sg = xen_swiotlb_map_sg_attrs,
+ .unmap_sg = xen_swiotlb_unmap_sg_attrs,
+ .map_page = xen_swiotlb_map_page,
+ .unmap_page = xen_swiotlb_unmap_page,
+ .dma_supported = xen_swiotlb_dma_supported,
+};
+
+/*
+ * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use xen_swiotlb (by the boot
+ * option).
+ */
+int __init pci_xen_swiotlb_detect(void)
+{
+
+ if (!xen_pv_domain())
+ return 0;
+
+ /* If running as PV guest, either iommu=soft, or swiotlb=force will
+ * activate this IOMMU. If running as PV privileged, activate it
+ * irregardless.
+ */
+ if ((xen_initial_domain() || swiotlb || swiotlb_force))
+ xen_swiotlb = 1;
+
+ /* If we are running under Xen, we MUST disable the native SWIOTLB.
+ * Don't worry about swiotlb_force flag activating the native, as
+ * the 'swiotlb' flag is the only one turning it on. */
+ swiotlb = 0;
+
+#ifdef CONFIG_X86_64
+ /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0
+ * (so no iommu=X command line over-writes).
+ * Considering that PV guests do not want the *native SWIOTLB* but
+ * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here.
+ */
+ if (max_pfn > MAX_DMA32_PFN)
+ no_iommu = 1;
+#endif
+ return xen_swiotlb;
+}
+
+void __init pci_xen_swiotlb_init(void)
+{
+ if (xen_swiotlb) {
+ xen_swiotlb_init(1, true /* early */);
+ dma_ops = &xen_swiotlb_dma_ops;
+
+#ifdef CONFIG_PCI
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+#endif
+ }
+}
+
+int pci_xen_swiotlb_init_late(void)
+{
+ int rc;
+
+ if (xen_swiotlb)
+ return 0;
+
+ rc = xen_swiotlb_init(1, false /* late */);
+ if (rc)
+ return rc;
+
+ dma_ops = &xen_swiotlb_dma_ops;
+#ifdef CONFIG_PCI
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
+
+IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
+ NULL,
+ pci_xen_swiotlb_init,
+ NULL);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 00000000000..a8261716d58
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,217 @@
+/******************************************************************************
+ * platform-pci-unplug.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <xen/platform_pci.h>
+#include "xen-ops.h"
+
+#define XEN_PLATFORM_ERR_MAGIC -1
+#define XEN_PLATFORM_ERR_PROTOCOL -2
+#define XEN_PLATFORM_ERR_BLACKLIST -3
+
+#ifdef CONFIG_XEN_PVHVM
+/* store the value of xen_emul_unplug after the unplug is done */
+static int xen_platform_pci_unplug;
+static int xen_emul_unplug;
+
+static int check_platform_magic(void)
+{
+ short magic;
+ char protocol;
+
+ magic = inw(XEN_IOPORT_MAGIC);
+ if (magic != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+ return XEN_PLATFORM_ERR_MAGIC;
+ }
+
+ protocol = inb(XEN_IOPORT_PROTOVER);
+
+ printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+ protocol);
+
+ switch (protocol) {
+ case 1:
+ outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
+ outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
+ if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+ return XEN_PLATFORM_ERR_BLACKLIST;
+ }
+ break;
+ default:
+ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+ return XEN_PLATFORM_ERR_PROTOCOL;
+ }
+
+ return 0;
+}
+
+bool xen_has_pv_devices()
+{
+ if (!xen_domain())
+ return false;
+
+ /* PV domains always have them. */
+ if (xen_pv_domain())
+ return true;
+
+ /* And user has xen_platform_pci=0 set in guest config as
+ * driver did not modify the value. */
+ if (xen_platform_pci_unplug == 0)
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
+ return true;
+
+ /* This is an odd one - we are going to run legacy
+ * and PV drivers at the same time. */
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+ return true;
+
+ /* And the caller has to follow with xen_pv_{disk,nic}_devices
+ * to be certain which driver can load. */
+ return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_devices);
+
+static bool __xen_has_pv_device(int state)
+{
+ /* HVM domains might or might not */
+ if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
+ return true;
+
+ return xen_has_pv_devices();
+}
+
+bool xen_has_pv_nic_devices(void)
+{
+ return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
+
+bool xen_has_pv_disk_devices(void)
+{
+ return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
+ XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
+
+/*
+ * This one is odd - it determines whether you want to run PV _and_
+ * legacy (IDE) drivers together. This combination is only possible
+ * under HVM.
+ */
+bool xen_has_pv_and_legacy_disk_devices(void)
+{
+ if (!xen_domain())
+ return false;
+
+ /* N.B. This is only ever used in HVM mode */
+ if (xen_pv_domain())
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
+
+void xen_unplug_emulated_devices(void)
+{
+ int r;
+
+ /* user explicitly requested no unplug */
+ if (xen_emul_unplug & XEN_UNPLUG_NEVER)
+ return;
+ /* check the version of the xen platform PCI device */
+ r = check_platform_magic();
+ /* If the version matches enable the Xen platform PCI driver.
+ * Also enable the Xen platform PCI driver if the host does
+ * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
+ * but the user told us that unplugging is unnecessary. */
+ if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
+ (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
+ return;
+ /* Set the default value of xen_emul_unplug depending on whether or
+ * not the Xen PV frontends and the Xen platform PCI driver have
+ * been compiled for this kernel (modules or built-in are both OK). */
+ if (!xen_emul_unplug) {
+ if (xen_must_unplug_nics()) {
+ printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated NICs.\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ }
+ if (xen_must_unplug_disks()) {
+ printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated disks.\n"
+ "You might have to change the root device\n"
+ "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
+ "in your root= kernel command line option\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ }
+ }
+ /* Now unplug the emulated devices */
+ if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
+ outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
+ xen_platform_pci_unplug = xen_emul_unplug;
+}
+
+static int __init parse_xen_emul_unplug(char *arg)
+{
+ char *p, *q;
+ int l;
+
+ for (p = arg; p; p = q) {
+ q = strchr(p, ',');
+ if (q) {
+ l = q - p;
+ q++;
+ } else {
+ l = strlen(p);
+ }
+ if (!strncmp(p, "all", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL;
+ else if (!strncmp(p, "ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ else if (!strncmp(p, "aux-ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
+ else if (!strncmp(p, "nics", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ else if (!strncmp(p, "unnecessary", l))
+ xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
+ else if (!strncmp(p, "never", l))
+ xen_emul_unplug |= XEN_UNPLUG_NEVER;
+ else
+ printk(KERN_WARNING "unrecognised option '%s' "
+ "in parameter 'xen_emul_unplug'\n", p);
+ }
+ return 0;
+}
+early_param("xen_emul_unplug", parse_xen_emul_unplug);
+#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2341492bf7a..2e555163c2f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -8,84 +8,622 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
+#include <linux/memblock.h>
+#include <linux/cpuidle.h>
+#include <linux/cpufreq.h>
#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
+#include <asm/acpi.h>
+#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
-
#include "xen-ops.h"
#include "vdso.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+#ifdef CONFIG_X86_64
+extern asmlinkage void nmi(void);
+#endif
+extern void xen_sysenter_target(void);
+extern void xen_syscall_target(void);
+extern void xen_syscall32_target(void);
+
+/* Amount of extra memory space we add to the e820 ranges */
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+/* Number of pages released from the initial allocation. */
+unsigned long xen_released_pages;
+
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+static void __init xen_add_extra_mem(u64 start, u64 size)
+{
+ unsigned long pfn;
+ int i;
+
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ /* Add new region. */
+ if (xen_extra_mem[i].size == 0) {
+ xen_extra_mem[i].start = start;
+ xen_extra_mem[i].size = size;
+ break;
+ }
+ /* Append to existing region. */
+ if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
+ xen_extra_mem[i].size += size;
+ break;
+ }
+ }
+ if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+ printk(KERN_WARNING "Warning: not enough extra memory regions\n");
+
+ memblock_reserve(start, size);
+
+ xen_max_p2m_pfn = PFN_DOWN(start + size);
+ for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
+ continue;
+ WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
+ pfn, mfn);
+
+ __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ }
+}
+
+static unsigned long __init xen_do_chunk(unsigned long start,
+ unsigned long end, bool release)
+{
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned long len = 0;
+ unsigned long pfn;
+ int ret;
+
+ for (pfn = start; pfn < end; pfn++) {
+ unsigned long frame;
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ if (release) {
+ /* Make sure pfn exists to start with */
+ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+ continue;
+ frame = mfn;
+ } else {
+ if (mfn != INVALID_P2M_ENTRY)
+ continue;
+ frame = pfn;
+ }
+ set_xen_guest_handle(reservation.extent_start, &frame);
+ reservation.nr_extents = 1;
+
+ ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
+ &reservation);
+ WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
+ release ? "release" : "populate", pfn, ret);
+
+ if (ret == 1) {
+ if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
+ if (release)
+ break;
+ set_xen_guest_handle(reservation.extent_start, &frame);
+ reservation.nr_extents = 1;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ break;
+ }
+ len++;
+ } else
+ break;
+ }
+ if (len)
+ printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
+ release ? "Freeing" : "Populating",
+ start, end, len,
+ release ? "freed" : "added");
+
+ return len;
+}
+
+static unsigned long __init xen_release_chunk(unsigned long start,
+ unsigned long end)
+{
+ return xen_do_chunk(start, end, true);
+}
+
+static unsigned long __init xen_populate_chunk(
+ const struct e820entry *list, size_t map_size,
+ unsigned long max_pfn, unsigned long *last_pfn,
+ unsigned long credits_left)
+{
+ const struct e820entry *entry;
+ unsigned int i;
+ unsigned long done = 0;
+ unsigned long dest_pfn;
+
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ unsigned long s_pfn;
+ unsigned long e_pfn;
+ unsigned long pfns;
+ long capacity;
+
+ if (credits_left <= 0)
+ break;
+
+ if (entry->type != E820_RAM)
+ continue;
+
+ e_pfn = PFN_DOWN(entry->addr + entry->size);
+
+ /* We only care about E820 after the xen_start_info->nr_pages */
+ if (e_pfn <= max_pfn)
+ continue;
+
+ s_pfn = PFN_UP(entry->addr);
+ /* If the E820 falls within the nr_pages, we want to start
+ * at the nr_pages PFN.
+ * If that would mean going past the E820 entry, skip it
+ */
+ if (s_pfn <= max_pfn) {
+ capacity = e_pfn - max_pfn;
+ dest_pfn = max_pfn;
+ } else {
+ capacity = e_pfn - s_pfn;
+ dest_pfn = s_pfn;
+ }
+
+ if (credits_left < capacity)
+ capacity = credits_left;
+
+ pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
+ done += pfns;
+ *last_pfn = (dest_pfn + pfns);
+ if (pfns < capacity)
+ break;
+ credits_left -= pfns;
+ }
+ return done;
+}
+
+static void __init xen_set_identity_and_release_chunk(
+ unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+ unsigned long *released, unsigned long *identity)
+{
+ unsigned long pfn;
+
+ /*
+ * If the PFNs are currently mapped, clear the mappings
+ * (except for the ISA region which must be 1:1 mapped) to
+ * release the refcounts (in Xen) on the original frames.
+ */
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
+ pte_t pte = __pte_ma(0);
+
+ if (pfn < PFN_UP(ISA_END_ADDRESS))
+ pte = mfn_pte(pfn, PAGE_KERNEL_IO);
+
+ (void)HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
+ }
+
+ if (start_pfn < nr_pages)
+ *released += xen_release_chunk(
+ start_pfn, min(end_pfn, nr_pages));
+
+ *identity += set_phys_range_identity(start_pfn, end_pfn);
+}
+
+static unsigned long __init xen_set_identity_and_release(
+ const struct e820entry *list, size_t map_size, unsigned long nr_pages)
+{
+ phys_addr_t start = 0;
+ unsigned long released = 0;
+ unsigned long identity = 0;
+ const struct e820entry *entry;
+ int i;
+
+ /*
+ * Combine non-RAM regions and gaps until a RAM region (or the
+ * end of the map) is reached, then set the 1:1 map and
+ * release the pages (if available) in those non-RAM regions.
+ *
+ * The combined non-RAM regions are rounded to a whole number
+ * of pages so any partial pages are accessible via the 1:1
+ * mapping. This is needed for some BIOSes that put (for
+ * example) the DMI tables in a reserved region that begins on
+ * a non-page boundary.
+ */
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ phys_addr_t end = entry->addr + entry->size;
+ if (entry->type == E820_RAM || i == map_size - 1) {
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ if (entry->type == E820_RAM)
+ end_pfn = PFN_UP(entry->addr);
+
+ if (start_pfn < end_pfn)
+ xen_set_identity_and_release_chunk(
+ start_pfn, end_pfn, nr_pages,
+ &released, &identity);
+
+ start = end;
+ }
+ }
+
+ if (released)
+ printk(KERN_INFO "Released %lu pages of unused memory\n", released);
+ if (identity)
+ printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
+
+ return released;
+}
+
+static unsigned long __init xen_get_max_pages(void)
+{
+ unsigned long max_pages = MAX_DOMAIN_PAGES;
+ domid_t domid = DOMID_SELF;
+ int ret;
+
+ /*
+ * For the initial domain we use the maximum reservation as
+ * the maximum page.
+ *
+ * For guest domains the current maximum reservation reflects
+ * the current maximum rather than the static maximum. In this
+ * case the e820 map provided to us will cover the static
+ * maximum region.
+ */
+ if (xen_initial_domain()) {
+ ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
+ if (ret > 0)
+ max_pages = ret;
+ }
+
+ return min(max_pages, MAX_DOMAIN_PAGES);
+}
+
+static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
+{
+ u64 end = start + size;
+
+ /* Align RAM regions to page boundaries. */
+ if (type == E820_RAM) {
+ start = PAGE_ALIGN(start);
+ end &= ~((u64)PAGE_SIZE - 1);
+ }
+
+ e820_add_region(start, end - start, type);
+}
+
+void xen_ignore_unusable(struct e820entry *list, size_t map_size)
+{
+ struct e820entry *entry;
+ unsigned int i;
-unsigned long *phys_to_machine_mapping;
-EXPORT_SYMBOL(phys_to_machine_mapping);
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ if (entry->type == E820_UNUSABLE)
+ entry->type = E820_RAM;
+ }
+}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
-
char * __init xen_memory_setup(void)
{
+ static struct e820entry map[E820MAX] __initdata;
+
unsigned long max_pfn = xen_start_info->nr_pages;
+ unsigned long long mem_end;
+ int rc;
+ struct xen_memory_map memmap;
+ unsigned long max_pages;
+ unsigned long last_pfn = 0;
+ unsigned long extra_pages = 0;
+ unsigned long populated;
+ int i;
+ int op;
+
+ max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+ mem_end = PFN_PHYS(max_pfn);
- e820.nr_map = 0;
- add_memory_region(0, LOWMEMSIZE(), E820_RAM);
- add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ op = xen_initial_domain() ?
+ XENMEM_machine_memory_map :
+ XENMEM_memory_map;
+ rc = HYPERVISOR_memory_op(op, &memmap);
+ if (rc == -ENOSYS) {
+ BUG_ON(xen_initial_domain());
+ memmap.nr_entries = 1;
+ map[0].addr = 0ULL;
+ map[0].size = mem_end;
+ /* 8MB slack (to balance backend allocations). */
+ map[0].size += 8ULL << 20;
+ map[0].type = E820_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
+
+ /*
+ * Xen won't allow a 1:1 mapping to be created to UNUSABLE
+ * regions, so if we're using the machine memory map leave the
+ * region as RAM as it is in the pseudo-physical map.
+ *
+ * UNUSABLE regions in domUs are not handled and will need
+ * a patch in the future.
+ */
+ if (xen_initial_domain())
+ xen_ignore_unusable(map, memmap.nr_entries);
+
+ /* Make sure the Xen-supplied memory map is well-ordered. */
+ sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+
+ max_pages = xen_get_max_pages();
+ if (max_pages > max_pfn)
+ extra_pages += max_pages - max_pfn;
+
+ /*
+ * Set P2M for all non-RAM pages and E820 gaps to be identity
+ * type PFNs. Any RAM pages that would be made inaccesible by
+ * this are first released.
+ */
+ xen_released_pages = xen_set_identity_and_release(
+ map, memmap.nr_entries, max_pfn);
+
+ /*
+ * Populate back the non-RAM pages and E820 gaps that had been
+ * released. */
+ populated = xen_populate_chunk(map, memmap.nr_entries,
+ max_pfn, &last_pfn, xen_released_pages);
+
+ xen_released_pages -= populated;
+ extra_pages += xen_released_pages;
+
+ if (last_pfn > max_pfn) {
+ max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
+ mem_end = PFN_PHYS(max_pfn);
+ }
+ /*
+ * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
+ * factor the base size. On non-highmem systems, the base
+ * size is the full initial memory allocation; on highmem it
+ * is limited to the max size of lowmem, so that it doesn't
+ * get completely filled.
+ *
+ * In principle there could be a problem in lowmem systems if
+ * the initial memory is also very large with respect to
+ * lowmem, but we won't try to deal with that here.
+ */
+ extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ extra_pages);
+ i = 0;
+ while (i < memmap.nr_entries) {
+ u64 addr = map[i].addr;
+ u64 size = map[i].size;
+ u32 type = map[i].type;
+
+ if (type == E820_RAM) {
+ if (addr < mem_end) {
+ size = min(size, mem_end - addr);
+ } else if (extra_pages) {
+ size = min(size, (u64)extra_pages * PAGE_SIZE);
+ extra_pages -= size / PAGE_SIZE;
+ xen_add_extra_mem(addr, size);
+ } else
+ type = E820_UNUSABLE;
+ }
+
+ xen_align_and_add_e820_region(addr, size, type);
+
+ map[i].addr += size;
+ map[i].size -= size;
+ if (map[i].size == 0)
+ i++;
+ }
+
+ /*
+ * Set the rest as identity mapped, in case PCI BARs are
+ * located here.
+ *
+ * PFNs above MAX_P2M_PFN are considered identity mapped as
+ * well.
+ */
+ set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+
+ /*
+ * In domU, the ISA region is normal, usable memory, but we
+ * reserve ISA memory anyway because too many things poke
+ * about in there.
+ */
+ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
+ E820_RESERVED);
+
+ /*
+ * Reserve Xen bits:
+ * - mfn_list
+ * - xen_start_info
+ * See comment above "struct start_info" in <xen/interface/xen.h>
+ * We tried to make the the memblock_reserve more selective so
+ * that it would be clear what region is reserved. Sadly we ran
+ * in the problem wherein on a 64-bit hypervisor with a 32-bit
+ * initial domain, the pt_base has the cr3 value which is not
+ * neccessarily where the pagetable starts! As Jan put it: "
+ * Actually, the adjustment turns out to be correct: The page
+ * tables for a 32-on-64 dom0 get allocated in the order "first L1",
+ * "first L2", "first L3", so the offset to the page table base is
+ * indeed 2. When reading xen/include/public/xen.h's comment
+ * very strictly, this is not a violation (since there nothing is said
+ * that the first thing in the page table space is pointed to by
+ * pt_base; I admit that this seems to be implied though, namely
+ * do I think that it is implied that the page table space is the
+ * range [pt_base, pt_base + nt_pt_frames), whereas that
+ * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
+ * which - without a priori knowledge - the kernel would have
+ * difficulty to figure out)." - so lets just fall back to the
+ * easy way and reserve the whole region.
+ */
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
+
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
return "Xen";
}
-static void xen_idle(void)
+/*
+ * Machine specific memory setup for auto-translated guests.
+ */
+char * __init xen_auto_xlated_memory_setup(void)
{
- local_irq_disable();
+ static struct e820entry map[E820MAX] __initdata;
- if (need_resched())
- local_irq_enable();
- else {
- current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
- safe_halt();
- current_thread_info()->status |= TS_POLLING;
- }
+ struct xen_memory_map memmap;
+ int i;
+ int rc;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc < 0)
+ panic("No memory map (%d)\n", rc);
+
+ sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+
+ for (i = 0; i < memmap.nr_entries; i++)
+ e820_add_region(map[i].addr, map[i].size, map[i].type);
+
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
+
+ return "Xen";
}
/*
* Set the bit indicating "nosegneg" library variants should be used.
+ * We only need to bother in pure 32-bit mode; compat 32-bit processes
+ * can have un-truncated segments, so wrapping around is allowed.
*/
static void __init fiddle_vdso(void)
{
- extern const char vdso32_default_start;
- u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
+#ifdef CONFIG_X86_32
+ /*
+ * This could be called before selected_vdso32 is initialized, so
+ * just fiddle with both possible images. vdso_image_32_syscall
+ * can't be selected, since it only exists on 64-bit systems.
+ */
+ u32 *mask;
+ mask = vdso_image_32_int80.data +
+ vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+ mask = vdso_image_32_sysenter.data +
+ vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
+ *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+#endif
}
-void __init xen_arch_setup(void)
+static int register_callback(unsigned type, const void *func)
{
- struct physdev_set_iopl set_iopl;
- int rc;
+ struct callback_register callback = {
+ .type = type,
+ .address = XEN_CALLBACK(__KERNEL_CS, func),
+ .flags = CALLBACKF_mask_events,
+ };
+
+ return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
+}
+
+void xen_enable_sysenter(void)
+{
+ int ret;
+ unsigned sysenter_feature;
+
+#ifdef CONFIG_X86_32
+ sysenter_feature = X86_FEATURE_SEP;
+#else
+ sysenter_feature = X86_FEATURE_SYSENTER32;
+#endif
+
+ if (!boot_cpu_has(sysenter_feature))
+ return;
+
+ ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
+ if(ret != 0)
+ setup_clear_cpu_cap(sysenter_feature);
+}
+
+void xen_enable_syscall(void)
+{
+#ifdef CONFIG_X86_64
+ int ret;
+ ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
+ if (ret != 0) {
+ printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
+ /* Pretty fatal; 64-bit userspace has no other
+ mechanism for syscalls. */
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
+ ret = register_callback(CALLBACKTYPE_syscall32,
+ xen_syscall32_target);
+ if (ret != 0)
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+ }
+#endif /* CONFIG_X86_64 */
+}
+
+void __init xen_pvmmu_arch_setup(void)
+{
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_pae_extended_cr3);
- HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
- __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+ if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+ register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
+ BUG();
- set_iopl.iopl = 1;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
- if (rc != 0)
- printk(KERN_INFO "physdev_op failed %d\n", rc);
+ xen_enable_sysenter();
+ xen_enable_syscall();
+}
+
+/* This function is not called for HVM domains */
+void __init xen_arch_setup(void)
+{
+ xen_panic_handler_init();
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ xen_pvmmu_arch_setup();
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
@@ -98,14 +636,12 @@ void __init xen_arch_setup(void)
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
- pm_idle = xen_idle;
-
-#ifdef CONFIG_SMP
- /* fill cpus_possible with all available cpus */
- xen_fill_possible_map();
-#endif
-
- paravirt_disable_iospace();
-
+ /* Set up idle, making sure it calls safe_halt() pvop */
+ disable_cpuidle();
+ disable_cpufreq();
+ WARN_ON(xen_set_default_idle());
fiddle_vdso();
+#ifdef CONFIG_NUMA
+ numa_off = 1;
+#endif
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index aafc5443740..7005974c3ff 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,12 +11,13 @@
* useful topology information for the kernel to make use of. As a
* result, all CPUs are treated as if they're single-core and
* single-threaded.
- *
- * This does not handle HOTPLUG_CPU yet.
*/
#include <linux/sched.h>
#include <linux/err.h>
+#include <linux/slab.h>
#include <linux/smp.h>
+#include <linux/irq_work.h>
+#include <linux/tick.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
@@ -29,229 +30,401 @@
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/events.h>
+#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "mmu.h"
-static cpumask_t cpu_initialized_map;
-static DEFINE_PER_CPU(int, resched_irq);
-static DEFINE_PER_CPU(int, callfunc_irq);
+cpumask_var_t xen_cpu_initialized_map;
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
+struct xen_common_irq {
+ int irq;
+ char *name;
};
+static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
-
-static struct call_data_struct *call_data;
+static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
+ inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
+
return IRQ_HANDLED;
}
-static __cpuinit void cpu_bringup_and_idle(void)
+static void cpu_bringup(void)
{
- int cpu = smp_processor_id();
+ int cpu;
cpu_init();
-
+ touch_softlockup_watchdog();
preempt_disable();
- per_cpu(cpu_state, cpu) = CPU_ONLINE;
+
+ /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
+ if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
+ xen_enable_sysenter();
+ xen_enable_syscall();
+ }
+ cpu = smp_processor_id();
+ smp_store_cpu_info(cpu);
+ cpu_data(cpu).x86_max_cores = 1;
+ set_cpu_sibling_map(cpu);
xen_setup_cpu_clockevents();
+ notify_cpu_starting(cpu);
+
+ set_cpu_online(cpu, true);
+
+ this_cpu_write(cpu_state, CPU_ONLINE);
+
+ wmb();
+
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
wmb(); /* make sure everything is out */
- cpu_idle();
}
+/* Note: cpu parameter is only relevant for PVH */
+static void cpu_bringup_and_idle(int cpu)
+{
+#ifdef CONFIG_X86_64
+ if (xen_feature(XENFEAT_auto_translated_physmap) &&
+ xen_feature(XENFEAT_supervisor_mode_kernel))
+ xen_pvh_secondary_vcpu_init(cpu);
+#endif
+ cpu_bringup();
+ cpu_startup_entry(CPUHP_ONLINE);
+}
+
+static void xen_smp_intr_free(unsigned int cpu)
+{
+ if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
+ per_cpu(xen_resched_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_resched_irq, cpu).name);
+ per_cpu(xen_resched_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL);
+ per_cpu(xen_callfunc_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_callfunc_irq, cpu).name);
+ per_cpu(xen_callfunc_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_debug_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL);
+ per_cpu(xen_debug_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_debug_irq, cpu).name);
+ per_cpu(xen_debug_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq,
+ NULL);
+ per_cpu(xen_callfuncsingle_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
+ }
+ if (xen_hvm_domain())
+ return;
+
+ if (per_cpu(xen_irq_work, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
+ per_cpu(xen_irq_work, cpu).irq = -1;
+ kfree(per_cpu(xen_irq_work, cpu).name);
+ per_cpu(xen_irq_work, cpu).name = NULL;
+ }
+};
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
- const char *resched_name, *callfunc_name;
-
- per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+ char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(resched_irq, cpu) = rc;
+ per_cpu(xen_resched_irq, cpu).irq = rc;
+ per_cpu(xen_resched_irq, cpu).name = resched_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_callfunc_irq, cpu).irq = rc;
+ per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
+
+ debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
+ IRQF_PERCPU | IRQF_NOBALANCING,
+ debug_name, NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_debug_irq, cpu).irq = rc;
+ per_cpu(xen_debug_irq, cpu).name = debug_name;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
+ cpu,
+ xen_call_function_single_interrupt,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
+ per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
+
+ /*
+ * The IRQ worker on PVHVM goes through the native path and uses the
+ * IPI mechanism.
+ */
+ if (xen_hvm_domain())
+ return 0;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
+ cpu,
+ xen_irq_work_interrupt,
+ IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(callfunc_irq, cpu) = rc;
+ per_cpu(xen_irq_work, cpu).irq = rc;
+ per_cpu(xen_irq_work, cpu).name = callfunc_name;
return 0;
fail:
- if (per_cpu(resched_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
- if (per_cpu(callfunc_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ xen_smp_intr_free(cpu);
return rc;
}
-void __init xen_fill_possible_map(void)
+static void __init xen_fill_possible_map(void)
{
int i, rc;
- for (i = 0; i < NR_CPUS; i++) {
+ if (xen_initial_domain())
+ return;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0)
- cpu_set(i, cpu_possible_map);
+ if (rc >= 0) {
+ num_processors++;
+ set_cpu_possible(i, true);
+ }
}
}
-void __init xen_smp_prepare_boot_cpu(void)
+static void __init xen_filter_cpu_maps(void)
{
- int cpu;
+ int i, rc;
+ unsigned int subtract = 0;
+
+ if (!xen_initial_domain())
+ return;
+ num_processors = 0;
+ disabled_cpus = 0;
+ for (i = 0; i < nr_cpu_ids; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0) {
+ num_processors++;
+ set_cpu_possible(i, true);
+ } else {
+ set_cpu_possible(i, false);
+ set_cpu_present(i, false);
+ subtract++;
+ }
+ }
+#ifdef CONFIG_HOTPLUG_CPU
+ /* This is akin to using 'nr_cpus' on the Linux command line.
+ * Which is OK as when we use 'dom0_max_vcpus=X' we can only
+ * have up to X, while nr_cpu_ids is greater than X. This
+ * normally is not a problem, except when CPU hotplugging
+ * is involved and then there might be more than X CPUs
+ * in the guest - which will not work as there is no
+ * hypercall to expand the max number of VCPUs an already
+ * running guest has. So cap it up to X. */
+ if (subtract)
+ nr_cpu_ids = nr_cpu_ids - subtract;
+#endif
+
+}
+
+static void __init xen_smp_prepare_boot_cpu(void)
+{
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
- /* We've switched to the "real" per-cpu gdt, so make sure the
- old memory can be recycled */
- make_lowmem_page_readwrite(&per_cpu__gdt_page);
+ if (xen_pv_domain()) {
+ if (!xen_feature(XENFEAT_writable_page_tables))
+ /* We've switched to the "real" per-cpu gdt, so make
+ * sure the old memory can be recycled. */
+ make_lowmem_page_readwrite(xen_initial_gdt);
- for_each_possible_cpu(cpu) {
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
+#ifdef CONFIG_X86_32
/*
- * cpu_core_map lives in a per cpu area that is cleared
- * when the per cpu array is allocated.
- *
- * cpus_clear(per_cpu(cpu_core_map, cpu));
+ * Xen starts us with XEN_FLAT_RING1_DS, but linux code
+ * expects __USER_DS
*/
- }
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
- xen_setup_vcpu_info_placement();
+ xen_filter_cpu_maps();
+ xen_setup_vcpu_info_placement();
+ }
+ /*
+ * The alternative logic (which patches the unlock/lock) runs before
+ * the smp bootup up code is activated. Hence we need to set this up
+ * the core kernel is being patched. Otherwise we will have only
+ * modules patched but not core code.
+ */
+ xen_init_spinlocks();
}
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
+ unsigned int i;
- for_each_possible_cpu(cpu) {
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- /*
- * cpu_core_ map will be zeroed when the per
- * cpu area is allocated.
- *
- * cpus_clear(per_cpu(cpu_core_map, cpu));
- */
+ if (skip_ioapic_setup) {
+ char *m = (max_cpus == 0) ?
+ "The nosmp parameter is incompatible with Xen; " \
+ "use Xen dom0_max_vcpus=1 parameter" :
+ "The noapic parameter is incompatible with Xen";
+
+ xen_raw_printk(m);
+ panic(m);
}
+ xen_init_lock_cpu(0);
- smp_store_cpu_info(0);
+ smp_store_boot_cpu_info();
+ cpu_data(0).x86_max_cores = 1;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ }
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
BUG();
- cpu_initialized_map = cpumask_of_cpu(0);
+ if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
+ panic("could not allocate xen_cpu_initialized_map\n");
+
+ cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
- for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+ for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
continue;
- cpu_clear(cpu, cpu_possible_map);
+ set_cpu_possible(cpu, false);
}
- for_each_possible_cpu (cpu) {
- struct task_struct *idle;
-
- if (cpu == 0)
- continue;
-
- idle = fork_idle(cpu);
- if (IS_ERR(idle))
- panic("failed fork for CPU %d", cpu);
-
- cpu_set(cpu, cpu_present_map);
- }
-
- //init_xenbus_allowed_cpumask();
+ for_each_possible_cpu(cpu)
+ set_cpu_present(cpu, true);
}
-static __cpuinit int
+static int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
- struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+ struct desc_struct *gdt;
+ unsigned long gdt_mfn;
- if (cpu_test_and_set(cpu, cpu_initialized_map))
+ if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL)
return -ENOMEM;
- ctxt->flags = VGCF_IN_KERNEL;
- ctxt->user_regs.ds = __USER_DS;
- ctxt->user_regs.es = __USER_DS;
+ gdt = get_cpu_gdt_table(cpu);
+
+#ifdef CONFIG_X86_32
+ /* Note: PVH is not yet supported on x86_32. */
ctxt->user_regs.fs = __KERNEL_PERCPU;
- ctxt->user_regs.gs = 0;
- ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
+#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
- ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
- xen_copy_trap_info(ctxt->trap_ctxt);
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.ss = __KERNEL_DS;
- ctxt->ldt_ents = 0;
+ xen_copy_trap_info(ctxt->trap_ctxt);
- BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
- make_lowmem_page_readonly(gdt->gdt);
+ ctxt->ldt_ents = 0;
- ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
- ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
+ BUG_ON((unsigned long)gdt & ~PAGE_MASK);
- ctxt->user_regs.cs = __KERNEL_CS;
- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+ gdt_mfn = arbitrary_virt_to_mfn(gdt);
+ make_lowmem_page_readonly(gdt);
+ make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
- ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.sp0;
+ ctxt->gdt_frames[0] = gdt_mfn;
+ ctxt->gdt_ents = GDT_ENTRIES;
- ctxt->event_callback_cs = __KERNEL_CS;
- ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
- ctxt->failsafe_callback_cs = __KERNEL_CS;
- ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = idle->thread.sp0;
- per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+#ifdef CONFIG_X86_32
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
+#else
+ ctxt->gs_base_kernel = per_cpu_offset(cpu);
+#endif
+ ctxt->event_callback_eip =
+ (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_eip =
+ (unsigned long)xen_failsafe_callback;
+ ctxt->user_regs.cs = __KERNEL_CS;
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+#ifdef CONFIG_X86_32
+ }
+#else
+ } else
+ /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
+ * %rdi having the cpu number - which means are passing in
+ * as the first parameter the cpu. Subtle!
+ */
+ ctxt->user_regs.rdi = cpu;
+#endif
+ ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
-
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
@@ -259,21 +432,25 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
}
-int __cpuinit xen_cpu_up(unsigned int cpu)
+static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
{
- struct task_struct *idle = idle_task(cpu);
int rc;
-#if 0
- rc = cpu_up_check(cpu);
- if (rc)
- return rc;
-#endif
-
- init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
+#ifdef CONFIG_X86_32
irq_ctx_init(cpu);
+#else
+ clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(idle) -
+ KERNEL_STACK_OFFSET + THREAD_SIZE;
+
+ xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
+ xen_init_lock_cpu(cpu);
+
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -283,29 +460,83 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
return rc;
if (num_online_cpus() == 1)
- alternatives_smp_switch(1);
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
- smp_store_cpu_info(cpu);
- set_cpu_sibling_map(cpu);
- /* This must be done before setting cpu_online_map */
- wmb();
-
- cpu_set(cpu, cpu_online_map);
-
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
+ while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+ barrier();
+ }
+
return 0;
}
-void xen_smp_cpus_done(unsigned int max_cpus)
+static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
+#ifdef CONFIG_HOTPLUG_CPU
+static int xen_cpu_disable(void)
+{
+ unsigned int cpu = smp_processor_id();
+ if (cpu == 0)
+ return -EBUSY;
+
+ cpu_disable_common();
+
+ load_cr3(swapper_pg_dir);
+ return 0;
+}
+
+static void xen_cpu_die(unsigned int cpu)
+{
+ while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
+ }
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+}
+
+static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
+{
+ play_dead_common();
+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+ cpu_bringup();
+ /*
+ * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
+ * clears certain data that the cpu_idle loop (which called us
+ * and that we return from) expects. The only way to get that
+ * data back is to call:
+ */
+ tick_nohz_idle_enter();
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static int xen_cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+static void xen_cpu_die(unsigned int cpu)
+{
+ BUG();
+}
+
+static void xen_play_dead(void)
+{
+ BUG();
+}
+
+#endif
static void stop_self(void *v)
{
int cpu = smp_processor_id();
@@ -314,108 +545,232 @@ static void stop_self(void *v)
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
+ set_cpu_online(cpu, false);
+
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
-void xen_smp_send_stop(void)
+static void xen_stop_other_cpus(int wait)
{
- smp_call_function(stop_self, NULL, 0, 0);
+ smp_call_function(stop_self, NULL, wait);
}
-void xen_smp_send_reschedule(int cpu)
+static void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
+static void __xen_send_IPI_mask(const struct cpumask *mask,
+ int vector)
+{
+ unsigned cpu;
+
+ for_each_cpu_and(cpu, mask, cpu_online_mask)
+ xen_send_IPI_one(cpu, vector);
+}
+
+static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
+{
+ int cpu;
+
+ __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+
+ /* Make sure other vcpus get a chance to run if they need to. */
+ for_each_cpu(cpu, mask) {
+ if (xen_vcpu_stolen(cpu)) {
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+ break;
+ }
+ }
+}
+
+static void xen_smp_send_call_function_single_ipi(int cpu)
+{
+ __xen_send_IPI_mask(cpumask_of(cpu),
+ XEN_CALL_FUNCTION_SINGLE_VECTOR);
+}
+
+static inline int xen_map_vector(int vector)
+{
+ int xen_vector;
+
+ switch (vector) {
+ case RESCHEDULE_VECTOR:
+ xen_vector = XEN_RESCHEDULE_VECTOR;
+ break;
+ case CALL_FUNCTION_VECTOR:
+ xen_vector = XEN_CALL_FUNCTION_VECTOR;
+ break;
+ case CALL_FUNCTION_SINGLE_VECTOR:
+ xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
+ break;
+ case IRQ_WORK_VECTOR:
+ xen_vector = XEN_IRQ_WORK_VECTOR;
+ break;
+#ifdef CONFIG_X86_64
+ case NMI_VECTOR:
+ case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */
+ xen_vector = XEN_NMI_VECTOR;
+ break;
+#endif
+ default:
+ xen_vector = -1;
+ printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
+ vector);
+ }
+
+ return xen_vector;
+}
+
+void xen_send_IPI_mask(const struct cpumask *mask,
+ int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ __xen_send_IPI_mask(mask, xen_vector);
+}
+
+void xen_send_IPI_all(int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ __xen_send_IPI_mask(cpu_online_mask, xen_vector);
+}
-static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+void xen_send_IPI_self(int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ xen_send_IPI_one(smp_processor_id(), xen_vector);
+}
+
+void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
{
unsigned cpu;
+ unsigned int this_cpu = smp_processor_id();
+ int xen_vector = xen_map_vector(vector);
- cpus_and(mask, mask, cpu_online_map);
+ if (!(num_online_cpus() > 1) || (xen_vector < 0))
+ return;
- for_each_cpu_mask(cpu, mask)
- xen_send_IPI_one(cpu, vector);
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ xen_send_IPI_one(cpu, xen_vector);
+ }
}
-static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
+void xen_send_IPI_allbutself(int vector)
{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
+ xen_send_IPI_mask_allbutself(cpu_online_mask, vector);
+}
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
+{
irq_enter();
- (*func)(info);
- __get_cpu_var(irq_stat).irq_call_count++;
+ generic_smp_call_function_interrupt();
+ inc_irq_stat(irq_call_count);
irq_exit();
- if (wait) {
- mb(); /* commit everything before setting finished */
- atomic_inc(&call_data->finished);
- }
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
+{
+ irq_enter();
+ generic_smp_call_function_single_interrupt();
+ inc_irq_stat(irq_call_count);
+ irq_exit();
return IRQ_HANDLED;
}
-int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
- void *info, int wait)
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
{
- struct call_data_struct data;
- int cpus, cpu;
- bool yield;
+ irq_enter();
+ irq_work_run();
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_exit();
- /* Holding any lock stops cpus from going down. */
- spin_lock(&call_lock);
+ return IRQ_HANDLED;
+}
- cpu_clear(smp_processor_id(), mask);
+static const struct smp_ops xen_smp_ops __initconst = {
+ .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = xen_smp_prepare_cpus,
+ .smp_cpus_done = xen_smp_cpus_done,
- cpus = cpus_weight(mask);
- if (!cpus) {
- spin_unlock(&call_lock);
- return 0;
- }
+ .cpu_up = xen_cpu_up,
+ .cpu_die = xen_cpu_die,
+ .cpu_disable = xen_cpu_disable,
+ .play_dead = xen_play_dead,
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
+ .stop_other_cpus = xen_stop_other_cpus,
+ .smp_send_reschedule = xen_smp_send_reschedule,
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
+ .send_call_func_ipi = xen_smp_send_call_function_ipi,
+ .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
- call_data = &data;
- mb(); /* write everything before IPI */
+void __init xen_smp_init(void)
+{
+ smp_ops = xen_smp_ops;
+ xen_fill_possible_map();
+}
- /* Send a message to other CPUs and wait for them to respond */
- xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ native_smp_prepare_cpus(max_cpus);
+ WARN_ON(xen_smp_intr_init(0));
- /* Make sure other vcpus get a chance to run if they need to. */
- yield = false;
- for_each_cpu_mask(cpu, mask)
- if (xen_vcpu_stolen(cpu))
- yield = true;
+ xen_init_lock_cpu(0);
+}
- if (yield)
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+ int rc;
+ /*
+ * xen_smp_intr_init() needs to run before native_cpu_up()
+ * so that IPI vectors are set up on the booting CPU before
+ * it is marked online in native_cpu_up().
+ */
+ rc = xen_smp_intr_init(cpu);
+ WARN_ON(rc);
+ if (!rc)
+ rc = native_cpu_up(cpu, tidle);
- /* Wait for response */
- while (atomic_read(&data.started) != cpus ||
- (wait && atomic_read(&data.finished) != cpus))
- cpu_relax();
+ /*
+ * We must initialize the slowpath CPU kicker _after_ the native
+ * path has executed. If we initialized it before none of the
+ * unlocker IPI kicks would reach the booting CPU as the booting
+ * CPU had not set itself 'online' in cpu_online_mask. That mask
+ * is checked when IPIs are sent (on HVM at least).
+ */
+ xen_init_lock_cpu(cpu);
+ return rc;
+}
- spin_unlock(&call_lock);
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+ xen_cpu_die(cpu);
+ native_cpu_die(cpu);
+}
- return 0;
+void __init xen_hvm_smp_init(void)
+{
+ if (!xen_have_vector_callback)
+ return;
+ smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+ smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+ smp_ops.cpu_up = xen_hvm_cpu_up;
+ smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+ smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+ smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
}
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
new file mode 100644
index 00000000000..c7c2d89efd7
--- /dev/null
+++ b/arch/x86/xen/smp.h
@@ -0,0 +1,11 @@
+#ifndef _XEN_SMP_H
+
+extern void xen_send_IPI_mask(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_allbutself(int vector);
+extern void xen_send_IPI_all(int vector);
+extern void xen_send_IPI_self(int vector);
+
+#endif
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 00000000000..0ba5f3b967f
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,348 @@
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/kernel_stat.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/log2.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/paravirt.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "debugfs.h"
+
+enum xen_contention_stat {
+ TAKEN_SLOW,
+ TAKEN_SLOW_PICKUP,
+ TAKEN_SLOW_SPURIOUS,
+ RELEASED_SLOW,
+ RELEASED_SLOW_KICKED,
+ NR_CONTENTION_STATS
+};
+
+
+#ifdef CONFIG_XEN_DEBUG_FS
+#define HISTO_BUCKETS 30
+static struct xen_spinlock_stats
+{
+ u32 contention_stats[NR_CONTENTION_STATS];
+ u32 histo_spin_blocked[HISTO_BUCKETS+1];
+ u64 time_blocked;
+} spinlock_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+ u8 ret;
+ u8 old = ACCESS_ONCE(zero_stats);
+ if (unlikely(old)) {
+ ret = cmpxchg(&zero_stats, old, 0);
+ /* This ensures only one fellow resets the stat */
+ if (ret == old)
+ memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+ }
+}
+
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+ check_zero();
+ spinlock_stats.contention_stats[var] += val;
+}
+
+static inline u64 spin_time_start(void)
+{
+ return xen_clocksource_read();
+}
+
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+ unsigned index = ilog2(delta);
+
+ check_zero();
+
+ if (index < HISTO_BUCKETS)
+ array[index]++;
+ else
+ array[HISTO_BUCKETS]++;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+ u32 delta = xen_clocksource_read() - start;
+
+ __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+ spinlock_stats.time_blocked += delta;
+}
+#else /* !CONFIG_XEN_DEBUG_FS */
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+}
+
+static inline u64 spin_time_start(void)
+{
+ return 0;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif /* CONFIG_XEN_DEBUG_FS */
+
+struct xen_lock_waiting {
+ struct arch_spinlock *lock;
+ __ticket_t want;
+};
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(char *, irq_name);
+static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
+static cpumask_t waiting_cpus;
+
+static bool xen_pvspin = true;
+__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+{
+ int irq = __this_cpu_read(lock_kicker_irq);
+ struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
+ int cpu = smp_processor_id();
+ u64 start;
+ unsigned long flags;
+
+ /* If kicker interrupts not initialized yet, just spin */
+ if (irq == -1)
+ return;
+
+ start = spin_time_start();
+
+ /*
+ * Make sure an interrupt handler can't upset things in a
+ * partially setup state.
+ */
+ local_irq_save(flags);
+ /*
+ * We don't really care if we're overwriting some other
+ * (lock,want) pair, as that would mean that we're currently
+ * in an interrupt context, and the outer context had
+ * interrupts enabled. That has already kicked the VCPU out
+ * of xen_poll_irq(), so it will just return spuriously and
+ * retry with newly setup (lock,want).
+ *
+ * The ordering protocol on this is that the "lock" pointer
+ * may only be set non-NULL if the "want" ticket is correct.
+ * If we're updating "want", we must first clear "lock".
+ */
+ w->lock = NULL;
+ smp_wmb();
+ w->want = want;
+ smp_wmb();
+ w->lock = lock;
+
+ /* This uses set_bit, which atomic and therefore a barrier */
+ cpumask_set_cpu(cpu, &waiting_cpus);
+ add_stats(TAKEN_SLOW, 1);
+
+ /* clear pending */
+ xen_clear_irq_pending(irq);
+
+ /* Only check lock once pending cleared */
+ barrier();
+
+ /*
+ * Mark entry to slowpath before doing the pickup test to make
+ * sure we don't deadlock with an unlocker.
+ */
+ __ticket_enter_slowpath(lock);
+
+ /*
+ * check again make sure it didn't become free while
+ * we weren't looking
+ */
+ if (ACCESS_ONCE(lock->tickets.head) == want) {
+ add_stats(TAKEN_SLOW_PICKUP, 1);
+ goto out;
+ }
+
+ /* Allow interrupts while blocked */
+ local_irq_restore(flags);
+
+ /*
+ * If an interrupt happens here, it will leave the wakeup irq
+ * pending, which will cause xen_poll_irq() to return
+ * immediately.
+ */
+
+ /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+ xen_poll_irq(irq);
+ add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
+
+ local_irq_save(flags);
+
+ kstat_incr_irq_this_cpu(irq);
+out:
+ cpumask_clear_cpu(cpu, &waiting_cpus);
+ w->lock = NULL;
+
+ local_irq_restore(flags);
+
+ spin_time_accum_blocked(start);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
+
+static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
+{
+ int cpu;
+
+ add_stats(RELEASED_SLOW, 1);
+
+ for_each_cpu(cpu, &waiting_cpus) {
+ const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
+
+ /* Make sure we read lock before want */
+ if (ACCESS_ONCE(w->lock) == lock &&
+ ACCESS_ONCE(w->want) == next) {
+ add_stats(RELEASED_SLOW_KICKED, 1);
+ xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+ break;
+ }
+ }
+}
+
+static irqreturn_t dummy_handler(int irq, void *dev_id)
+{
+ BUG();
+ return IRQ_HANDLED;
+}
+
+void xen_init_lock_cpu(int cpu)
+{
+ int irq;
+ char *name;
+
+ if (!xen_pvspin)
+ return;
+
+ WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
+ cpu, per_cpu(lock_kicker_irq, cpu));
+
+ name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+ irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+ cpu,
+ dummy_handler,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ name,
+ NULL);
+
+ if (irq >= 0) {
+ disable_irq(irq); /* make sure it's never delivered */
+ per_cpu(lock_kicker_irq, cpu) = irq;
+ per_cpu(irq_name, cpu) = name;
+ }
+
+ printk("cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+void xen_uninit_lock_cpu(int cpu)
+{
+ if (!xen_pvspin)
+ return;
+
+ unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+ per_cpu(lock_kicker_irq, cpu) = -1;
+ kfree(per_cpu(irq_name, cpu));
+ per_cpu(irq_name, cpu) = NULL;
+}
+
+
+/*
+ * Our init of PV spinlocks is split in two init functions due to us
+ * using paravirt patching and jump labels patching and having to do
+ * all of this before SMP code is invoked.
+ *
+ * The paravirt patching needs to be done _before_ the alternative asm code
+ * is started, otherwise we would not patch the core kernel code.
+ */
+void __init xen_init_spinlocks(void)
+{
+
+ if (!xen_pvspin) {
+ printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
+ return;
+ }
+ printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
+ pv_lock_ops.unlock_kick = xen_unlock_kick;
+}
+
+/*
+ * While the jump_label init code needs to happend _after_ the jump labels are
+ * enabled and before SMP is started. Hence we use pre-SMP initcall level
+ * init. We cannot do it in xen_init_spinlocks as that is done before
+ * jump labels are activated.
+ */
+static __init int xen_init_spinlocks_jump(void)
+{
+ if (!xen_pvspin)
+ return 0;
+
+ if (!xen_domain())
+ return 0;
+
+ static_key_slow_inc(&paravirt_ticketlocks_enabled);
+ return 0;
+}
+early_initcall(xen_init_spinlocks_jump);
+
+static __init int xen_parse_nopvspin(char *arg)
+{
+ xen_pvspin = false;
+ return 0;
+}
+early_param("xen_nopvspin", xen_parse_nopvspin);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_spin_debug;
+
+static int __init xen_spinlock_debugfs(void)
+{
+ struct dentry *d_xen = xen_init_debugfs();
+
+ if (d_xen == NULL)
+ return -ENOMEM;
+
+ if (!xen_pvspin)
+ return 0;
+
+ d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
+
+ debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
+
+ debugfs_create_u32("taken_slow", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[TAKEN_SLOW]);
+ debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
+ debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
+
+ debugfs_create_u32("released_slow", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[RELEASED_SLOW]);
+ debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
+
+ debugfs_create_u64("time_blocked", 0444, d_spin_debug,
+ &spinlock_stats.time_blocked);
+
+ debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+ spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+
+ return 0;
+}
+fs_initcall(xen_spinlock_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
new file mode 100644
index 00000000000..c4df9dbd63b
--- /dev/null
+++ b/arch/x86/xen/suspend.c
@@ -0,0 +1,97 @@
+#include <linux/types.h>
+#include <linux/clockchips.h>
+
+#include <xen/interface/xen.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+#include <asm/fixmap.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+
+static void xen_pv_pre_suspend(void)
+{
+ xen_mm_pin_all();
+
+ xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
+ xen_start_info->console.domU.mfn =
+ mfn_to_pfn(xen_start_info->console.domU.mfn);
+
+ BUG_ON(!irqs_disabled());
+
+ HYPERVISOR_shared_info = &xen_dummy_shared_info;
+ if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+ __pte_ma(0), 0))
+ BUG();
+}
+
+static void xen_hvm_post_suspend(int suspend_cancelled)
+{
+#ifdef CONFIG_XEN_PVHVM
+ int cpu;
+ xen_hvm_init_shared_info();
+ xen_callback_vector();
+ xen_unplug_emulated_devices();
+ if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ for_each_online_cpu(cpu) {
+ xen_setup_runstate_info(cpu);
+ }
+ }
+#endif
+}
+
+static void xen_pv_post_suspend(int suspend_cancelled)
+{
+ xen_build_mfn_list_list();
+
+ xen_setup_shared_info();
+
+ if (suspend_cancelled) {
+ xen_start_info->store_mfn =
+ pfn_to_mfn(xen_start_info->store_mfn);
+ xen_start_info->console.domU.mfn =
+ pfn_to_mfn(xen_start_info->console.domU.mfn);
+ } else {
+#ifdef CONFIG_SMP
+ BUG_ON(xen_cpu_initialized_map == NULL);
+ cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
+#endif
+ xen_vcpu_restore();
+ }
+
+ xen_mm_unpin_all();
+}
+
+void xen_arch_pre_suspend(void)
+{
+ if (xen_pv_domain())
+ xen_pv_pre_suspend();
+}
+
+void xen_arch_post_suspend(int cancelled)
+{
+ if (xen_pv_domain())
+ xen_pv_post_suspend(cancelled);
+ else
+ xen_hvm_post_suspend(cancelled);
+}
+
+static void xen_vcpu_notify_restore(void *data)
+{
+ unsigned long reason = (unsigned long)data;
+
+ /* Boot processor notified via generic timekeeping_resume() */
+ if ( smp_processor_id() == 0)
+ return;
+
+ clockevents_notify(reason, NULL);
+}
+
+void xen_arch_resume(void)
+{
+ on_each_cpu(xen_vcpu_notify_restore,
+ (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
+}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c39e1a5aa24..7b78f88c170 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -12,44 +12,34 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
+#include <linux/math64.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/pvclock_gtod.h>
+#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
+#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include "xen-ops.h"
-#define XEN_SHIFT 22
-
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
-static cycle_t xen_clocksource_read(void);
-
-/* These are perodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
-};
-
-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
-
/* runstate info updated by Xen */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
/* snapshots of runstate info */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
-/* unused ns of stolen and blocked time */
-static DEFINE_PER_CPU(u64, residual_stolen);
-static DEFINE_PER_CPU(u64, residual_blocked);
+/* unused ns of stolen time */
+static DEFINE_PER_CPU(u64, xen_residual_stolen);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
@@ -90,7 +80,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
BUG_ON(preemptible());
- state = &__get_cpu_var(runstate);
+ state = &__get_cpu_var(xen_runstate);
/*
* The runstate info is always updated by the hypervisor on
@@ -108,14 +98,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
- return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
+ return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
}
-static void setup_runstate_info(int cpu)
+void xen_setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
- area.addr.v = &per_cpu(runstate, cpu);
+ area.addr.v = &per_cpu(xen_runstate, cpu);
if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
cpu, &area))
@@ -126,249 +116,125 @@ static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
- s64 blocked, runnable, offline, stolen;
+ s64 runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
- snap = &__get_cpu_var(runstate_snapshot);
+ snap = &__get_cpu_var(xen_runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
- blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
*snap = state;
/* Add the appropriate number of ticks of stolen time,
- including any left-overs from last time. Passing NULL to
- account_steal_time accounts the time as stolen. */
- stolen = runnable + offline + __get_cpu_var(residual_stolen);
+ including any left-overs from last time. */
+ stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
if (stolen < 0)
stolen = 0;
- ticks = 0;
- while (stolen >= NS_PER_TICK) {
- ticks++;
- stolen -= NS_PER_TICK;
- }
- __get_cpu_var(residual_stolen) = stolen;
- account_steal_time(NULL, ticks);
-
- /* Add the appropriate number of ticks of blocked time,
- including any left-overs from last time. Passing idle to
- account_steal_time accounts the time as idle/wait. */
- blocked += __get_cpu_var(residual_blocked);
-
- if (blocked < 0)
- blocked = 0;
-
- ticks = 0;
- while (blocked >= NS_PER_TICK) {
- ticks++;
- blocked -= NS_PER_TICK;
- }
- __get_cpu_var(residual_blocked) = blocked;
- account_steal_time(idle_task(smp_processor_id()), ticks);
+ ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
+ __this_cpu_write(xen_residual_stolen, stolen);
+ account_steal_ticks(ticks);
}
-/*
- * Xen sched_clock implementation. Returns the number of unstolen
- * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
- * states.
- */
-unsigned long long xen_sched_clock(void)
+/* Get the TSC speed from Xen */
+static unsigned long xen_tsc_khz(void)
{
- struct vcpu_runstate_info state;
- cycle_t now;
- u64 ret;
- s64 offset;
-
- /*
- * Ideally sched_clock should be called on a per-cpu basis
- * anyway, so preempt should already be disabled, but that's
- * not current practice at the moment.
- */
- preempt_disable();
-
- now = xen_clocksource_read();
-
- get_runstate_snapshot(&state);
-
- WARN_ON(state.state != RUNSTATE_running);
-
- offset = now - state.state_entry_time;
- if (offset < 0)
- offset = 0;
-
- ret = state.time[RUNSTATE_blocked] +
- state.time[RUNSTATE_running] +
- offset;
-
- preempt_enable();
-
- return ret;
-}
-
-
-/* Get the CPU speed from Xen */
-unsigned long xen_cpu_khz(void)
-{
- u64 xen_khz = 1000000ULL << 32;
- const struct vcpu_time_info *info =
+ struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
- do_div(xen_khz, info->tsc_to_system_mul);
- if (info->tsc_shift < 0)
- xen_khz <<= -info->tsc_shift;
- else
- xen_khz >>= info->tsc_shift;
-
- return xen_khz;
+ return pvclock_tsc_khz(info);
}
-/*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.
- */
-static unsigned get_time_values_from_xen(void)
+cycle_t xen_clocksource_read(void)
{
- struct vcpu_time_info *src;
- struct shadow_time_info *dst;
+ struct pvclock_vcpu_time_info *src;
+ cycle_t ret;
- /* src is shared memory with the hypervisor, so we need to
- make sure we get a consistent snapshot, even in the face of
- being preempted. */
+ preempt_disable_notrace();
src = &__get_cpu_var(xen_vcpu)->time;
- dst = &__get_cpu_var(shadow_time);
-
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) | (dst->version ^ src->version));
-
- return dst->version;
+ ret = pvclock_clocksource_read(src);
+ preempt_enable_notrace();
+ return ret;
}
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
+ return xen_clocksource_read();
+}
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
+static void xen_read_wallclock(struct timespec *ts)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct pvclock_wall_clock *wall_clock = &(s->wc);
+ struct pvclock_vcpu_time_info *vcpu_time;
- return product;
+ vcpu_time = &get_cpu_var(xen_vcpu)->time;
+ pvclock_read_wallclock(wall_clock, vcpu_time, ts);
+ put_cpu_var(xen_vcpu);
}
-static u64 get_nsec_offset(struct shadow_time_info *shadow)
+static void xen_get_wallclock(struct timespec *now)
{
- u64 now, delta;
- now = native_read_tsc();
- delta = now - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+ xen_read_wallclock(now);
}
-static cycle_t xen_clocksource_read(void)
+static int xen_set_wallclock(const struct timespec *now)
{
- struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
- cycle_t ret;
- unsigned version;
-
- do {
- version = get_time_values_from_xen();
- barrier();
- ret = shadow->system_timestamp + get_nsec_offset(shadow);
- barrier();
- } while (version != __get_cpu_var(xen_vcpu)->time.version);
-
- put_cpu_var(shadow_time);
-
- return ret;
+ return -1;
}
-static void xen_read_wallclock(struct timespec *ts)
+static int xen_pvclock_gtod_notify(struct notifier_block *nb,
+ unsigned long was_set, void *priv)
{
- const struct shared_info *s = HYPERVISOR_shared_info;
- u32 version;
- u64 delta;
- struct timespec now;
+ /* Protected by the calling core code serialization */
+ static struct timespec next_sync;
- /* get wallclock at system boot */
- do {
- version = s->wc_version;
- rmb(); /* fetch version before time */
- now.tv_sec = s->wc_sec;
- now.tv_nsec = s->wc_nsec;
- rmb(); /* fetch time before checking version */
- } while ((s->wc_version & 1) | (version ^ s->wc_version));
+ struct xen_platform_op op;
+ struct timespec now;
- delta = xen_clocksource_read(); /* time since system boot */
- delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+ now = __current_kernel_time();
- now.tv_nsec = do_div(delta, NSEC_PER_SEC);
- now.tv_sec = delta;
+ /*
+ * We only take the expensive HV call when the clock was set
+ * or when the 11 minutes RTC synchronization time elapsed.
+ */
+ if (!was_set && timespec_compare(&now, &next_sync) < 0)
+ return NOTIFY_OK;
- set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
-}
+ op.cmd = XENPF_settime;
+ op.u.settime.secs = now.tv_sec;
+ op.u.settime.nsecs = now.tv_nsec;
+ op.u.settime.system_time = xen_clocksource_read();
-unsigned long xen_get_wallclock(void)
-{
- struct timespec ts;
+ (void)HYPERVISOR_dom0_op(&op);
- xen_read_wallclock(&ts);
+ /*
+ * Move the next drift compensation time 11 minutes
+ * ahead. That's emulating the sync_cmos_clock() update for
+ * the hardware RTC.
+ */
+ next_sync = now;
+ next_sync.tv_sec += 11 * 60;
- return ts.tv_sec;
+ return NOTIFY_OK;
}
-int xen_set_wallclock(unsigned long now)
-{
- /* do nothing for domU */
- return -1;
-}
+static struct notifier_block xen_pvclock_gtod_notifier = {
+ .notifier_call = xen_pvclock_gtod_notify,
+};
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
- .read = xen_clocksource_read,
+ .read = xen_clocksource_get_cycles,
.mask = ~0,
- .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
- .shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -522,11 +388,16 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
static const struct clock_event_device *xen_clockevent =
&xen_timerop_clockevent;
-static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
+
+struct xen_clock_event_device {
+ struct clock_event_device evt;
+ char *name;
+};
+static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
- struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
+ struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt;
irqreturn_t ret;
ret = IRQ_NONE;
@@ -540,12 +411,31 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
return ret;
}
+void xen_teardown_timer(int cpu)
+{
+ struct clock_event_device *evt;
+ BUG_ON(cpu == 0);
+ evt = &per_cpu(xen_clock_events, cpu).evt;
+
+ if (evt->irq >= 0) {
+ unbind_from_irqhandler(evt->irq, NULL);
+ evt->irq = -1;
+ kfree(per_cpu(xen_clock_events, cpu).name);
+ per_cpu(xen_clock_events, cpu).name = NULL;
+ }
+}
+
void xen_setup_timer(int cpu)
{
- const char *name;
+ char *name;
struct clock_event_device *evt;
int irq;
+ evt = &per_cpu(xen_clock_events, cpu).evt;
+ WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
+ if (evt->irq >= 0)
+ xen_teardown_timer(cpu);
+
printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
name = kasprintf(GFP_KERNEL, "timer%d", cpu);
@@ -553,32 +443,51 @@ void xen_setup_timer(int cpu)
name = "<timer kasprintf failed>";
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
+ IRQF_FORCE_RESUME,
name, NULL);
+ (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
- evt = &per_cpu(xen_clock_events, cpu);
memcpy(evt, xen_clockevent, sizeof(*evt));
- evt->cpumask = cpumask_of_cpu(cpu);
+ evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
-
- setup_runstate_info(cpu);
+ per_cpu(xen_clock_events, cpu).name = name;
}
+
void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
- clockevents_register_device(&__get_cpu_var(xen_clock_events));
+ clockevents_register_device(&__get_cpu_var(xen_clock_events).evt);
}
-__init void xen_time_init(void)
+void xen_timer_resume(void)
{
- int cpu = smp_processor_id();
+ int cpu;
- get_time_values_from_xen();
+ pvclock_resume();
- clocksource_register(&xen_clocksource);
+ if (xen_clockevent != &xen_vcpuop_clockevent)
+ return;
+
+ for_each_online_cpu(cpu) {
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
+ }
+}
+
+static const struct pv_time_ops xen_time_ops __initconst = {
+ .sched_clock = xen_clocksource_read,
+};
+
+static void __init xen_time_init(void)
+{
+ int cpu = smp_processor_id();
+ struct timespec tp;
+
+ clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
@@ -588,12 +497,66 @@ __init void xen_time_init(void)
}
/* Set initial system time with full resolution */
- xen_read_wallclock(&xtime);
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
+ xen_read_wallclock(&tp);
+ do_settimeofday(&tp);
setup_force_cpu_cap(X86_FEATURE_TSC);
+ xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
+
+ if (xen_initial_domain())
+ pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
+
+void __init xen_init_time_ops(void)
+{
+ pv_time_ops = xen_time_ops;
+
+ x86_init.timers.timer_init = xen_time_init;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
+ x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ /* Dom0 uses the native method to set the hardware RTC. */
+ if (!xen_initial_domain())
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_setup_cpu_clockevents(void)
+{
+ int cpu = smp_processor_id();
+ xen_setup_runstate_info(cpu);
+ /*
+ * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
+ * doing it xen_hvm_cpu_notify (which gets called by smp_init during
+ * early bootup and also during CPU hotplug events).
+ */
+ xen_setup_cpu_clockevents();
+}
+
+void __init xen_hvm_init_time_ops(void)
+{
+ /* vector callback is needed otherwise we cannot receive interrupts
+ * on cpu > 0 and at this point we don't know how many cpus are
+ * available */
+ if (!xen_have_vector_callback)
+ return;
+ if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
+ "disable pv timer\n");
+ return;
+ }
+
+ pv_time_ops = xen_time_ops;
+ x86_init.timers.setup_percpu_clockev = xen_time_init;
+ x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+#endif
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
new file mode 100644
index 00000000000..520022d1a18
--- /dev/null
+++ b/arch/x86/xen/trace.c
@@ -0,0 +1,62 @@
+#include <linux/ftrace.h>
+#include <xen/interface/xen.h>
+
+#define N(x) [__HYPERVISOR_##x] = "("#x")"
+static const char *xen_hypercall_names[] = {
+ N(set_trap_table),
+ N(mmu_update),
+ N(set_gdt),
+ N(stack_switch),
+ N(set_callbacks),
+ N(fpu_taskswitch),
+ N(sched_op_compat),
+ N(dom0_op),
+ N(set_debugreg),
+ N(get_debugreg),
+ N(update_descriptor),
+ N(memory_op),
+ N(multicall),
+ N(update_va_mapping),
+ N(set_timer_op),
+ N(event_channel_op_compat),
+ N(xen_version),
+ N(console_io),
+ N(physdev_op_compat),
+ N(grant_table_op),
+ N(vm_assist),
+ N(update_va_mapping_otherdomain),
+ N(iret),
+ N(vcpu_op),
+ N(set_segment_base),
+ N(mmuext_op),
+ N(acm_op),
+ N(nmi_op),
+ N(sched_op),
+ N(callback_op),
+ N(xenoprof_op),
+ N(event_channel_op),
+ N(physdev_op),
+ N(hvm_op),
+
+/* Architecture-specific hypercall definitions. */
+ N(arch_0),
+ N(arch_1),
+ N(arch_2),
+ N(arch_3),
+ N(arch_4),
+ N(arch_5),
+ N(arch_6),
+ N(arch_7),
+};
+#undef N
+
+static const char *xen_hypercall_name(unsigned op)
+{
+ if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL)
+ return xen_hypercall_names[op];
+
+ return "";
+}
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/xen.h>
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 00000000000..6722e3733f0
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,74 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+ struct screen_info *screen_info = &boot_params.screen_info;
+
+ /* This is drawn from a dump from vgacon:startup in
+ * standard Linux. */
+ screen_info->orig_video_mode = 3;
+ screen_info->orig_video_isVGA = 1;
+ screen_info->orig_video_lines = 25;
+ screen_info->orig_video_cols = 80;
+ screen_info->orig_video_ega_bx = 3;
+ screen_info->orig_video_points = 16;
+ screen_info->orig_y = screen_info->orig_video_lines - 1;
+
+ switch (info->video_type) {
+ case XEN_VGATYPE_TEXT_MODE_3:
+ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ + sizeof(info->u.text_mode_3))
+ break;
+ screen_info->orig_video_lines = info->u.text_mode_3.rows;
+ screen_info->orig_video_cols = info->u.text_mode_3.columns;
+ screen_info->orig_x = info->u.text_mode_3.cursor_x;
+ screen_info->orig_y = info->u.text_mode_3.cursor_y;
+ screen_info->orig_video_points =
+ info->u.text_mode_3.font_height;
+ break;
+
+ case XEN_VGATYPE_EFI_LFB:
+ case XEN_VGATYPE_VESA_LFB:
+ if (size < offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps))
+ break;
+ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+ screen_info->lfb_width = info->u.vesa_lfb.width;
+ screen_info->lfb_height = info->u.vesa_lfb.height;
+ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+ screen_info->red_size = info->u.vesa_lfb.red_size;
+ screen_info->red_pos = info->u.vesa_lfb.red_pos;
+ screen_info->green_size = info->u.vesa_lfb.green_size;
+ screen_info->green_pos = info->u.vesa_lfb.green_pos;
+ screen_info->blue_size = info->u.vesa_lfb.blue_size;
+ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+
+ if (info->video_type == XEN_VGATYPE_EFI_LFB) {
+ screen_info->orig_video_isVGA = VIDEO_TYPE_EFI;
+ break;
+ }
+
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps)
+ + sizeof(info->u.vesa_lfb.gbl_caps))
+ screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.mode_attrs)
+ + sizeof(info->u.vesa_lfb.mode_attrs))
+ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+ break;
+ }
+}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 6b7190449d0..3e45aa00071 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -1,47 +1,39 @@
/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
*/
-#include <linux/linkage.h>
-
#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
-#include <asm/segment.h>
-
-#include <xen/interface/xen.h>
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
+#include "xen-asm.h"
/*
- Enable events. This clears the event mask and tests the pending
- event status with one and operation. If there are pending
- events, then enter the hypervisor to get them handled.
+ * Enable events. This clears the event mask and tests the pending
+ * event status with one and operation. If there are pending events,
+ * then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
/* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
2: call check_events
@@ -53,29 +45,29 @@ ENDPATCH(xen_irq_enable_direct)
/*
- Disabling events is simply a matter of making the event mask
- non-zero.
+ * Disabling events is simply a matter of making the event mask
+ * non-zero.
*/
ENTRY(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
- (xen_)save_fl is used to get the current interrupt enable status.
- Callers expect the status to be in X86_EFLAGS_IF, and other bits
- may be set in the return value. We take advantage of this by
- making sure that X86_EFLAGS_IF has the right value (and other bits
- in that byte are 0), but other bits in the return value are
- undefined. We need to toggle the state of the bit, because
- Xen and x86 use opposite senses (mask vs enable).
+ * (xen_)save_fl is used to get the current interrupt enable status.
+ * Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ * may be set in the return value. We take advantage of this by
+ * making sure that X86_EFLAGS_IF has the right value (and other bits
+ * in that byte are 0), but other bits in the return value are
+ * undefined. We need to toggle the state of the bit, because Xen and
+ * x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
- addb %ah,%ah
+ addb %ah, %ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
@@ -83,23 +75,28 @@ ENDPATCH(xen_save_fl_direct)
/*
- In principle the caller should be passing us a value return
- from xen_save_fl_direct, but for robustness sake we test only
- the X86_EFLAGS_IF flag rather than the whole byte. After
- setting the interrupt mask state, it checks for unmasked
- pending events and enters the hypervisor to get them delivered
- if so.
+ * In principle the caller should be passing us a value return from
+ * xen_save_fl_direct, but for robustness sake we test only the
+ * X86_EFLAGS_IF flag rather than the whole byte. After setting the
+ * interrupt mask state, it checks for unmasked pending events and
+ * enters the hypervisor to get them delivered if so.
*/
ENTRY(xen_restore_fl_direct)
+#ifdef CONFIG_X86_64
+ testw $X86_EFLAGS_IF, %di
+#else
testb $X86_EFLAGS_IF>>8, %ah
- setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
+#endif
+ setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
/* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- jz 1f
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ jnz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
@@ -107,190 +104,39 @@ ENDPATCH(xen_restore_fl_direct)
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
-/*
- This is run where a normal iret would be run, with the same stack setup:
- 8: eflags
- 4: cs
- esp-> 0: eip
-
- This attempts to make sure that any pending events are dealt
- with on return to usermode, but there is a small window in
- which an event can happen just before entering usermode. If
- the nested interrupt ends up setting one of the TIF_WORK_MASK
- pending work flags, they will not be tested again before
- returning to usermode. This means that a process can end up
- with pending work, which will be unprocessed until the process
- enters and leaves the kernel again, which could be an
- unbounded amount of time. This means that a pending signal or
- reschedule event could be indefinitely delayed.
-
- The fix is to notice a nested interrupt in the critical
- window, and if one occurs, then fold the nested interrupt into
- the current interrupt stack frame, and re-process it
- iteratively rather than recursively. This means that it will
- exit via the normal path, and all pending work will be dealt
- with appropriately.
-
- Because the nested interrupt handler needs to deal with the
- current stack state in whatever form its in, we keep things
- simple by only using a single register which is pushed/popped
- on the stack.
-
- Non-direct iret could be done in the same way, but it would
- require an annoying amount of code duplication. We'll assume
- that direct mode will be the common case once the hypervisor
- support becomes commonplace.
- */
-ENTRY(xen_iret_direct)
- /* test eflags for special cases */
- testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
- jnz hyper_iret
-
- push %eax
- ESP_OFFSET=4 # bytes pushed onto stack
-
- /* Store vcpu_info pointer for easy access. Do it this
- way to avoid having to reload %fs */
-#ifdef CONFIG_SMP
- GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax),%eax
- movl __per_cpu_offset(,%eax,4),%eax
- lea per_cpu__xen_vcpu_info(%eax),%eax
-#else
- movl $per_cpu__xen_vcpu_info, %eax
-#endif
-
- /* check IF state we're restoring */
- testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-
- /* Maybe enable events. Once this happens we could get a
- recursive event, so the critical region starts immediately
- afterwards. However, if that happens we don't end up
- resuming the code, so we don't have to be worried about
- being preempted to another CPU. */
- setz XEN_vcpu_info_mask(%eax)
-xen_iret_start_crit:
-
- /* check for unmasked and pending */
- cmpw $0x0001, XEN_vcpu_info_pending(%eax)
-
- /* If there's something pending, mask events again so we
- can jump back into xen_hypervisor_callback */
- sete XEN_vcpu_info_mask(%eax)
-
- popl %eax
-
- /* From this point on the registers are restored and the stack
- updated, so we don't need to worry about it if we're preempted */
-iret_restore_end:
-
- /* Jump to hypervisor_callback after fixing up the stack.
- Events are masked, so jumping out of the critical
- region is OK. */
- je xen_hypervisor_callback
-
- iret
-xen_iret_end_crit:
-
-hyper_iret:
- /* put this out of line since its very rarely used */
- jmp hypercall_page + __HYPERVISOR_iret * 32
-
- .globl xen_iret_start_crit, xen_iret_end_crit
-
-/*
- This is called by xen_hypervisor_callback in entry.S when it sees
- that the EIP at the time of interrupt was between xen_iret_start_crit
- and xen_iret_end_crit. We're passed the EIP in %eax so we can do
- a more refined determination of what to do.
-
- The stack format at this point is:
- ----------------
- ss : (ss/esp may be present if we came from usermode)
- esp :
- eflags } outer exception info
- cs }
- eip }
- ---------------- <- edi (copy dest)
- eax : outer eax if it hasn't been restored
- ----------------
- eflags } nested exception info
- cs } (no ss/esp because we're nested
- eip } from the same ring)
- orig_eax }<- esi (copy src)
- - - - - - - - -
- fs }
- es }
- ds } SAVE_ALL state
- eax }
- : :
- ebx }
- ----------------
- return addr <- esp
- ----------------
-
- In order to deliver the nested exception properly, we need to shift
- everything from the return addr up to the error code so it
- sits just under the outer exception info. This means that when we
- handle the exception, we do it in the context of the outer exception
- rather than starting a new one.
-
- The only caveat is that if the outer eax hasn't been
- restored yet (ie, it's still on stack), we need to insert
- its value into the SAVE_ALL state before going on, since
- it's usermode state which we eventually need to restore.
- */
-ENTRY(xen_iret_crit_fixup)
- /* offsets +4 for return address */
-
- /*
- Paranoia: Make sure we're really coming from userspace.
- One could imagine a case where userspace jumps into the
- critical range address, but just before the CPU delivers a GP,
- it decides to deliver an interrupt instead. Unlikely?
- Definitely. Easy to avoid? Yes. The Intel documents
- explicitly say that the reported EIP for a bad jump is the
- jump instruction itself, not the destination, but some virtual
- environments get this wrong.
- */
- movl PT_CS+4(%esp), %ecx
- andl $SEGMENT_RPL_MASK, %ecx
- cmpl $USER_RPL, %ecx
- je 2f
-
- lea PT_ORIG_EAX+4(%esp), %esi
- lea PT_EFLAGS+4(%esp), %edi
-
- /* If eip is before iret_restore_end then stack
- hasn't been restored yet. */
- cmp $iret_restore_end, %eax
- jae 1f
-
- movl 0+4(%edi),%eax /* copy EAX */
- movl %eax, PT_EAX+4(%esp)
-
- lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
-
- /* set up the copy */
-1: std
- mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
- rep movsl
- cld
-
- lea 4(%edi),%esp /* point esp to new frame */
-2: ret
-
/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
*/
check_events:
+#ifdef CONFIG_X86_32
push %eax
push %ecx
push %edx
- call force_evtchn_callback
+ call xen_force_evtchn_callback
pop %edx
pop %ecx
pop %eax
+#else
+ push %rax
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ call xen_force_evtchn_callback
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %rax
+#endif
ret
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 00000000000..465276467a4
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
+#ifndef _XEN_XEN_ASM_H
+#define _XEN_XEN_ASM_H
+
+#include <linux/linkage.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
new file mode 100644
index 00000000000..fd92a64d748
--- /dev/null
+++ b/arch/x86/xen/xen-asm_32.S
@@ -0,0 +1,237 @@
+/*
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in pda) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
+ */
+
+#include <asm/thread_info.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+#include <asm/asm.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-asm.h"
+
+/*
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
+ */
+check_events:
+ push %eax
+ push %ecx
+ push %edx
+ call xen_force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+ ret
+
+/*
+ * We can't use sysexit directly, because we're not running in ring0.
+ * But we can easily fake it up using iret. Assuming xen_sysexit is
+ * jumped to with a standard stack frame, we can just strip it back to
+ * a standard iret frame and use iret.
+ */
+ENTRY(xen_sysexit)
+ movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
+ orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
+ lea PT_EIP(%esp), %esp
+
+ jmp xen_iret
+ENDPROC(xen_sysexit)
+
+/*
+ * This is run where a normal iret would be run, with the same stack setup:
+ * 8: eflags
+ * 4: cs
+ * esp-> 0: eip
+ *
+ * This attempts to make sure that any pending events are dealt with
+ * on return to usermode, but there is a small window in which an
+ * event can happen just before entering usermode. If the nested
+ * interrupt ends up setting one of the TIF_WORK_MASK pending work
+ * flags, they will not be tested again before returning to
+ * usermode. This means that a process can end up with pending work,
+ * which will be unprocessed until the process enters and leaves the
+ * kernel again, which could be an unbounded amount of time. This
+ * means that a pending signal or reschedule event could be
+ * indefinitely delayed.
+ *
+ * The fix is to notice a nested interrupt in the critical window, and
+ * if one occurs, then fold the nested interrupt into the current
+ * interrupt stack frame, and re-process it iteratively rather than
+ * recursively. This means that it will exit via the normal path, and
+ * all pending work will be dealt with appropriately.
+ *
+ * Because the nested interrupt handler needs to deal with the current
+ * stack state in whatever form its in, we keep things simple by only
+ * using a single register which is pushed/popped on the stack.
+ */
+
+.macro POP_FS
+1:
+ popw %fs
+.pushsection .fixup, "ax"
+2: movw $0, (%esp)
+ jmp 1b
+.popsection
+ _ASM_EXTABLE(1b,2b)
+.endm
+
+ENTRY(xen_iret)
+ /* test eflags for special cases */
+ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+ jnz hyper_iret
+
+ push %eax
+ ESP_OFFSET=4 # bytes pushed onto stack
+
+ /* Store vcpu_info pointer for easy access */
+#ifdef CONFIG_SMP
+ pushw %fs
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+ movl %fs:xen_vcpu, %eax
+ POP_FS
+#else
+ movl %ss:xen_vcpu, %eax
+#endif
+
+ /* check IF state we're restoring */
+ testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+ /*
+ * Maybe enable events. Once this happens we could get a
+ * recursive event, so the critical region starts immediately
+ * afterwards. However, if that happens we don't end up
+ * resuming the code, so we don't have to be worried about
+ * being preempted to another CPU.
+ */
+ setz %ss:XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)
+
+ /*
+ * If there's something pending, mask events again so we can
+ * jump back into xen_hypervisor_callback. Otherwise do not
+ * touch XEN_vcpu_info_mask.
+ */
+ jne 1f
+ movb $1, %ss:XEN_vcpu_info_mask(%eax)
+
+1: popl %eax
+
+ /*
+ * From this point on the registers are restored and the stack
+ * updated, so we don't need to worry about it if we're
+ * preempted
+ */
+iret_restore_end:
+
+ /*
+ * Jump to hypervisor_callback after fixing up the stack.
+ * Events are masked, so jumping out of the critical region is
+ * OK.
+ */
+ je xen_hypervisor_callback
+
+1: iret
+xen_iret_end_crit:
+ _ASM_EXTABLE(1b, iret_exc)
+
+hyper_iret:
+ /* put this out of line since its very rarely used */
+ jmp hypercall_page + __HYPERVISOR_iret * 32
+
+ .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+ * This is called by xen_hypervisor_callback in entry.S when it sees
+ * that the EIP at the time of interrupt was between
+ * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
+ * %eax so we can do a more refined determination of what to do.
+ *
+ * The stack format at this point is:
+ * ----------------
+ * ss : (ss/esp may be present if we came from usermode)
+ * esp :
+ * eflags } outer exception info
+ * cs }
+ * eip }
+ * ---------------- <- edi (copy dest)
+ * eax : outer eax if it hasn't been restored
+ * ----------------
+ * eflags } nested exception info
+ * cs } (no ss/esp because we're nested
+ * eip } from the same ring)
+ * orig_eax }<- esi (copy src)
+ * - - - - - - - -
+ * fs }
+ * es }
+ * ds } SAVE_ALL state
+ * eax }
+ * : :
+ * ebx }<- esp
+ * ----------------
+ *
+ * In order to deliver the nested exception properly, we need to shift
+ * everything from the return addr up to the error code so it sits
+ * just under the outer exception info. This means that when we
+ * handle the exception, we do it in the context of the outer
+ * exception rather than starting a new one.
+ *
+ * The only caveat is that if the outer eax hasn't been restored yet
+ * (ie, it's still on stack), we need to insert its value into the
+ * SAVE_ALL state before going on, since it's usermode state which we
+ * eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+ /*
+ * Paranoia: Make sure we're really coming from kernel space.
+ * One could imagine a case where userspace jumps into the
+ * critical range address, but just before the CPU delivers a
+ * GP, it decides to deliver an interrupt instead. Unlikely?
+ * Definitely. Easy to avoid? Yes. The Intel documents
+ * explicitly say that the reported EIP for a bad jump is the
+ * jump instruction itself, not the destination, but some
+ * virtual environments get this wrong.
+ */
+ movl PT_CS(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+ cmpl $USER_RPL, %ecx
+ je 2f
+
+ lea PT_ORIG_EAX(%esp), %esi
+ lea PT_EFLAGS(%esp), %edi
+
+ /*
+ * If eip is before iret_restore_end then stack
+ * hasn't been restored yet.
+ */
+ cmp $iret_restore_end, %eax
+ jae 1f
+
+ movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
+ movl %eax, PT_EAX(%esp)
+
+ lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
+
+ /* set up the copy */
+1: std
+ mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
+ rep movsl
+ cld
+
+ lea 4(%edi), %esp /* point esp to new frame */
+2: jmp xen_do_upcall
+
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 00000000000..53adefda427
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,159 @@
+/*
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in pda) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
+ */
+
+#include <asm/errno.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-asm.h"
+
+ENTRY(xen_adjust_exception_frame)
+ mov 8+0(%rsp), %rcx
+ mov 8+8(%rsp), %r11
+ ret $16
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+ * Xen64 iret frame:
+ *
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip <-- standard iret frame
+ *
+ * flags
+ *
+ * rcx }
+ * r11 }<-- pushed by hypercall page
+ * rsp->rax }
+ */
+ENTRY(xen_iret)
+ pushq $0
+1: jmp hypercall_iret
+ENDPATCH(xen_iret)
+RELOC(xen_iret, 1b+1)
+
+/*
+ * sysexit is not used for 64-bit processes, so it's only ever used to
+ * return to 32-bit compat userspace.
+ */
+ENTRY(xen_sysexit)
+ pushq $__USER32_DS
+ pushq %rcx
+ pushq $X86_EFLAGS_IF
+ pushq $__USER32_CS
+ pushq %rdx
+
+ pushq $0
+1: jmp hypercall_iret
+ENDPATCH(xen_sysexit)
+RELOC(xen_sysexit, 1b+1)
+
+ENTRY(xen_sysret64)
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
+ movq %rsp, PER_CPU_VAR(old_rsp)
+ movq PER_CPU_VAR(kernel_stack), %rsp
+
+ pushq $__USER_DS
+ pushq PER_CPU_VAR(old_rsp)
+ pushq %r11
+ pushq $__USER_CS
+ pushq %rcx
+
+ pushq $VGCF_in_syscall
+1: jmp hypercall_iret
+ENDPATCH(xen_sysret64)
+RELOC(xen_sysret64, 1b+1)
+
+ENTRY(xen_sysret32)
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
+ movq %rsp, PER_CPU_VAR(old_rsp)
+ movq PER_CPU_VAR(kernel_stack), %rsp
+
+ pushq $__USER32_DS
+ pushq PER_CPU_VAR(old_rsp)
+ pushq %r11
+ pushq $__USER32_CS
+ pushq %rcx
+
+ pushq $0
+1: jmp hypercall_iret
+ENDPATCH(xen_sysret32)
+RELOC(xen_sysret32, 1b+1)
+
+/*
+ * Xen handles syscall callbacks much like ordinary exceptions, which
+ * means we have:
+ * - kernel gs
+ * - kernel rsp
+ * - an iret-like stack frame on the stack (including rcx and r11):
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip
+ * r11
+ * rsp->rcx
+ *
+ * In all the entrypoints, we undo all that to make it look like a
+ * CPU-generated syscall/sysenter and jump to the normal entrypoint.
+ */
+
+.macro undo_xen_syscall
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
+.endm
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+ undo_xen_syscall
+ jmp system_call_after_swapgs
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+ undo_xen_syscall
+ jmp ia32_cstar_target
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+ undo_xen_syscall
+ jmp ia32_sysenter_target
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+ lea 16(%rsp), %rsp /* strip %rcx, %r11 */
+ mov $-ENOSYS, %rax
+ pushq $0
+ jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 288d587ce73..485b6958554 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,36 +5,126 @@
#include <linux/elfnote.h>
#include <linux/init.h>
+
#include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page_types.h>
+
#include <xen/interface/elfnote.h>
+#include <xen/interface/features.h>
+#include <asm/xen/interface.h>
+
+#ifdef CONFIG_XEN_PVH
+#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
+/* Note the lack of 'hvm_callback_vector'. Older hypervisor will
+ * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
+ * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
+ */
+#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
+ (1 << XENFEAT_auto_translated_physmap) | \
+ (1 << XENFEAT_supervisor_mode_kernel) | \
+ (1 << XENFEAT_hvm_callback_vector))
+/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that
+ * up regardless whether this CONFIG option is enabled or not, but it
+ * clarifies what the right flags need to be.
+ */
+#else
+#define PVH_FEATURES_STR ""
+#define PVH_FEATURES (0)
+#endif
__INIT
ENTRY(startup_xen)
- movl %esi,xen_start_info
cld
- movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+ mov %esi,xen_start_info
+ mov $init_thread_union+THREAD_SIZE,%esp
+#else
+ mov %rsi,xen_start_info
+ mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
jmp xen_start_kernel
__FINIT
-.pushsection .bss.page_aligned
- .align PAGE_SIZE_asm
+.pushsection .text
+ .balign PAGE_SIZE
ENTRY(hypercall_page)
- .skip 0x1000
+#define NEXT_HYPERCALL(x) \
+ ENTRY(xen_hypercall_##x) \
+ .skip 32
+
+NEXT_HYPERCALL(set_trap_table)
+NEXT_HYPERCALL(mmu_update)
+NEXT_HYPERCALL(set_gdt)
+NEXT_HYPERCALL(stack_switch)
+NEXT_HYPERCALL(set_callbacks)
+NEXT_HYPERCALL(fpu_taskswitch)
+NEXT_HYPERCALL(sched_op_compat)
+NEXT_HYPERCALL(platform_op)
+NEXT_HYPERCALL(set_debugreg)
+NEXT_HYPERCALL(get_debugreg)
+NEXT_HYPERCALL(update_descriptor)
+NEXT_HYPERCALL(ni)
+NEXT_HYPERCALL(memory_op)
+NEXT_HYPERCALL(multicall)
+NEXT_HYPERCALL(update_va_mapping)
+NEXT_HYPERCALL(set_timer_op)
+NEXT_HYPERCALL(event_channel_op_compat)
+NEXT_HYPERCALL(xen_version)
+NEXT_HYPERCALL(console_io)
+NEXT_HYPERCALL(physdev_op_compat)
+NEXT_HYPERCALL(grant_table_op)
+NEXT_HYPERCALL(vm_assist)
+NEXT_HYPERCALL(update_va_mapping_otherdomain)
+NEXT_HYPERCALL(iret)
+NEXT_HYPERCALL(vcpu_op)
+NEXT_HYPERCALL(set_segment_base)
+NEXT_HYPERCALL(mmuext_op)
+NEXT_HYPERCALL(xsm_op)
+NEXT_HYPERCALL(nmi_op)
+NEXT_HYPERCALL(sched_op)
+NEXT_HYPERCALL(callback_op)
+NEXT_HYPERCALL(xenoprof_op)
+NEXT_HYPERCALL(event_channel_op)
+NEXT_HYPERCALL(physdev_op)
+NEXT_HYPERCALL(hvm_op)
+NEXT_HYPERCALL(sysctl)
+NEXT_HYPERCALL(domctl)
+NEXT_HYPERCALL(kexec_op)
+NEXT_HYPERCALL(tmem_op) /* 38 */
+ENTRY(xen_hypercall_rsvr)
+ .skip 320
+NEXT_HYPERCALL(mca) /* 48 */
+NEXT_HYPERCALL(arch_1)
+NEXT_HYPERCALL(arch_2)
+NEXT_HYPERCALL(arch_3)
+NEXT_HYPERCALL(arch_4)
+NEXT_HYPERCALL(arch_5)
+NEXT_HYPERCALL(arch_6)
+ .balign PAGE_SIZE
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
- ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
- ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
- ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
- ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
-#ifdef CONFIG_X86_PAE
- ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
+#ifdef CONFIG_X86_32
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
- ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
+ ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
+ (1 << XENFEAT_writable_page_tables) |
+ (1 << XENFEAT_dom0))
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
+ .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b02a909bfd4..97d87659f77 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,66 +2,129 @@
#define XEN_OPS_H
#include <linux/init.h>
+#include <linux/clocksource.h>
+#include <linux/irqreturn.h>
+#include <xen/xen-ops.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+extern void *xen_initial_gdt;
+
+struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
-DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
extern struct start_info *xen_start_info;
+extern struct shared_info xen_dummy_shared_info;
extern struct shared_info *HYPERVISOR_shared_info;
-char * __init xen_memory_setup(void);
-void __init xen_arch_setup(void);
-void __init xen_init_IRQ(void);
+void xen_setup_mfn_list_list(void);
+void xen_setup_shared_info(void);
+void xen_build_mfn_list_list(void);
+void xen_setup_machphys_mapping(void);
+void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_reserve_top(void);
+extern unsigned long xen_max_p2m_pfn;
-void xen_setup_timer(int cpu);
-void xen_setup_cpu_clockevents(void);
-unsigned long xen_cpu_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
+void xen_mm_pin_all(void);
+void xen_mm_unpin_all(void);
+void xen_set_pat(u64);
-bool xen_vcpu_stolen(int vcpu);
+char * __init xen_memory_setup(void);
+char * xen_auto_xlated_memory_setup(void);
+void __init xen_arch_setup(void);
+void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
+void xen_vcpu_restore(void);
-void xen_mark_init_mm_pinned(void);
+void xen_callback_vector(void);
+void xen_hvm_init_shared_info(void);
+void xen_unplug_emulated_devices(void);
-void __init xen_fill_possible_map(void);
+void __init xen_build_dynamic_phys_to_machine(void);
+unsigned long __init xen_revector_p2m_tree(void);
-void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
+void xen_init_irq_ops(void);
+void xen_setup_timer(int cpu);
+void xen_setup_runstate_info(int cpu);
+void xen_teardown_timer(int cpu);
+cycle_t xen_clocksource_read(void);
+void xen_setup_cpu_clockevents(void);
+void __init xen_init_time_ops(void);
+void __init xen_hvm_init_time_ops(void);
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
- int wait);
-int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
- int nonatomic, int wait);
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
-int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
- void *info, int wait);
+bool xen_vcpu_stolen(int vcpu);
+void xen_setup_vcpu_info_placement(void);
+
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
+
+extern cpumask_var_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
+#endif
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init xen_init_spinlocks(void);
+void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
+#else
+static inline void xen_init_spinlocks(void)
+{
+}
+static inline void xen_init_lock_cpu(int cpu)
+{
+}
+static inline void xen_uninit_lock_cpu(int cpu)
+{
+}
+#endif
+
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+void __init xen_init_apic(void);
+#else
+static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
+ size_t size)
+{
+}
+static inline void __init xen_init_apic(void)
+{
+}
+#endif
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \
- ret name(__VA_ARGS__); \
- extern char name##_end[]; \
- extern char name##_reloc[] \
+ __visible ret name(__VA_ARGS__); \
+ extern char name##_end[] __visible; \
+ extern char name##_reloc[] __visible
DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
-void xen_iret_direct(void);
+/* These are not functions, and cannot be called normally */
+__visible void xen_iret(void);
+__visible void xen_sysexit(void);
+__visible void xen_sysret32(void);
+__visible void xen_sysret64(void);
+__visible void xen_adjust_exception_frame(void);
+
+extern int xen_panic_handler_init(void);
+
+void xen_pvh_secondary_vcpu_init(int cpu);
#endif /* XEN_OPS_H */