aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/xen
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig36
-rw-r--r--arch/x86/xen/Makefile18
-rw-r--r--arch/x86/xen/apic.c34
-rw-r--r--arch/x86/xen/debugfs.c104
-rw-r--r--arch/x86/xen/debugfs.h4
-rw-r--r--arch/x86/xen/enlighten.c1842
-rw-r--r--arch/x86/xen/grant-table.c188
-rw-r--r--arch/x86/xen/irq.c83
-rw-r--r--arch/x86/xen/mmu.c2178
-rw-r--r--arch/x86/xen/mmu.h43
-rw-r--r--arch/x86/xen/multicalls.c190
-rw-r--r--arch/x86/xen/multicalls.h10
-rw-r--r--arch/x86/xen/p2m.c1340
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c109
-rw-r--r--arch/x86/xen/platform-pci-unplug.c217
-rw-r--r--arch/x86/xen/setup.c531
-rw-r--r--arch/x86/xen/smp.c536
-rw-r--r--arch/x86/xen/smp.h11
-rw-r--r--arch/x86/xen/spinlock.c448
-rw-r--r--arch/x86/xen/suspend.c54
-rw-r--r--arch/x86/xen/time.c266
-rw-r--r--arch/x86/xen/trace.c62
-rw-r--r--arch/x86/xen/vga.c74
-rw-r--r--arch/x86/xen/xen-asm.S142
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S366
-rw-r--r--arch/x86/xen/xen-asm_64.S256
-rw-r--r--arch/x86/xen/xen-head.S83
-rw-r--r--arch/x86/xen/xen-ops.h84
29 files changed, 6796 insertions, 2525 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 87b9ab16642..e88fda867a3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -4,29 +4,38 @@
config XEN
bool "Xen guest support"
- select PARAVIRT
+ depends on PARAVIRT
select PARAVIRT_CLOCK
- depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
- depends on X86_CMPXCHG && X86_TSC
+ select XEN_HAVE_PVMMU
+ depends on X86_64 || (X86_32 && X86_PAE)
+ depends on X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
+config XEN_DOM0
+ def_bool y
+ depends on XEN && PCI_XEN && SWIOTLB_XEN
+ depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
+
+config XEN_PVHVM
+ def_bool y
+ depends on XEN && PCI && X86_LOCAL_APIC
+
config XEN_MAX_DOMAIN_MEMORY
- int "Maximum allowed size of a domain in gigabytes"
- default 8 if X86_32
- default 32 if X86_64
+ int
+ default 500 if X86_64
+ default 64 if X86_32
depends on XEN
help
- The pseudo-physical to machine address array is sized
- according to the maximum possible memory size of a Xen
- domain. This array uses 1 page per gigabyte, so there's no
- need to be too stingy here.
+ This only affects the sizing of some bss arrays, the unused
+ portions of which are freed.
config XEN_SAVE_RESTORE
bool
- depends on XEN && PM
+ depends on XEN
+ select HIBERNATE_CALLBACKS
default y
config XEN_DEBUG_FS
@@ -36,3 +45,8 @@ config XEN_DEBUG_FS
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
+
+config XEN_PVH
+ bool "Support for running as a PVH guest"
+ depends on X86_64 && XEN && XEN_PVHVM
+ def_bool n
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836..96ab2c09cb6 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -5,8 +5,20 @@ CFLAGS_REMOVE_time.o = -pg
CFLAGS_REMOVE_irq.o = -pg
endif
+# Make sure early boot has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_enlighten.o := $(nostackp)
+CFLAGS_mmu.o := $(nostackp)
+
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
- time.o xen-asm_$(BITS).o grant-table.o suspend.o
+ time.o xen-asm.o xen-asm_$(BITS).o \
+ grant-table.o suspend.o platform-pci-unplug.o \
+ p2m.o
+
+obj-$(CONFIG_EVENT_TRACING) += trace.o
-obj-$(CONFIG_SMP) += smp.o spinlock.o
-obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
+obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
+obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
new file mode 100644
index 00000000000..7005ced5d1a
--- /dev/null
+++ b/arch/x86/xen/apic.c
@@ -0,0 +1,34 @@
+#include <linux/init.h>
+
+#include <asm/x86_init.h>
+#include <asm/apic.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/interface/physdev.h>
+#include "xen-ops.h"
+
+static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
+{
+ struct physdev_apic apic_op;
+ int ret;
+
+ apic_op.apic_physbase = mpc_ioapic_addr(apic);
+ apic_op.reg = reg;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+ if (!ret)
+ return apic_op.value;
+
+ /* fallback to return an emulated IO_APIC values */
+ if (reg == 0x1)
+ return 0x00170020;
+ else if (reg == 0x0)
+ return apic << 24;
+
+ return 0xfd;
+}
+
+void __init xen_init_apic(void)
+{
+ x86_io_apic_ops.read = xen_io_apic_read;
+}
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index b53225d2cac..c8377fb26cd 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -1,5 +1,6 @@
#include <linux/init.h>
#include <linux/debugfs.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include "debugfs.h"
@@ -18,106 +19,3 @@ struct dentry * __init xen_init_debugfs(void)
return d_xen_debug;
}
-struct array_data
-{
- void *array;
- unsigned elements;
-};
-
-static int u32_array_open(struct inode *inode, struct file *file)
-{
- file->private_data = NULL;
- return nonseekable_open(inode, file);
-}
-
-static size_t format_array(char *buf, size_t bufsize, const char *fmt,
- u32 *array, unsigned array_size)
-{
- size_t ret = 0;
- unsigned i;
-
- for(i = 0; i < array_size; i++) {
- size_t len;
-
- len = snprintf(buf, bufsize, fmt, array[i]);
- len++; /* ' ' or '\n' */
- ret += len;
-
- if (buf) {
- buf += len;
- bufsize -= len;
- buf[-1] = (i == array_size-1) ? '\n' : ' ';
- }
- }
-
- ret++; /* \0 */
- if (buf)
- *buf = '\0';
-
- return ret;
-}
-
-static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
-{
- size_t len = format_array(NULL, 0, fmt, array, array_size);
- char *ret;
-
- ret = kmalloc(len, GFP_KERNEL);
- if (ret == NULL)
- return NULL;
-
- format_array(ret, len, fmt, array, array_size);
- return ret;
-}
-
-static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
- loff_t *ppos)
-{
- struct inode *inode = file->f_path.dentry->d_inode;
- struct array_data *data = inode->i_private;
- size_t size;
-
- if (*ppos == 0) {
- if (file->private_data) {
- kfree(file->private_data);
- file->private_data = NULL;
- }
-
- file->private_data = format_array_alloc("%u", data->array, data->elements);
- }
-
- size = 0;
- if (file->private_data)
- size = strlen(file->private_data);
-
- return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
-}
-
-static int xen_array_release(struct inode *inode, struct file *file)
-{
- kfree(file->private_data);
-
- return 0;
-}
-
-static struct file_operations u32_array_fops = {
- .owner = THIS_MODULE,
- .open = u32_array_open,
- .release= xen_array_release,
- .read = u32_array_read,
-};
-
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
- struct dentry *parent,
- u32 *array, unsigned elements)
-{
- struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
-
- if (data == NULL)
- return NULL;
-
- data->array = array;
- data->elements = elements;
-
- return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
-}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index e2813208483..12ebf3325c7 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,8 +3,4 @@
struct dentry * __init xen_init_debugfs(void);
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
- struct dentry *parent,
- u32 *array, unsigned elements);
-
#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b58e9633814..ffb101e4573 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
+#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
@@ -20,86 +21,119 @@
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
+#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/edd.h>
+#include <xen/xen.h>
+#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
+#include <xen/hvm.h>
#include <xen/hvc-console.h>
+#include <xen/acpi.h>
#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
+#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
+#include <asm/proto.h>
#include <asm/msr-index.h>
+#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
+#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
+#include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
+#include <asm/mwait.h>
+#include <asm/pci_x86.h>
+#include <asm/pat.h>
+
+#ifdef CONFIG_ACPI
+#include <linux/acpi.h>
+#include <asm/acpi.h>
+#include <acpi/pdc_intel.h>
+#include <acpi/processor.h>
+#include <xen/interface/platform.h>
+#endif
#include "xen-ops.h"
#include "mmu.h"
+#include "smp.h"
#include "multicalls.h"
EXPORT_SYMBOL_GPL(hypercall_page);
+/*
+ * Pointer to the xen_vcpu_info structure or
+ * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
+ * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
+ * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
+ * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
+ * acknowledge pending events.
+ * Also more subtly it is used by the patched version of irq enable/disable
+ * e.g. xen_irq_enable_direct and xen_iret in PV mode.
+ *
+ * The desire to be able to do those mask/unmask operations as a single
+ * instruction by using the per-cpu offset held in %gs is the real reason
+ * vcpu info is in a per-cpu pointer and the original reason for this
+ * hypercall.
+ *
+ */
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-
-enum xen_domain_type xen_domain_type = XEN_NATIVE;
-EXPORT_SYMBOL_GPL(xen_domain_type);
/*
- * Identity map, in addition to plain kernel map. This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
+ * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
+ * hypercall. This can be used both in PV and PVHVM mode. The structure
+ * overrides the default per_cpu(xen_vcpu, cpu) value.
*/
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-#ifdef CONFIG_X86_64
-/* l3 pud for userspace vsyscall mapping */
-static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
-#endif /* CONFIG_X86_64 */
+enum xen_domain_type xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
-/*
- * Note about cr3 (pagetable base) values:
- *
- * xen_cr3 contains the current logical cr3 value; it contains the
- * last set cr3. This may not be the current effective cr3, because
- * its update may be being lazily deferred. However, a vcpu looking
- * at its own cr3 can use this value knowing that it everything will
- * be self-consistent.
- *
- * xen_current_cr3 contains the actual vcpu cr3; it is set once the
- * hypercall to set the vcpu cr3 is complete (so it may be a little
- * out of date, but it will never be set early). If one vcpu is
- * looking at another vcpu's cr3 value, it should use this variable.
- */
-DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned long machine_to_phys_nr;
+EXPORT_SYMBOL(machine_to_phys_nr);
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
struct shared_info xen_dummy_shared_info;
+void *xen_initial_gdt;
+
+RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
/*
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
*/
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
/*
* Flag to determine whether vcpu info placement is available on all
@@ -114,14 +148,28 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*
* 0: not available, 1: available
*/
-static int have_vcpu_info_placement =
-#ifdef CONFIG_X86_32
- 1
-#else
- 0
-#endif
- ;
+static int have_vcpu_info_placement = 1;
+
+struct tls_descs {
+ struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed. Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+static void clamp_max_cpus(void)
+{
+#ifdef CONFIG_SMP
+ if (setup_max_cpus > MAX_VIRT_CPUS)
+ setup_max_cpus = MAX_VIRT_CPUS;
+#endif
+}
static void xen_vcpu_setup(int cpu)
{
@@ -130,34 +178,53 @@ static void xen_vcpu_setup(int cpu)
struct vcpu_info *vcpup;
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
- if (!have_vcpu_info_placement)
- return; /* already tested, not available */
+ /*
+ * This path is called twice on PVHVM - first during bootup via
+ * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
+ * hotplugged: cpu_up -> xen_hvm_cpu_notify.
+ * As we can only do the VCPUOP_register_vcpu_info once lets
+ * not over-write its result.
+ *
+ * For PV it is called during restore (xen_vcpu_restore) and bootup
+ * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
+ * use this function.
+ */
+ if (xen_hvm_domain()) {
+ if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
+ return;
+ }
+ if (cpu < MAX_VIRT_CPUS)
+ per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+ if (!have_vcpu_info_placement) {
+ if (cpu >= MAX_VIRT_CPUS)
+ clamp_max_cpus();
+ return;
+ }
vcpup = &per_cpu(xen_vcpu_info, cpu);
-
- info.mfn = virt_to_mfn(vcpup);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup);
- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
- cpu, vcpup, info.mfn, info.offset);
-
/* Check to see if the hypervisor will put the vcpu_info
structure where we want it, which allows direct access via
- a percpu-variable. */
+ a percpu-variable.
+ N.B. This hypercall can _only_ be called once per CPU. Subsequent
+ calls will error out with -EINVAL. This is due to the fact that
+ hypervisor has no unregister variant and this hypercall does not
+ allow to over-write info.mfn and info.offset.
+ */
err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
if (err) {
printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
have_vcpu_info_placement = 0;
+ clamp_max_cpus();
} else {
/* This cpu is using the registered vcpu info, even if
later ones fail to. */
per_cpu(xen_vcpu, cpu) = vcpup;
-
- printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
- cpu, vcpup);
}
}
@@ -168,24 +235,24 @@ static void xen_vcpu_setup(int cpu)
*/
void xen_vcpu_restore(void)
{
- if (have_vcpu_info_placement) {
- int cpu;
+ int cpu;
- for_each_online_cpu(cpu) {
- bool other_cpu = (cpu != smp_processor_id());
+ for_each_possible_cpu(cpu) {
+ bool other_cpu = (cpu != smp_processor_id());
+ bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL);
- if (other_cpu &&
- HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
- BUG();
+ if (other_cpu && is_up &&
+ HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
+ BUG();
- xen_vcpu_setup(cpu);
+ xen_setup_runstate_info(cpu);
- if (other_cpu &&
- HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
- BUG();
- }
+ if (have_vcpu_info_placement)
+ xen_vcpu_setup(cpu);
- BUG_ON(!have_vcpu_info_placement);
+ if (other_cpu && is_up &&
+ HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+ BUG();
}
}
@@ -195,28 +262,75 @@ static void __init xen_banner(void)
struct xen_extraversion extra;
HYPERVISOR_xen_version(XENVER_extraversion, &extra);
- printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
- pv_info.name);
+ pr_info("Booting paravirtualized kernel %son %s\n",
+ xen_feature(XENFEAT_auto_translated_physmap) ?
+ "with PVH extensions " : "", pv_info.name);
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
+/* Check if running on Xen version (major, minor) or later */
+bool
+xen_running_on_version_or_later(unsigned int major, unsigned int minor)
+{
+ unsigned int version;
+
+ if (!xen_domain())
+ return false;
+
+ version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
+ ((version >> 16) > major))
+ return true;
+ return false;
+}
+
+#define CPUID_THERM_POWER_LEAF 6
+#define APERFMPERF_PRESENT 0
+
+static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+
+static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
+static __read_mostly unsigned int cpuid_leaf5_ecx_val;
+static __read_mostly unsigned int cpuid_leaf5_edx_val;
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
+ unsigned maskebx = ~0;
+ unsigned maskecx = ~0;
unsigned maskedx = ~0;
-
+ unsigned setecx = 0;
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
- if (*ax == 1)
- maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
- (1 << X86_FEATURE_ACPI) | /* disable ACPI */
- (1 << X86_FEATURE_MCE) | /* disable MCE */
- (1 << X86_FEATURE_MCA) | /* disable MCA */
- (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+ switch (*ax) {
+ case 1:
+ maskecx = cpuid_leaf1_ecx_mask;
+ setecx = cpuid_leaf1_ecx_set_mask;
+ maskedx = cpuid_leaf1_edx_mask;
+ break;
+
+ case CPUID_MWAIT_LEAF:
+ /* Synthesize the values.. */
+ *ax = 0;
+ *bx = 0;
+ *cx = cpuid_leaf5_ecx_val;
+ *dx = cpuid_leaf5_edx_val;
+ return;
+
+ case CPUID_THERM_POWER_LEAF:
+ /* Disabling APERFMPERF for kernel usage */
+ maskecx = ~(1 << APERFMPERF_PRESENT);
+ break;
+
+ case 0xb:
+ /* Suppress extended topology stuff */
+ maskebx = 0;
+ break;
+ }
asm(XEN_EMULATE_PREFIX "cpuid"
: "=a" (*ax),
@@ -224,7 +338,113 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
"=c" (*cx),
"=d" (*dx)
: "0" (*ax), "2" (*cx));
+
+ *bx &= maskebx;
+ *cx &= maskecx;
+ *cx |= setecx;
*dx &= maskedx;
+
+}
+
+static bool __init xen_check_mwait(void)
+{
+#ifdef CONFIG_ACPI
+ struct xen_platform_op op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .u.set_pminfo.id = -1,
+ .u.set_pminfo.type = XEN_PM_PDC,
+ };
+ uint32_t buf[3];
+ unsigned int ax, bx, cx, dx;
+ unsigned int mwait_mask;
+
+ /* We need to determine whether it is OK to expose the MWAIT
+ * capability to the kernel to harvest deeper than C3 states from ACPI
+ * _CST using the processor_harvest_xen.c module. For this to work, we
+ * need to gather the MWAIT_LEAF values (which the cstate.c code
+ * checks against). The hypervisor won't expose the MWAIT flag because
+ * it would break backwards compatibility; so we will find out directly
+ * from the hardware and hypercall.
+ */
+ if (!xen_initial_domain())
+ return false;
+
+ /*
+ * When running under platform earlier than Xen4.2, do not expose
+ * mwait, to avoid the risk of loading native acpi pad driver
+ */
+ if (!xen_running_on_version_or_later(4, 2))
+ return false;
+
+ ax = 1;
+ cx = 0;
+
+ native_cpuid(&ax, &bx, &cx, &dx);
+
+ mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
+ (1 << (X86_FEATURE_MWAIT % 32));
+
+ if ((cx & mwait_mask) != mwait_mask)
+ return false;
+
+ /* We need to emulate the MWAIT_LEAF and for that we need both
+ * ecx and edx. The hypercall provides only partial information.
+ */
+
+ ax = CPUID_MWAIT_LEAF;
+ bx = 0;
+ cx = 0;
+ dx = 0;
+
+ native_cpuid(&ax, &bx, &cx, &dx);
+
+ /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+ * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
+ */
+ buf[0] = ACPI_PDC_REVISION_ID;
+ buf[1] = 1;
+ buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+
+ set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
+
+ if ((HYPERVISOR_dom0_op(&op) == 0) &&
+ (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+ cpuid_leaf5_ecx_val = cx;
+ cpuid_leaf5_edx_val = dx;
+ }
+ return true;
+#else
+ return false;
+#endif
+}
+static void __init xen_init_cpuid_mask(void)
+{
+ unsigned int ax, bx, cx, dx;
+ unsigned int xsave_mask;
+
+ cpuid_leaf1_edx_mask =
+ ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
+ (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+
+ if (!xen_initial_domain())
+ cpuid_leaf1_edx_mask &=
+ ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
+
+ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));
+
+ ax = 1;
+ cx = 0;
+ cpuid(1, &ax, &bx, &cx, &dx);
+
+ xsave_mask =
+ (1 << (X86_FEATURE_XSAVE % 32)) |
+ (1 << (X86_FEATURE_OSXSAVE % 32));
+
+ /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+ if ((cx & xsave_mask) != xsave_mask)
+ cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
+ if (xen_check_mwait())
+ cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
}
static void xen_set_debugreg(int reg, unsigned long val)
@@ -237,10 +457,10 @@ static unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
-static void xen_leave_lazy(void)
+static void xen_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
xen_mc_flush();
+ paravirt_end_context_switch(next);
}
static unsigned long xen_store_tr(void)
@@ -306,6 +526,8 @@ static void xen_set_ldt(const void *addr, unsigned entries)
struct mmuext_op *op;
struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+ trace_xen_cpu_set_ldt(addr, entries);
+
op = mcs.args;
op->cmd = MMUEXT_SET_LDT;
op->arg1.linear_addr = (unsigned long)addr;
@@ -318,38 +540,110 @@ static void xen_set_ldt(const void *addr, unsigned entries)
static void xen_load_gdt(const struct desc_ptr *dtr)
{
- unsigned long *frames;
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ unsigned long frames[pages];
int f;
- struct multicall_space mcs;
- /* A GDT can be up to 64k in size, which corresponds to 8192
- 8-byte entries, or 16 4k pages.. */
+ /*
+ * A GDT can be up to 64k in size, which corresponds to 8192
+ * 8-byte entries, or 16 4k pages..
+ */
BUG_ON(size > 65536);
BUG_ON(va & ~PAGE_MASK);
- mcs = xen_mc_entry(sizeof(*frames) * pages);
- frames = mcs.args;
-
for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
- frames[f] = virt_to_mfn(va);
+ int level;
+ pte_t *ptep;
+ unsigned long pfn, mfn;
+ void *virt;
+
+ /*
+ * The GDT is per-cpu and is in the percpu data area.
+ * That can be virtually mapped, so we need to do a
+ * page-walk to get the underlying MFN for the
+ * hypercall. The page can also be in the kernel's
+ * linear range, so we need to RO that mapping too.
+ */
+ ptep = lookup_address(va, &level);
+ BUG_ON(ptep == NULL);
+
+ pfn = pte_pfn(*ptep);
+ mfn = pfn_to_mfn(pfn);
+ virt = __va(PFN_PHYS(pfn));
+
+ frames[f] = mfn;
+
make_lowmem_page_readonly((void *)va);
+ make_lowmem_page_readonly(virt);
}
- MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
+ if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ BUG();
+}
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+/*
+ * load_gdt for early boot, when the gdt is only mapped once
+ */
+static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
+{
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ unsigned long frames[pages];
+ int f;
+
+ /*
+ * A GDT can be up to 64k in size, which corresponds to 8192
+ * 8-byte entries, or 16 4k pages..
+ */
+
+ BUG_ON(size > 65536);
+ BUG_ON(va & ~PAGE_MASK);
+
+ for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+ pte_t pte;
+ unsigned long pfn, mfn;
+
+ pfn = virt_to_pfn(va);
+ mfn = pfn_to_mfn(pfn);
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_RO);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+ BUG();
+
+ frames[f] = mfn;
+ }
+
+ if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+ BUG();
+}
+
+static inline bool desc_equal(const struct desc_struct *d1,
+ const struct desc_struct *d2)
+{
+ return d1->a == d2->a && d1->b == d2->b;
}
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
- struct multicall_space mc = __xen_mc_entry(0);
+ struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+ struct desc_struct *gdt;
+ xmaddr_t maddr;
+ struct multicall_space mc;
+
+ if (desc_equal(shadow, &t->tls_array[i]))
+ return;
+
+ *shadow = t->tls_array[i];
+
+ gdt = get_cpu_gdt_table(cpu);
+ maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
@@ -357,13 +651,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
/*
- * XXX sleazy hack: If we're being called in a lazy-cpu zone,
- * it means we're in a context switch, and %gs has just been
- * saved. This means we can zero it out to prevent faults on
- * exit from the hypervisor if the next process has no %gs.
- * Either way, it has been saved, and the new value will get
- * loaded properly. This will go away as soon as Xen has been
- * modified to not save/restore %gs for normal hypercalls.
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone
+ * and lazy gs handling is enabled, it means we're in a
+ * context switch, and %gs has just been saved. This means we
+ * can zero it out to prevent faults on exit from the
+ * hypervisor if the next process has no %gs. Either way, it
+ * has been saved, and the new value will get loaded properly.
+ * This will go away as soon as Xen has been modified to not
+ * save/restore %gs for normal hypercalls.
*
* On x86_64, this hack is not used for %gs, because gs points
* to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -375,7 +670,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
*/
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
- loadsegment(gs, 0);
+ lazy_load_gs(0);
#else
loadsegment(fs, 0);
#endif
@@ -404,6 +699,8 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
u64 entry = *(u64 *)ptr;
+ trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
+
preempt_disable();
xen_mc_flush();
@@ -416,16 +713,57 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
- if (val->type != 0xf && val->type != 0xe)
+ unsigned long addr;
+
+ if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
return 0;
info->vector = vector;
- info->address = gate_offset(*val);
+
+ addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+ /*
+ * Look for known traps using IST, and substitute them
+ * appropriately. The debugger ones are the only ones we care
+ * about. Xen will handle faults like double_fault,
+ * so we should never see them. Warn if
+ * there's an unexpected IST-using fault handler.
+ */
+ if (addr == (unsigned long)debug)
+ addr = (unsigned long)xen_debug;
+ else if (addr == (unsigned long)int3)
+ addr = (unsigned long)xen_int3;
+ else if (addr == (unsigned long)stack_segment)
+ addr = (unsigned long)xen_stack_segment;
+ else if (addr == (unsigned long)double_fault) {
+ /* Don't need to handle these */
+ return 0;
+#ifdef CONFIG_X86_MCE
+ } else if (addr == (unsigned long)machine_check) {
+ /*
+ * when xen hypervisor inject vMCE to guest,
+ * use native mce handler to handle it
+ */
+ ;
+#endif
+ } else if (addr == (unsigned long)nmi)
+ /*
+ * Use the native version as well.
+ */
+ ;
+ else {
+ /* Some other trap using IST? */
+ if (WARN_ON(val->ist != 0))
+ return 0;
+ }
+#endif /* CONFIG_X86_64 */
+ info->address = addr;
+
info->cs = gate_segment(*val);
info->flags = val->dpl;
/* interrupt gates clear IF */
- if (val->type == 0xe)
- info->flags |= 4;
+ if (val->type == GATE_INTERRUPT)
+ info->flags |= 1 << 2;
return 1;
}
@@ -440,10 +778,12 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
unsigned long p = (unsigned long)&dt[entrynum];
unsigned long start, end;
+ trace_xen_cpu_write_idt_entry(dt, entrynum, g);
+
preempt_disable();
- start = __get_cpu_var(idt_desc).address;
- end = start + __get_cpu_var(idt_desc).size + 1;
+ start = __this_cpu_read(idt_desc.address);
+ end = start + __this_cpu_read(idt_desc.size) + 1;
xen_mc_flush();
@@ -494,6 +834,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
+ trace_xen_cpu_load_idt(desc);
+
spin_lock(&lock);
__get_cpu_var(idt_desc) = *desc;
@@ -512,6 +854,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
const void *desc, int type)
{
+ trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
preempt_disable();
switch (type) {
@@ -521,7 +865,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
break;
default: {
- xmaddr_t maddr = virt_to_machine(&dt[entry]);
+ xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
xen_mc_flush();
if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
@@ -533,10 +877,37 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
preempt_enable();
}
+/*
+ * Version of write_gdt_entry for use at early boot-time needed to
+ * update an entry as simply as possible.
+ */
+static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+ const void *desc, int type)
+{
+ trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
+ switch (type) {
+ case DESC_LDT:
+ case DESC_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = virt_to_machine(&dt[entry]);
+
+ if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
+ dt[entry] = *(struct desc_struct *)desc;
+ }
+
+ }
+}
+
static void xen_load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
- struct multicall_space mcs = xen_mc_entry(0);
+ struct multicall_space mcs;
+
+ mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
@@ -555,9 +926,40 @@ static void xen_io_delay(void)
}
#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_set_apic_id(unsigned int x)
+{
+ WARN_ON(1);
+ return x;
+}
+static unsigned int xen_get_apic_id(unsigned long x)
+{
+ return ((x)>>24) & 0xFFu;
+}
static u32 xen_apic_read(u32 reg)
{
- return 0;
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.pcpu_info.xen_cpuid = 0,
+ };
+ int ret = 0;
+
+ /* Shouldn't need this as APIC is turned off for PV, and we only
+ * get called on the bootup processor. But just in case. */
+ if (!xen_initial_domain() || smp_processor_id())
+ return 0;
+
+ if (reg == APIC_LVR)
+ return 0x10;
+
+ if (reg != APIC_ID)
+ return 0;
+
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ return 0;
+
+ return op.u.pcpu_info.apic_id << 24;
}
static void xen_apic_write(u32 reg, u32 val)
@@ -587,110 +989,59 @@ static u32 xen_safe_apic_wait_icr_idle(void)
return 0;
}
-static struct apic_ops xen_basic_apic_ops = {
- .read = xen_apic_read,
- .write = xen_apic_write,
- .icr_read = xen_apic_icr_read,
- .icr_write = xen_apic_icr_write,
- .wait_icr_idle = xen_apic_wait_icr_idle,
- .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
-};
-
-#endif
-
-static void xen_flush_tlb(void)
+static void set_xen_basic_apic_ops(void)
{
- struct mmuext_op *op;
- struct multicall_space mcs;
-
- preempt_disable();
-
- mcs = xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ apic->read = xen_apic_read;
+ apic->write = xen_apic_write;
+ apic->icr_read = xen_apic_icr_read;
+ apic->icr_write = xen_apic_icr_write;
+ apic->wait_icr_idle = xen_apic_wait_icr_idle;
+ apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
+ apic->set_apic_id = xen_set_apic_id;
+ apic->get_apic_id = xen_get_apic_id;
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-
- preempt_enable();
+#ifdef CONFIG_SMP
+ apic->send_IPI_allbutself = xen_send_IPI_allbutself;
+ apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
+ apic->send_IPI_mask = xen_send_IPI_mask;
+ apic->send_IPI_all = xen_send_IPI_all;
+ apic->send_IPI_self = xen_send_IPI_self;
+#endif
}
-static void xen_flush_tlb_single(unsigned long addr)
-{
- struct mmuext_op *op;
- struct multicall_space mcs;
-
- preempt_disable();
-
- mcs = xen_mc_entry(sizeof(*op));
- op = mcs.args;
- op->cmd = MMUEXT_INVLPG_LOCAL;
- op->arg1.linear_addr = addr & PAGE_MASK;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-
- preempt_enable();
-}
+#endif
-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
- unsigned long va)
+static void xen_clts(void)
{
- struct {
- struct mmuext_op op;
- cpumask_t mask;
- } *args;
- cpumask_t cpumask = *cpus;
struct multicall_space mcs;
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (cpus_empty(cpumask))
- return;
-
- mcs = xen_mc_entry(sizeof(*args));
- args = mcs.args;
- args->mask = cpumask;
- args->op.arg2.vcpumask = &args->mask;
-
- if (va == TLB_FLUSH_ALL) {
- args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
- } else {
- args->op.cmd = MMUEXT_INVLPG_MULTI;
- args->op.arg1.linear_addr = va;
- }
+ mcs = xen_mc_entry(0);
- MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+ MULTI_fpu_taskswitch(mcs.mc, 0);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
}
-static void xen_clts(void)
-{
- struct multicall_space mcs;
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
- mcs = xen_mc_entry(0);
+static unsigned long xen_read_cr0(void)
+{
+ unsigned long cr0 = this_cpu_read(xen_cr0_value);
- MULTI_fpu_taskswitch(mcs.mc, 0);
+ if (unlikely(cr0 == 0)) {
+ cr0 = native_read_cr0();
+ this_cpu_write(xen_cr0_value, cr0);
+ }
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ return cr0;
}
static void xen_write_cr0(unsigned long cr0)
{
struct multicall_space mcs;
+ this_cpu_write(xen_cr0_value, cr0);
+
/* Only pay attention to cr0.TS; everything else is
ignored. */
mcs = xen_mc_entry(0);
@@ -700,21 +1051,6 @@ static void xen_write_cr0(unsigned long cr0)
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
-static void xen_write_cr2(unsigned long cr2)
-{
- x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
-}
-
-static unsigned long xen_read_cr2(void)
-{
- return x86_read_percpu(xen_vcpu)->arch.cr2;
-}
-
-static unsigned long xen_read_cr2_direct(void)
-{
- return x86_read_percpu(xen_vcpu_info.arch.cr2);
-}
-
static void xen_write_cr4(unsigned long cr4)
{
cr4 &= ~X86_CR4_PGE;
@@ -722,72 +1058,16 @@ static void xen_write_cr4(unsigned long cr4)
native_write_cr4(cr4);
}
-
-static unsigned long xen_read_cr3(void)
-{
- return x86_read_percpu(xen_cr3);
-}
-
-static void set_current_cr3(void *v)
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
{
- x86_write_percpu(xen_current_cr3, (unsigned long)v);
+ return 0;
}
-
-static void __xen_write_cr3(bool kernel, unsigned long cr3)
+static inline void xen_write_cr8(unsigned long val)
{
- struct mmuext_op *op;
- struct multicall_space mcs;
- unsigned long mfn;
-
- if (cr3)
- mfn = pfn_to_mfn(PFN_DOWN(cr3));
- else
- mfn = 0;
-
- WARN_ON(mfn == 0 && kernel);
-
- mcs = __xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
- op->arg1.mfn = mfn;
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- if (kernel) {
- x86_write_percpu(xen_cr3, cr3);
-
- /* Update xen_current_cr3 once the batch has actually
- been submitted. */
- xen_mc_callback(set_current_cr3, (void *)cr3);
- }
+ BUG_ON(val);
}
-
-static void xen_write_cr3(unsigned long cr3)
-{
- BUG_ON(preemptible());
-
- xen_mc_batch(); /* disables interrupts */
-
- /* Update while interrupts are disabled, so its atomic with
- respect to ipis */
- x86_write_percpu(xen_cr3, cr3);
-
- __xen_write_cr3(true, cr3);
-
-#ifdef CONFIG_X86_64
- {
- pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
- if (user_pgd)
- __xen_write_cr3(false, __pa(user_pgd));
- else
- __xen_write_cr3(false, 0);
- }
#endif
-
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
-}
-
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
@@ -806,7 +1086,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
set:
base = ((u64)high << 32) | low;
if (HYPERVISOR_set_segment_base(which, base) != 0)
- ret = -EFAULT;
+ ret = -EIO;
break;
#endif
@@ -822,6 +1102,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
Xen console noise. */
break;
+ case MSR_IA32_CR_PAT:
+ if (smp_processor_id() == 0)
+ xen_set_pat(((u64)high << 32) | low);
+ break;
+
default:
ret = native_write_msr_safe(msr, low, high);
}
@@ -829,185 +1114,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
return ret;
}
-/* Early in boot, while setting up the initial pagetable, assume
- everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
-{
-#ifdef CONFIG_FLATMEM
- BUG_ON(mem_map); /* should only be used early */
-#endif
- make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
-
-/* Early release_pte assumes that all pts are pinned, since there's
- only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
-{
- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-}
-
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
- struct mmuext_op op;
- op.cmd = cmd;
- op.arg1.mfn = pfn_to_mfn(pfn);
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
-}
-
-/* This needs to make sure the new pte page is pinned iff its being
- attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
-{
- struct page *page = pfn_to_page(pfn);
-
- if (PagePinned(virt_to_page(mm->pgd))) {
- SetPagePinned(page);
-
- vm_unmap_aliases();
- if (!PageHighMem(page)) {
- make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
- if (level == PT_PTE && USE_SPLIT_PTLOCKS)
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- } else {
- /* make sure there are no stray mappings of
- this page */
- kmap_flush_unused();
- }
- }
-}
-
-static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
-{
- xen_alloc_ptpage(mm, pfn, PT_PTE);
-}
-
-static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- xen_alloc_ptpage(mm, pfn, PT_PMD);
-}
-
-static int xen_pgd_alloc(struct mm_struct *mm)
-{
- pgd_t *pgd = mm->pgd;
- int ret = 0;
-
- BUG_ON(PagePinned(virt_to_page(pgd)));
-
-#ifdef CONFIG_X86_64
- {
- struct page *page = virt_to_page(pgd);
- pgd_t *user_pgd;
-
- BUG_ON(page->private != 0);
-
- ret = -ENOMEM;
-
- user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- page->private = (unsigned long)user_pgd;
-
- if (user_pgd != NULL) {
- user_pgd[pgd_index(VSYSCALL_START)] =
- __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
- ret = 0;
- }
-
- BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
- }
-#endif
-
- return ret;
-}
-
-static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
- pgd_t *user_pgd = xen_get_user_pgd(pgd);
-
- if (user_pgd)
- free_page((unsigned long)user_pgd);
-#endif
-}
-
-/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
-{
- struct page *page = pfn_to_page(pfn);
-
- if (PagePinned(page)) {
- if (!PageHighMem(page)) {
- if (level == PT_PTE && USE_SPLIT_PTLOCKS)
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
- }
- ClearPagePinned(page);
- }
-}
-
-static void xen_release_pte(unsigned long pfn)
-{
- xen_release_ptpage(pfn, PT_PTE);
-}
-
-static void xen_release_pmd(unsigned long pfn)
-{
- xen_release_ptpage(pfn, PT_PMD);
-}
-
-#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
-{
- xen_alloc_ptpage(mm, pfn, PT_PUD);
-}
-
-static void xen_release_pud(unsigned long pfn)
-{
- xen_release_ptpage(pfn, PT_PUD);
-}
-#endif
-
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- pgprot_t prot = PAGE_KERNEL;
-
- if (PagePinned(page))
- prot = PAGE_KERNEL_RO;
-
- if (0 && PageHighMem(page))
- printk("mapping highpte %lx type %d prot %s\n",
- page_to_pfn(page), type,
- (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
- return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
-#ifdef CONFIG_X86_32
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
- /* If there's an existing pte, then don't allow _PAGE_RW to be set */
- if (pte_val_ma(*ptep) & _PAGE_PRESENT)
- pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
- pte_val_ma(pte));
-
- return pte;
-}
-
-/* Init-time set_pte while constructing initial pagetables, which
- doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
- pte = mask_rw_pte(ptep, pte);
-
- xen_set_pte(ptep, pte);
-}
-#endif
-
-static __init void xen_pagetable_setup_start(pgd_t *base)
-{
-}
-
void xen_setup_shared_info(void)
{
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1028,38 +1134,7 @@ void xen_setup_shared_info(void)
xen_setup_mfn_list_list();
}
-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
- xen_setup_shared_info();
-}
-
-static __init void xen_post_allocator_init(void)
-{
- pv_mmu_ops.set_pte = xen_set_pte;
- pv_mmu_ops.set_pmd = xen_set_pmd;
- pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = xen_set_pgd;
-#endif
-
- /* This will work as long as patching hasn't happened yet
- (which it hasn't) */
- pv_mmu_ops.alloc_pte = xen_alloc_pte;
- pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
- pv_mmu_ops.release_pte = xen_release_pte;
- pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.alloc_pud = xen_alloc_pud;
- pv_mmu_ops.release_pud = xen_release_pud;
-#endif
-
-#ifdef CONFIG_X86_64
- SetPagePinned(virt_to_page(level3_user_vsyscall));
-#endif
- xen_mark_init_mm_pinned();
-}
-
-/* This is called once we have the cpu_possible_map */
+/* This is called once we have the cpu_possible_mask */
void xen_setup_vcpu_info_placement(void)
{
int cpu;
@@ -1068,14 +1143,13 @@ void xen_setup_vcpu_info_placement(void)
xen_vcpu_setup(cpu);
/* xen_vcpu_setup managed to place the vcpu_info within the
- percpu area for all cpus, so make use of it */
- if (have_vcpu_info_placement) {
- printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
- pv_irq_ops.save_fl = xen_save_fl_direct;
- pv_irq_ops.restore_fl = xen_restore_fl_direct;
- pv_irq_ops.irq_disable = xen_irq_disable_direct;
- pv_irq_ops.irq_enable = xen_irq_enable_direct;
+ * percpu area for all cpus, so make use of it. Note that for
+ * PVH we want to use native IRQ mechanism. */
+ if (have_vcpu_info_placement && !xen_pvh_domain()) {
+ pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+ pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+ pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
}
}
@@ -1133,75 +1207,22 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret;
}
-static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
-{
- pte_t pte;
-
- phys >>= PAGE_SHIFT;
-
- switch (idx) {
- case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-#ifdef CONFIG_X86_F00F_BUG
- case FIX_F00F_IDT:
-#endif
-#ifdef CONFIG_X86_32
- case FIX_WP_TEST:
- case FIX_VDSO:
-# ifdef CONFIG_HIGHMEM
- case FIX_KMAP_BEGIN ... FIX_KMAP_END:
-# endif
-#else
- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- case FIX_APIC_BASE: /* maps dummy local APIC */
-#endif
- pte = pfn_pte(phys, prot);
- break;
-
- default:
- pte = mfn_pte(phys, prot);
- break;
- }
-
- __native_set_fixmap(idx, pte);
+static const struct pv_info xen_info __initconst = {
+ .paravirt_enabled = 1,
+ .shared_kernel_pmd = 0,
#ifdef CONFIG_X86_64
- /* Replicate changes to map the vsyscall page into the user
- pagetable vsyscall mapping. */
- if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
- unsigned long vaddr = __fix_to_virt(idx);
- set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
- }
+ .extra_user_64bit_cs = FLAT_USER_CS64,
#endif
-}
-
-static const struct pv_info xen_info __initdata = {
- .paravirt_enabled = 1,
- .shared_kernel_pmd = 0,
.name = "Xen",
};
-static const struct pv_init_ops xen_init_ops __initdata = {
+static const struct pv_init_ops xen_init_ops __initconst = {
.patch = xen_patch,
-
- .banner = xen_banner,
- .memory_setup = xen_memory_setup,
- .arch_setup = xen_arch_setup,
- .post_allocator_init = xen_post_allocator_init,
-};
-
-static const struct pv_time_ops xen_time_ops __initdata = {
- .time_init = xen_time_init,
-
- .set_wallclock = xen_set_wallclock,
- .get_wallclock = xen_get_wallclock,
- .get_tsc_khz = xen_tsc_khz,
- .sched_clock = xen_sched_clock,
};
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.cpuid = xen_cpuid,
.set_debugreg = xen_set_debugreg,
@@ -1209,20 +1230,28 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.clts = xen_clts,
- .read_cr0 = native_read_cr0,
+ .read_cr0 = xen_read_cr0,
.write_cr0 = xen_write_cr0,
.read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4,
+#ifdef CONFIG_X86_64
+ .read_cr8 = xen_read_cr8,
+ .write_cr8 = xen_write_cr8,
+#endif
+
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
.write_msr = xen_write_msr_safe,
+
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
+ .read_tscp = native_read_tscp,
+
.iret = xen_iret,
.irq_enable_sysexit = xen_sysexit,
#ifdef CONFIG_X86_64
@@ -1242,7 +1271,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.alloc_ldt = xen_alloc_ldt,
.free_ldt = xen_free_ldt,
- .store_gdt = native_store_gdt,
.store_idt = native_store_idt,
.store_tr = xen_store_tr,
@@ -1257,109 +1285,20 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
/* Xen takes care of %gs when switching to usermode for us */
.swapgs = paravirt_nop,
- .lazy_mode = {
- .enter = paravirt_enter_lazy_cpu,
- .leave = xen_leave_lazy,
- },
+ .start_context_switch = paravirt_start_context_switch,
+ .end_context_switch = xen_end_context_switch,
};
-static const struct pv_apic_ops xen_apic_ops __initdata = {
+static const struct pv_apic_ops xen_apic_ops __initconst = {
#ifdef CONFIG_X86_LOCAL_APIC
- .setup_boot_clock = paravirt_nop,
- .setup_secondary_clock = paravirt_nop,
.startup_ipi_hook = paravirt_nop,
#endif
};
-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
- .pagetable_setup_start = xen_pagetable_setup_start,
- .pagetable_setup_done = xen_pagetable_setup_done,
-
- .read_cr2 = xen_read_cr2,
- .write_cr2 = xen_write_cr2,
-
- .read_cr3 = xen_read_cr3,
- .write_cr3 = xen_write_cr3,
-
- .flush_tlb_user = xen_flush_tlb,
- .flush_tlb_kernel = xen_flush_tlb,
- .flush_tlb_single = xen_flush_tlb_single,
- .flush_tlb_others = xen_flush_tlb_others,
-
- .pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
-
- .pgd_alloc = xen_pgd_alloc,
- .pgd_free = xen_pgd_free,
-
- .alloc_pte = xen_alloc_pte_init,
- .release_pte = xen_release_pte_init,
- .alloc_pmd = xen_alloc_pte_init,
- .alloc_pmd_clone = paravirt_nop,
- .release_pmd = xen_release_pte_init,
-
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
-#ifdef CONFIG_X86_64
- .set_pte = xen_set_pte,
-#else
- .set_pte = xen_set_pte_init,
-#endif
- .set_pte_at = xen_set_pte_at,
- .set_pmd = xen_set_pmd_hyper,
-
- .ptep_modify_prot_start = __ptep_modify_prot_start,
- .ptep_modify_prot_commit = __ptep_modify_prot_commit,
-
- .pte_val = xen_pte_val,
- .pte_flags = native_pte_flags,
- .pgd_val = xen_pgd_val,
-
- .make_pte = xen_make_pte,
- .make_pgd = xen_make_pgd,
-
-#ifdef CONFIG_X86_PAE
- .set_pte_atomic = xen_set_pte_atomic,
- .set_pte_present = xen_set_pte_at,
- .pte_clear = xen_pte_clear,
- .pmd_clear = xen_pmd_clear,
-#endif /* CONFIG_X86_PAE */
- .set_pud = xen_set_pud_hyper,
-
- .make_pmd = xen_make_pmd,
- .pmd_val = xen_pmd_val,
-
-#if PAGETABLE_LEVELS == 4
- .pud_val = xen_pud_val,
- .make_pud = xen_make_pud,
- .set_pgd = xen_set_pgd_hyper,
-
- .alloc_pud = xen_alloc_pte_init,
- .release_pud = xen_release_pte_init,
-#endif /* PAGETABLE_LEVELS == 4 */
-
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
- .exit_mmap = xen_exit_mmap,
-
- .lazy_mode = {
- .enter = paravirt_enter_lazy_mmu,
- .leave = xen_leave_lazy,
- },
-
- .set_fixmap = xen_set_fixmap,
-};
-
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
-#ifdef CONFIG_SMP
- smp_send_stop();
-#endif
-
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
}
@@ -1379,267 +1318,275 @@ static void xen_machine_halt(void)
xen_reboot(SHUTDOWN_poweroff);
}
+static void xen_machine_power_off(void)
+{
+ if (pm_power_off)
+ pm_power_off();
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
static void xen_crash_shutdown(struct pt_regs *regs)
{
xen_reboot(SHUTDOWN_crash);
}
-static const struct machine_ops __initdata xen_machine_ops = {
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ xen_reboot(SHUTDOWN_crash);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xen_panic_block = {
+ .notifier_call= xen_panic_event,
+ .priority = INT_MIN
+};
+
+int xen_panic_handler_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+ return 0;
+}
+
+static const struct machine_ops xen_machine_ops __initconst = {
.restart = xen_restart,
.halt = xen_machine_halt,
- .power_off = xen_machine_halt,
+ .power_off = xen_machine_power_off,
.shutdown = xen_machine_halt,
.crash_shutdown = xen_crash_shutdown,
.emergency_restart = xen_emergency_restart,
};
-
-static void __init xen_reserve_top(void)
+static void __init xen_boot_params_init_edd(void)
{
-#ifdef CONFIG_X86_32
- unsigned long top = HYPERVISOR_VIRT_START;
- struct xen_platform_parameters pp;
-
- if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
- top = pp.virt_start;
+#if IS_ENABLED(CONFIG_EDD)
+ struct xen_platform_op op;
+ struct edd_info *edd_info;
+ u32 *mbr_signature;
+ unsigned nr;
+ int ret;
- reserve_top_address(-top);
-#endif /* CONFIG_X86_32 */
+ edd_info = boot_params.eddbuf;
+ mbr_signature = boot_params.edd_mbr_sig_buffer;
+
+ op.cmd = XENPF_firmware_info;
+
+ op.u.firmware_info.type = XEN_FW_DISK_INFO;
+ for (nr = 0; nr < EDDMAXNR; nr++) {
+ struct edd_info *info = edd_info + nr;
+
+ op.u.firmware_info.index = nr;
+ info->params.length = sizeof(info->params);
+ set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
+ &info->params);
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ break;
+
+#define C(x) info->x = op.u.firmware_info.u.disk_info.x
+ C(device);
+ C(version);
+ C(interface_support);
+ C(legacy_max_cylinder);
+ C(legacy_max_head);
+ C(legacy_sectors_per_track);
+#undef C
+ }
+ boot_params.eddbuf_entries = nr;
+
+ op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
+ for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
+ op.u.firmware_info.index = nr;
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ break;
+ mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
+ }
+ boot_params.edd_mbr_sig_buf_entries = nr;
+#endif
}
/*
- * Like __va(), but returns address in the kernel mapping (which is
- * all we have until the physical memory mapping has been set up.
+ * Set up the GDT and segment registers for -fstack-protector. Until
+ * we do this, we have to be careful not to call any stack-protected
+ * function, which is most of the kernel.
+ *
+ * Note, that it is __ref because the only caller of this after init
+ * is PVH which is not going to use xen_load_gdt_boot or other
+ * __init functions.
*/
-static void *__ka(phys_addr_t paddr)
+static void __ref xen_setup_gdt(int cpu)
{
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
#ifdef CONFIG_X86_64
- return (void *)(paddr + __START_KERNEL_map);
+ unsigned long dummy;
+
+ load_percpu_segment(cpu); /* We need to access per-cpu area */
+ switch_to_new_gdt(cpu); /* GDT and GS set */
+
+ /* We are switching of the Xen provided GDT to our HVM mode
+ * GDT. The new GDT has __KERNEL_CS with CS.L = 1
+ * and we are jumping to reload it.
+ */
+ asm volatile ("pushq %0\n"
+ "leaq 1f(%%rip),%0\n"
+ "pushq %0\n"
+ "lretq\n"
+ "1:\n"
+ : "=&r" (dummy) : "0" (__KERNEL_CS));
+
+ /*
+ * While not needed, we also set the %es, %ds, and %fs
+ * to zero. We don't care about %ss as it is NULL.
+ * Strictly speaking this is not needed as Xen zeros those
+ * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
+ *
+ * Linux zeros them in cpu_init() and in secondary_startup_64
+ * (for BSP).
+ */
+ loadsegment(es, 0);
+ loadsegment(ds, 0);
+ loadsegment(fs, 0);
#else
- return __va(paddr);
-#endif
-}
-
-/* Convert a machine address to physical address */
-static unsigned long m2p(phys_addr_t maddr)
-{
- phys_addr_t paddr;
-
- maddr &= PTE_PFN_MASK;
- paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
-
- return paddr;
-}
-
-/* Convert a machine address to kernel virtual */
-static void *m2v(phys_addr_t maddr)
-{
- return __ka(m2p(maddr));
-}
-
-static void set_page_prot(void *addr, pgprot_t prot)
-{
- unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
- pte_t pte = pfn_pte(pfn, prot);
-
- if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+ /* PVH: TODO Implement. */
BUG();
-}
-
-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
-{
- unsigned pmdidx, pteidx;
- unsigned ident_pte;
- unsigned long pfn;
-
- ident_pte = 0;
- pfn = 0;
- for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
- pte_t *pte_page;
-
- /* Reuse or allocate a page of ptes */
- if (pmd_present(pmd[pmdidx]))
- pte_page = m2v(pmd[pmdidx].pmd);
- else {
- /* Check for free pte pages */
- if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
- break;
-
- pte_page = &level1_ident_pgt[ident_pte];
- ident_pte += PTRS_PER_PTE;
-
- pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
- }
-
- /* Install mappings */
- for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
- pte_t pte;
-
- if (pfn > max_pfn_mapped)
- max_pfn_mapped = pfn;
-
- if (!pte_none(pte_page[pteidx]))
- continue;
-
- pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
- pte_page[pteidx] = pte;
- }
+#endif
+ return; /* PVH does not need any PV GDT ops. */
}
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
+ pv_cpu_ops.load_gdt = xen_load_gdt_boot;
- for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
- set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
-
- set_page_prot(pmd, PAGE_KERNEL_RO);
-}
-
-#ifdef CONFIG_X86_64
-static void convert_pfn_mfn(void *v)
-{
- pte_t *pte = v;
- int i;
+ setup_stack_canary_segment(0);
+ switch_to_new_gdt(0);
- /* All levels are converted the same way, so just treat them
- as ptes. */
- for (i = 0; i < PTRS_PER_PTE; i++)
- pte[i] = xen_make_pte(pte[i].pte);
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
+ pv_cpu_ops.load_gdt = xen_load_gdt;
}
/*
- * Set up the inital kernel pagetable.
- *
- * We can construct this by grafting the Xen provided pagetable into
- * head_64.S's preconstructed pagetables. We copy the Xen L2's into
- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
- * means that only the kernel has a physical mapping to start with -
- * but that's enough to get __va working. We need to fill in the rest
- * of the physical mapping once some sort of allocator has been set
- * up.
+ * A PV guest starts with default flags that are not set for PVH, set them
+ * here asap.
*/
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
- unsigned long max_pfn)
+static void xen_pvh_set_cr_flags(int cpu)
{
- pud_t *l3;
- pmd_t *l2;
-
- /* Zap identity mapping */
- init_level4_pgt[0] = __pgd(0);
-
- /* Pre-constructed entries are in pfn, so convert to mfn */
- convert_pfn_mfn(init_level4_pgt);
- convert_pfn_mfn(level3_ident_pgt);
- convert_pfn_mfn(level3_kernel_pgt);
-
- l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
- l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
-
- memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
- memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
- l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
- l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
- memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
- /* Set up identity map */
- xen_map_identity_early(level2_ident_pgt, max_pfn);
-
- /* Make pagetable pieces RO */
- set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
- /* Pin down new L4 */
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
- PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
- /* Unpin Xen-provided one */
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
- /* Switch over */
- pgd = init_level4_pgt;
+ /* Some of these are setup in 'secondary_startup_64'. The others:
+ * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
+ * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
+ write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
+ if (!cpu)
+ return;
/*
- * At this stage there can be no user pgd, and no page
- * structure to attach it to, so make sure we just set kernel
- * pgd.
- */
- xen_mc_batch();
- __xen_write_cr3(true, __pa(pgd));
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-
- reserve_early(__pa(xen_start_info->pt_base),
- __pa(xen_start_info->pt_base +
- xen_start_info->nr_pt_frames * PAGE_SIZE),
- "XEN PAGETABLES");
+ * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
+ * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
+ */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
- return pgd;
+ if (cpu_has_pge)
+ set_in_cr4(X86_CR4_PGE);
}
-#else /* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
- unsigned long max_pfn)
+/*
+ * Note, that it is ref - because the only caller of this after init
+ * is PVH which is not going to use xen_load_gdt_boot or other
+ * __init functions.
+ */
+void __ref xen_pvh_secondary_vcpu_init(int cpu)
{
- pmd_t *kernel_pmd;
-
- init_pg_tables_start = __pa(pgd);
- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
- max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
-
- kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
- memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
-
- xen_map_identity_early(level2_kernel_pgt, max_pfn);
-
- memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
- set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
- __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
-
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
- set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+ xen_setup_gdt(cpu);
+ xen_pvh_set_cr_flags(cpu);
+}
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+static void __init xen_pvh_early_guest_init(void)
+{
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ return;
- xen_write_cr3(__pa(swapper_pg_dir));
+ if (!xen_feature(XENFEAT_hvm_callback_vector))
+ return;
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+ xen_have_vector_callback = 1;
+ xen_pvh_set_cr_flags(0);
- return swapper_pg_dir;
+#ifdef CONFIG_X86_32
+ BUG(); /* PVH: Implement proper support. */
+#endif
}
-#endif /* CONFIG_X86_64 */
/* First C function to be called on Xen boot */
-asmlinkage void __init xen_start_kernel(void)
+asmlinkage __visible void __init xen_start_kernel(void)
{
- pgd_t *pgd;
+ struct physdev_set_iopl set_iopl;
+ int rc;
if (!xen_start_info)
return;
xen_domain_type = XEN_PV_DOMAIN;
- BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
-
xen_setup_features();
+ xen_pvh_early_guest_init();
+ xen_setup_machphys_mapping();
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
- pv_time_ops = xen_time_ops;
- pv_cpu_ops = xen_cpu_ops;
pv_apic_ops = xen_apic_ops;
- pv_mmu_ops = xen_mmu_ops;
+ if (!xen_pvh_domain())
+ pv_cpu_ops = xen_cpu_ops;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
+ else
+ x86_init.resources.memory_setup = xen_memory_setup;
+ x86_init.oem.arch_setup = xen_arch_setup;
+ x86_init.oem.banner = xen_banner;
+
+ xen_init_time_ops();
+
+ /*
+ * Set up some pagetable state before starting to set any ptes.
+ */
+
+ xen_init_mmu_ops();
+
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+#if 0
+ if (!xen_initial_domain())
+#endif
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+
+ __supported_pte_mask |= _PAGE_IOMAP;
+
+ /*
+ * Prevent page tables from being allocated in highmem, even
+ * if CONFIG_HIGHPTE is enabled.
+ */
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
+ /* Work out if we support NX */
+ x86_configure_nx();
+
+ /* Get mfn list */
+ xen_build_dynamic_phys_to_machine();
+
+ /*
+ * Set up kernel GDT and segment registers, mainly so that
+ * -fstack-protector code can be executed.
+ */
+ xen_setup_gdt(0);
xen_init_irq_ops();
+ xen_init_cpuid_mask();
#ifdef CONFIG_X86_LOCAL_APIC
/*
* set up the basic apic ops.
*/
- apic_ops = &xen_basic_apic_ops;
+ set_xen_basic_apic_ops();
#endif
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1649,50 +1596,74 @@ asmlinkage void __init xen_start_kernel(void)
machine_ops = xen_machine_ops;
-#ifdef CONFIG_X86_64
- /* Disable until direct per-cpu data access. */
- have_vcpu_info_placement = 0;
- x86_64_init_pda();
-#endif
+ /*
+ * The only reliable way to retain the initial address of the
+ * percpu gdt_page is to remember it here, so we can go and
+ * mark it RW later, when the initial percpu area is freed.
+ */
+ xen_initial_gdt = &per_cpu(gdt_page, 0);
xen_smp_init();
- /* Get mfn list */
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- xen_build_dynamic_phys_to_machine();
-
- pgd = (pgd_t *)xen_start_info->pt_base;
-
- /* Prevent unwanted bits from being set in PTEs. */
- __supported_pte_mask &= ~_PAGE_GLOBAL;
- if (!xen_initial_domain())
- __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * The pages we from Xen are not related to machine pages, so
+ * any NUMA information the kernel tries to get from ACPI will
+ * be meaningless. Prevent it from trying.
+ */
+ acpi_numa = -1;
+#endif
+#ifdef CONFIG_X86_PAT
+ /*
+ * For right now disable the PAT. We should remove this once
+ * git commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1
+ * (xen/pat: Disable PAT support for now) is reverted.
+ */
+ pat_enabled = 0;
+#endif
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
local_irq_disable();
- early_boot_irqs_off();
+ early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n");
- pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+ xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
- init_mm.pgd = pgd;
+ /* Allocate and initialize top and mid mfn levels for p2m structure */
+ xen_build_mfn_list_list();
/* keep using Xen gdt for now; no urgent need to change it */
+#ifdef CONFIG_X86_32
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
-
+#else
+ pv_info.kernel_rpl = 0;
+#endif
/* set the limit of our address space */
xen_reserve_top();
+ /* PVH: runs at default kernel iopl of 0 */
+ if (!xen_pvh_domain()) {
+ /*
+ * We used to do this in xen_arch_setup, but that is too late
+ * on AMD were early_cpu_init (run before ->arch_setup()) calls
+ * early_amd_init which pokes 0xcf8 port.
+ */
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ xen_raw_printk("physdev_op failed %d\n", rc);
+ }
+
#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
- new_cpu_data.hard_math = 1;
+ set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
+ new_cpu_data.wp_works_ok = 1;
new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif
@@ -1707,10 +1678,46 @@ asmlinkage void __init xen_start_kernel(void)
add_preferred_console("xenboot", 0, NULL);
add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
- }
+ if (pci_xen)
+ x86_init.pci.arch_init = pci_xen_init;
+ } else {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+ struct xen_platform_op op = {
+ .cmd = XENPF_firmware_info,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
+ };
+
+ xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+
+ if (HYPERVISOR_dom0_op(&op) == 0)
+ boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
+
+ xen_init_apic();
+
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+
+ xen_acpi_sleep_register();
+
+ /* Avoid searching for BIOS MP tables */
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+ xen_boot_params_init_edd();
+ }
+#ifdef CONFIG_PCI
+ /* PCI BIOS service won't work from a PV guest. */
+ pci_probe &= ~PCI_PROBE_BIOS;
+#endif
xen_raw_console_write("about to get started...\n");
+ xen_setup_runstate_info(0);
+
/* Start the world */
#ifdef CONFIG_X86_32
i386_start_kernel();
@@ -1718,3 +1725,132 @@ asmlinkage void __init xen_start_kernel(void)
x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}
+
+void __ref xen_hvm_init_shared_info(void)
+{
+ int cpu;
+ struct xen_add_to_physmap xatp;
+ static struct shared_info *shared_info_page = 0;
+
+ if (!shared_info_page)
+ shared_info_page = (struct shared_info *)
+ extend_brk(PAGE_SIZE, PAGE_SIZE);
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ BUG();
+
+ HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+ /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+ * page, we use it in the event channel upcall and in some pvclock
+ * related functions. We don't need the vcpu_info placement
+ * optimizations because we don't use any pv_mmu or pv_irq op on
+ * HVM.
+ * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+ * online but xen_hvm_init_shared_info is run at resume time too and
+ * in that case multiple vcpus might be online. */
+ for_each_online_cpu(cpu) {
+ /* Leave it to be NULL. */
+ if (cpu >= MAX_VIRT_CPUS)
+ continue;
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ }
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void __init init_hvm_pv_info(void)
+{
+ int major, minor;
+ uint32_t eax, ebx, ecx, edx, pages, msr, base;
+ u64 pfn;
+
+ base = xen_cpuid_base();
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ major = eax >> 16;
+ minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+ cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ xen_setup_features();
+
+ pv_info.name = "Xen HVM";
+
+ xen_domain_type = XEN_HVM_DOMAIN;
+}
+
+static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ switch (action) {
+ case CPU_UP_PREPARE:
+ xen_vcpu_setup(cpu);
+ if (xen_have_vector_callback) {
+ if (xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_setup_timer(cpu);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block xen_hvm_cpu_notifier = {
+ .notifier_call = xen_hvm_cpu_notify,
+};
+
+static void __init xen_hvm_guest_init(void)
+{
+ init_hvm_pv_info();
+
+ xen_hvm_init_shared_info();
+
+ xen_panic_handler_init();
+
+ if (xen_feature(XENFEAT_hvm_callback_vector))
+ xen_have_vector_callback = 1;
+ xen_hvm_smp_init();
+ register_cpu_notifier(&xen_hvm_cpu_notifier);
+ xen_unplug_emulated_devices();
+ x86_init.irqs.intr_init = xen_init_IRQ;
+ xen_hvm_init_time_ops();
+ xen_hvm_init_mmu_ops();
+}
+
+static uint32_t __init xen_hvm_platform(void)
+{
+ if (xen_pv_domain())
+ return 0;
+
+ return xen_cpuid_base();
+}
+
+bool xen_hvm_need_lapic(void)
+{
+ if (xen_pv_domain())
+ return false;
+ if (!xen_hvm_domain())
+ return false;
+ if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
+const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
+ .name = "Xen HVM",
+ .detect = xen_hvm_platform,
+ .init_platform = xen_hvm_guest_init,
+ .x2apic_available = xen_x2apic_para_available,
+};
+EXPORT_SYMBOL(x86_hyper_xen_hvm);
+#endif
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 49ba9b5224d..ebfa9b2c871 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -36,56 +36,190 @@
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
+#include <xen/xen.h>
#include <asm/pgtable.h>
-static int map_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+static struct gnttab_vm_area {
+ struct vm_struct *area;
+ pte_t **ptes;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ void **__shared)
{
- unsigned long **frames = (unsigned long **)data;
+ void *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_shared_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
return 0;
}
-static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
{
+ grant_status_t *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_status_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
- set_pte_at(&init_mm, addr, pte, __pte(0));
return 0;
}
-int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- struct grant_entry **__shared)
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
+{
+ pte_t **ptes;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == gnttab_status_vm_area.area->addr)
+ ptes = gnttab_status_vm_area.ptes;
+ else
+ ptes = gnttab_shared_vm_area.ptes;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, ptes[i], __pte(0));
+ addr += PAGE_SIZE;
+ }
+}
+
+static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
+{
+ area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+ if (area->ptes == NULL)
+ return -ENOMEM;
+
+ area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
+ if (area->area == NULL) {
+ kfree(area->ptes);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
+{
+ free_vm_area(area->area);
+ kfree(area->ptes);
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
+{
+ int ret;
+
+ if (!xen_pv_domain())
+ return 0;
+
+ ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Always allocate the space for the status frames in case
+ * we're migrated to a host with V2 support.
+ */
+ ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+ err:
+ arch_gnttab_vfree(&gnttab_shared_vm_area);
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_XEN_PVH
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <linux/slab.h>
+static int __init xlated_setup_gnttab_pages(void)
{
+ struct page **pages;
+ xen_pfn_t *pfns;
int rc;
- struct grant_entry *shared = *__shared;
-
- if (shared == NULL) {
- struct vm_struct *area =
- xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
+ unsigned int i;
+ unsigned long nr_grant_frames = gnttab_max_grant_frames();
+
+ BUG_ON(nr_grant_frames == 0);
+ pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL);
+ if (!pfns) {
+ kfree(pages);
+ return -ENOMEM;
+ }
+ rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
+ if (rc) {
+ pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
+ nr_grant_frames, rc);
+ kfree(pages);
+ kfree(pfns);
+ return rc;
}
+ for (i = 0; i < nr_grant_frames; i++)
+ pfns[i] = page_to_pfn(pages[i]);
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn, &frames);
- return rc;
+ rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
+ &xen_auto_xlat_grant_frames.vaddr);
+
+ if (rc) {
+ pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
+ nr_grant_frames, rc);
+ free_xenballooned_pages(nr_grant_frames, pages);
+ kfree(pages);
+ kfree(pfns);
+ return rc;
+ }
+ kfree(pages);
+
+ xen_auto_xlat_grant_frames.pfn = pfns;
+ xen_auto_xlat_grant_frames.count = nr_grant_frames;
+
+ return 0;
}
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
- unsigned long nr_gframes)
+static int __init xen_pvh_gnttab_setup(void)
{
- apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
+ if (!xen_pvh_domain())
+ return -ENODEV;
+
+ return xlated_setup_gnttab_pages();
}
+/* Call it _before_ __gnttab_init as we need to initialize the
+ * xen_auto_xlat_grant_frames first. */
+core_initcall(xen_pvh_gnttab_setup);
+#endif
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c60..a1207cb6472 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -1,8 +1,12 @@
#include <linux/hardirq.h>
+#include <asm/x86_init.h>
+
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/vcpu.h>
+#include <xen/features.h>
+#include <xen/events.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -19,27 +23,12 @@ void xen_force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
-static void __init __xen_init_IRQ(void)
-{
- int i;
-
- /* Create identity vector->irq map */
- for(i = 0; i < NR_VECTORS; i++) {
- int cpu;
-
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[i] = i;
- }
-
- xen_init_IRQ();
-}
-
-static unsigned long xen_save_fl(void)
+asmlinkage __visible unsigned long xen_save_fl(void)
{
struct vcpu_info *vcpu;
unsigned long flags;
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = this_cpu_read(xen_vcpu);
/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
@@ -50,53 +39,53 @@ static unsigned long xen_save_fl(void)
*/
return (-flags) & X86_EFLAGS_IF;
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
-static void xen_restore_fl(unsigned long flags)
+__visible void xen_restore_fl(unsigned long flags)
{
struct vcpu_info *vcpu;
/* convert from IF type flag */
flags = !(flags & X86_EFLAGS_IF);
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
+ /* See xen_irq_enable() for why preemption must be disabled. */
preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
- preempt_enable_no_resched();
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
if (flags == 0) {
- preempt_check_resched();
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
- }
+ preempt_enable();
+ } else
+ preempt_enable_no_resched();
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
-static void xen_irq_disable(void)
+asmlinkage __visible void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
-static void xen_irq_enable(void)
+asmlinkage __visible void xen_irq_enable(void)
{
struct vcpu_info *vcpu;
- /* We don't need to worry about being preempted here, since
- either a) interrupts are disabled, so no preemption, or b)
- the caller is confused and is trying to re-enable interrupts
- on an indeterminate processor. */
+ /*
+ * We may be preempted as soon as vcpu->evtchn_upcall_mask is
+ * cleared, so disable preemption to ensure we check for
+ * events on the VCPU we are still running on.
+ */
+ preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
+ vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
/* Doesn't matter if we get preempted here, because any
@@ -105,7 +94,10 @@ static void xen_irq_enable(void)
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
+
+ preempt_enable();
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
static void xen_safe_halt(void)
{
@@ -122,12 +114,12 @@ static void xen_halt(void)
xen_safe_halt();
}
-static const struct pv_irq_ops xen_irq_ops __initdata = {
- .init_IRQ = __xen_init_IRQ,
- .save_fl = xen_save_fl,
- .restore_fl = xen_restore_fl,
- .irq_disable = xen_irq_disable,
- .irq_enable = xen_irq_enable,
+static const struct pv_irq_ops xen_irq_ops __initconst = {
+ .save_fl = PV_CALLEE_SAVE(xen_save_fl),
+ .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
+ .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
+ .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+
.safe_halt = xen_safe_halt,
.halt = xen_halt,
#ifdef CONFIG_X86_64
@@ -135,7 +127,10 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
#endif
};
-void __init xen_init_irq_ops()
+void __init xen_init_irq_ops(void)
{
- pv_irq_ops = xen_irq_ops;
+ /* For PVH we use default pv_irq_ops settings. */
+ if (!xen_feature(XENFEAT_hvm_callback_vector))
+ pv_irq_ops = xen_irq_ops;
+ x86_init.irqs.intr_init = xen_init_IRQ;
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c..e8a1201c329 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,77 +42,80 @@
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+
+#include <trace/events/xen.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
+#include <asm/setup.h>
#include <asm/paravirt.h>
+#include <asm/e820.h>
#include <asm/linkage.h>
+#include <asm/page.h>
+#include <asm/init.h>
+#include <asm/pat.h>
+#include <asm/smp.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/version.h>
+#include <xen/interface/memory.h>
+#include <xen/hvc-console.h>
#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"
-#define MMU_UPDATE_HISTO 30
-
-#ifdef CONFIG_XEN_DEBUG_FS
-
-static struct {
- u32 pgd_update;
- u32 pgd_update_pinned;
- u32 pgd_update_batched;
-
- u32 pud_update;
- u32 pud_update_pinned;
- u32 pud_update_batched;
-
- u32 pmd_update;
- u32 pmd_update_pinned;
- u32 pmd_update_batched;
-
- u32 pte_update;
- u32 pte_update_pinned;
- u32 pte_update_batched;
-
- u32 mmu_update;
- u32 mmu_update_extended;
- u32 mmu_update_histo[MMU_UPDATE_HISTO];
-
- u32 prot_commit;
- u32 prot_commit_batched;
-
- u32 set_pte_at;
- u32 set_pte_at_batched;
- u32 set_pte_at_pinned;
- u32 set_pte_at_current;
- u32 set_pte_at_kernel;
-} mmu_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- if (unlikely(zero_stats)) {
- memset(&mmu_stats, 0, sizeof(mmu_stats));
- zero_stats = 0;
- }
-}
-
-#define ADD_STATS(elem, val) \
- do { check_zero(); mmu_stats.elem += (val); } while(0)
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and balloon lists.
+ */
+DEFINE_SPINLOCK(xen_reservation_lock);
-#else /* !CONFIG_XEN_DEBUG_FS */
+#ifdef CONFIG_X86_32
+/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
+static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
+#endif
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
-#define ADD_STATS(elem, val) do { (void)(val); } while(0)
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3. This may not be the current effective cr3, because
+ * its update may be being lazily deferred. However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early). If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
-#endif /* CONFIG_XEN_DEBUG_FS */
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -120,126 +123,11 @@ static inline void check_zero(void)
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
-
-#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
-
-/* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
- { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
-
- /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
- { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
-
-/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
-
-static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
- __page_aligned_bss;
-
-static inline unsigned p2m_top_index(unsigned long pfn)
-{
- BUG_ON(pfn >= MAX_DOMAIN_PAGES);
- return pfn / P2M_ENTRIES_PER_PAGE;
-}
-
-static inline unsigned p2m_index(unsigned long pfn)
-{
- return pfn % P2M_ENTRIES_PER_PAGE;
-}
-
-/* Build the parallel p2m_top_mfn structures */
-void xen_setup_mfn_list_list(void)
-{
- unsigned pfn, idx;
-
- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
-
- p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
- }
-
- for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
- unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
- p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
- }
-
- BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
-
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn_list);
- HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
-}
-
-/* Set up p2m_top to point to the domain-builder provided p2m pages */
-void __init xen_build_dynamic_phys_to_machine(void)
-{
- unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
- unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- unsigned pfn;
-
- for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
-
- p2m_top[topidx] = &mfn_list[pfn];
- }
-}
-
-unsigned long get_phys_to_machine(unsigned long pfn)
-{
- unsigned topidx, idx;
-
- if (unlikely(pfn >= MAX_DOMAIN_PAGES))
- return INVALID_P2M_ENTRY;
-
- topidx = p2m_top_index(pfn);
- idx = p2m_index(pfn);
- return p2m_top[topidx][idx];
-}
-EXPORT_SYMBOL_GPL(get_phys_to_machine);
-
-static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
- unsigned long *p;
- unsigned i;
-
- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
- BUG_ON(p == NULL);
-
- for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
- p[i] = INVALID_P2M_ENTRY;
+ xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
- if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
- free_page((unsigned long)p);
- else
- *mfnp = virt_to_mfn(p);
-}
-
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- unsigned topidx, idx;
-
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return;
- }
-
- if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
- BUG_ON(mfn != INVALID_P2M_ENTRY);
- return;
- }
-
- topidx = p2m_top_index(pfn);
- if (p2m_top[topidx] == p2m_missing) {
- /* no need to allocate a page to store an invalid entry */
- if (mfn == INVALID_P2M_ENTRY)
- return;
- alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
- }
-
- idx = p2m_index(pfn);
- p2m_top[topidx][idx] = mfn;
+ return PFN_DOWN(maddr.maddr);
}
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
@@ -263,6 +151,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
offset = address & ~PAGE_MASK;
return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
+EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
void make_lowmem_page_readonly(void *vaddr)
{
@@ -271,7 +160,8 @@ void make_lowmem_page_readonly(void *vaddr)
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_wrprotect(*pte);
@@ -286,7 +176,8 @@ void make_lowmem_page_readwrite(void *vaddr)
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_mkwrite(*pte);
@@ -302,6 +193,26 @@ static bool xen_page_pinned(void *ptr)
return PagePinned(page);
}
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+
+ /* ptep might be kmapped when using 32-bit HIGHPTE */
+ u->ptr = virt_to_machine(ptep).maddr;
+ u->val = pte_val_ma(pteval);
+
+ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+EXPORT_SYMBOL_GPL(xen_set_domain_pte);
+
static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
@@ -310,27 +221,35 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
if (mcs.mc != NULL) {
- ADD_STATS(mmu_update_extended, 1);
- ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
-
mcs.mc->args[1]++;
-
- if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
- ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
- else
- ADD_STATS(mmu_update_histo[0], 1);
} else {
- ADD_STATS(mmu_update, 1);
mcs = __xen_mc_entry(sizeof(*u));
MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
- ADD_STATS(mmu_update_histo[1], 1);
}
u = mcs.args;
*u = *update;
}
-void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
+static void xen_extend_mmuext_op(const struct mmuext_op *op)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *u;
+
+ mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
+
+ if (mcs.mc != NULL) {
+ mcs.mc->args[1]++;
+ } else {
+ mcs = __xen_mc_entry(sizeof(*u));
+ MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+ }
+
+ u = mcs.args;
+ *u = *op;
+}
+
+static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
struct mmu_update u;
@@ -343,16 +262,14 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
u.val = pmd_val_ma(val);
xen_extend_mmu_update(&u);
- ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
-void xen_set_pmd(pmd_t *ptr, pmd_t val)
+static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
- ADD_STATS(pmd_update, 1);
+ trace_xen_mmu_set_pmd(ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@@ -361,8 +278,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
return;
}
- ADD_STATS(pmd_update_pinned, 1);
-
xen_set_pmd_hyper(ptr, val);
}
@@ -375,42 +290,60 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
+static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
- /* updates to init_mm may be done without lock */
- if (mm == &init_mm)
- preempt_disable();
+ struct mmu_update u;
- ADD_STATS(set_pte_at, 1);
-// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
- ADD_STATS(set_pte_at_current, mm == current->mm);
- ADD_STATS(set_pte_at_kernel, mm == &init_mm);
+ if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+ return false;
- if (mm == current->mm || mm == &init_mm) {
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- struct multicall_space mcs;
- mcs = xen_mc_entry(0);
+ xen_mc_batch();
- MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
- ADD_STATS(set_pte_at_batched, 1);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
- goto out;
- } else
- if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
- goto out;
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ xen_extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ return true;
+}
+
+static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ if (!xen_batched_set_pte(ptep, pteval)) {
+ /*
+ * Could call native_set_pte() here and trap and
+ * emulate the PTE write but with 32-bit guests this
+ * needs two traps (one for each of the two 32-bit
+ * words in the PTE) so do one hypercall directly
+ * instead.
+ */
+ struct mmu_update u;
+
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
}
- xen_set_pte(ptep, pteval);
+}
-out:
- if (mm == &init_mm)
- preempt_enable();
+static void xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ trace_xen_mmu_set_pte(ptep, pteval);
+ __xen_set_pte(ptep, pteval);
+}
+
+static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
+ __xen_set_pte(ptep, pteval);
}
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
+ trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
return *ptep;
}
@@ -419,15 +352,13 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
{
struct mmu_update u;
+ trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
xen_mc_batch();
- u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+ u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
u.val = pte_val_ma(pte);
xen_extend_mmu_update(&u);
- ADD_STATS(prot_commit, 1);
- ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
@@ -436,8 +367,13 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
{
if (val & _PAGE_PRESENT) {
unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ unsigned long pfn = mfn_to_pfn(mfn);
+
pteval_t flags = val & PTE_FLAGS_MASK;
- val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
+ if (unlikely(pfn == ~0))
+ val = flags & ~_PAGE_PRESENT;
+ else
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
}
return val;
@@ -448,40 +384,150 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
if (val & _PAGE_PRESENT) {
unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
pteval_t flags = val & PTE_FLAGS_MASK;
- val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
+ unsigned long mfn;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ mfn = get_phys_to_machine(pfn);
+ else
+ mfn = pfn;
+ /*
+ * If there's no mfn for the pfn, then just create an
+ * empty non-present pte. Unfortunately this loses
+ * information about the original pfn, so
+ * pte_mfn_to_pfn is asymmetric.
+ */
+ if (unlikely(mfn == INVALID_P2M_ENTRY)) {
+ mfn = 0;
+ flags = 0;
+ } else {
+ /*
+ * Paramount to do this test _after_ the
+ * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
+ * IDENTITY_FRAME_BIT resolves to true.
+ */
+ mfn &= ~FOREIGN_FRAME_BIT;
+ if (mfn & IDENTITY_FRAME_BIT) {
+ mfn &= ~IDENTITY_FRAME_BIT;
+ flags |= _PAGE_IOMAP;
+ }
+ }
+ val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+}
+
+static pteval_t iomap_pte(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & PTE_FLAGS_MASK;
+
+ /* We assume the pte frame number is a MFN, so
+ just use it as-is. */
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
}
return val;
}
-pteval_t xen_pte_val(pte_t pte)
+__visible pteval_t xen_pte_val(pte_t pte)
{
- return pte_mfn_to_pfn(pte.pte);
+ pteval_t pteval = pte.pte;
+#if 0
+ /* If this is a WC pte, convert back from Xen WC to Linux WC */
+ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
+ WARN_ON(!pat_enabled);
+ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
+ }
+#endif
+ if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
+ return pteval;
+
+ return pte_mfn_to_pfn(pteval);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
-pgdval_t xen_pgd_val(pgd_t pgd)
+__visible pgdval_t xen_pgd_val(pgd_t pgd)
{
return pte_mfn_to_pfn(pgd.pgd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+
+/*
+ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
+ * are reserved for now, to correspond to the Intel-reserved PAT
+ * types.
+ *
+ * We expect Linux's PAT set as follows:
+ *
+ * Idx PTE flags Linux Xen Default
+ * 0 WB WB WB
+ * 1 PWT WC WT WT
+ * 2 PCD UC- UC- UC-
+ * 3 PCD PWT UC UC UC
+ * 4 PAT WB WC WB
+ * 5 PAT PWT WC WP WT
+ * 6 PAT PCD UC- rsv UC-
+ * 7 PAT PCD PWT UC rsv UC
+ */
+
+void xen_set_pat(u64 pat)
+{
+ /* We expect Linux to use a PAT setting of
+ * UC UC- WC WB (ignoring the PAT flag) */
+ WARN_ON(pat != 0x0007010600070106ull);
+}
-pte_t xen_make_pte(pteval_t pte)
+__visible pte_t xen_make_pte(pteval_t pte)
{
- pte = pte_pfn_to_mfn(pte);
+ phys_addr_t addr = (pte & PTE_PFN_MASK);
+#if 0
+ /* If Linux is trying to set a WC pte, then map to the Xen WC.
+ * If _PAGE_PAT is set, then it probably means it is really
+ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
+ * things work out OK...
+ *
+ * (We should never see kernel mappings with _PAGE_PSE set,
+ * but we could see hugetlbfs mappings, I think.).
+ */
+ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
+ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
+ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
+ }
+#endif
+ /*
+ * Unprivileged domains are allowed to do IOMAPpings for
+ * PCI passthrough, but not map ISA space. The ISA
+ * mappings are just dummy local mappings to keep other
+ * parts of the kernel happy.
+ */
+ if (unlikely(pte & _PAGE_IOMAP) &&
+ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+ pte = iomap_pte(pte);
+ } else {
+ pte &= ~_PAGE_IOMAP;
+ pte = pte_pfn_to_mfn(pte);
+ }
+
return native_make_pte(pte);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
-pgd_t xen_make_pgd(pgdval_t pgd)
+__visible pgd_t xen_make_pgd(pgdval_t pgd)
{
pgd = pte_pfn_to_mfn(pgd);
return native_make_pgd(pgd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
-pmdval_t xen_pmd_val(pmd_t pmd)
+__visible pmdval_t xen_pmd_val(pmd_t pmd)
{
return pte_mfn_to_pfn(pmd.pmd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
-void xen_set_pud_hyper(pud_t *ptr, pud_t val)
+static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
struct mmu_update u;
@@ -494,16 +540,14 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
u.val = pud_val_ma(val);
xen_extend_mmu_update(&u);
- ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
-void xen_set_pud(pud_t *ptr, pud_t val)
+static void xen_set_pud(pud_t *ptr, pud_t val)
{
- ADD_STATS(pud_update, 1);
+ trace_xen_mmu_set_pud(ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@@ -512,65 +556,53 @@ void xen_set_pud(pud_t *ptr, pud_t val)
return;
}
- ADD_STATS(pud_update_pinned, 1);
-
xen_set_pud_hyper(ptr, val);
}
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
- ADD_STATS(pte_update, 1);
-// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
- ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
-#ifdef CONFIG_X86_PAE
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
-#else
- *ptep = pte;
-#endif
-}
-
#ifdef CONFIG_X86_PAE
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
+ trace_xen_mmu_set_pte_atomic(ptep, pte);
set_64bit((u64 *)ptep, native_pte_val(pte));
}
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- ptep->pte_low = 0;
- smp_wmb(); /* make sure low gets written first */
- ptep->pte_high = 0;
+ trace_xen_mmu_pte_clear(mm, addr, ptep);
+ if (!xen_batched_set_pte(ptep, native_make_pte(0)))
+ native_pte_clear(mm, addr, ptep);
}
-void xen_pmd_clear(pmd_t *pmdp)
+static void xen_pmd_clear(pmd_t *pmdp)
{
+ trace_xen_mmu_pmd_clear(pmdp);
set_pmd(pmdp, __pmd(0));
}
#endif /* CONFIG_X86_PAE */
-pmd_t xen_make_pmd(pmdval_t pmd)
+__visible pmd_t xen_make_pmd(pmdval_t pmd)
{
pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
#if PAGETABLE_LEVELS == 4
-pudval_t xen_pud_val(pud_t pud)
+__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
-pud_t xen_make_pud(pudval_t pud)
+__visible pud_t xen_make_pud(pudval_t pud)
{
pud = pte_pfn_to_mfn(pud);
return native_make_pud(pud);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
-pgd_t *xen_get_user_pgd(pgd_t *pgd)
+static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
unsigned offset = pgd - pgd_page;
@@ -602,7 +634,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
* 2. It is always pinned
* 3. It has no user pagetable attached to it
*/
-void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
preempt_disable();
@@ -615,11 +647,11 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
preempt_enable();
}
-void xen_set_pgd(pgd_t *ptr, pgd_t val)
+static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
pgd_t *user_ptr = xen_get_user_pgd(ptr);
- ADD_STATS(pgd_update, 1);
+ trace_xen_mmu_set_pgd(ptr, user_ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@@ -632,9 +664,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
return;
}
- ADD_STATS(pgd_update_pinned, 1);
- ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
/* If it's pinned, then we can at least batch the kernel and
user updates together. */
xen_mc_batch();
@@ -767,8 +796,8 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
spinlock_t *ptl = NULL;
-#if USE_SPLIT_PTLOCKS
- ptl = __pte_lockptr(page);
+#if USE_SPLIT_PTE_PTLOCKS
+ ptl = ptlock_ptr(page);
spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
@@ -783,14 +812,12 @@ static void xen_pte_unlock(void *v)
static void xen_do_pin(unsigned level, unsigned long pfn)
{
- struct mmuext_op *op;
- struct multicall_space mcs;
+ struct mmuext_op op;
- mcs = __xen_mc_entry(sizeof(*op));
- op = mcs.args;
- op->cmd = level;
- op->arg1.mfn = pfn_to_mfn(pfn);
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ op.cmd = level;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+
+ xen_extend_mmuext_op(&op);
}
static int xen_pin_page(struct mm_struct *mm, struct page *page,
@@ -858,7 +885,7 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
- vm_unmap_aliases();
+ trace_xen_mmu_pgd_pin(mm, pgd);
xen_mc_batch();
@@ -911,10 +938,9 @@ static void xen_pgd_pin(struct mm_struct *mm)
*/
void xen_mm_pin_all(void)
{
- unsigned long flags;
struct page *page;
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
if (!PagePinned(page)) {
@@ -923,7 +949,7 @@ void xen_mm_pin_all(void)
}
}
- spin_unlock_irqrestore(&pgd_lock, flags);
+ spin_unlock(&pgd_lock);
}
/*
@@ -931,14 +957,14 @@ void xen_mm_pin_all(void)
* that's before we have page structures to store the bits. So do all
* the book-keeping now.
*/
-static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
+static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
enum pt_level level)
{
SetPagePinned(page);
return 0;
}
-void __init xen_mark_init_mm_pinned(void)
+static void __init xen_mark_init_mm_pinned(void)
{
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
@@ -986,6 +1012,8 @@ static int xen_unpin_page(struct mm_struct *mm, struct page *page,
/* Release a pagetables pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
+ trace_xen_mmu_pgd_unpin(mm, pgd);
+
xen_mc_batch();
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1024,10 +1052,9 @@ static void xen_pgd_unpin(struct mm_struct *mm)
*/
void xen_mm_unpin_all(void)
{
- unsigned long flags;
struct page *page;
- spin_lock_irqsave(&pgd_lock, flags);
+ spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
if (PageSavePinned(page)) {
@@ -1037,17 +1064,17 @@ void xen_mm_unpin_all(void)
}
}
- spin_unlock_irqrestore(&pgd_lock, flags);
+ spin_unlock(&pgd_lock);
}
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
spin_lock(&next->page_table_lock);
xen_pgd_pin(next);
spin_unlock(&next->page_table_lock);
}
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm);
@@ -1063,21 +1090,15 @@ static void drop_other_mm_ref(void *info)
struct mm_struct *mm = info;
struct mm_struct *active_mm;
-#ifdef CONFIG_X86_64
- active_mm = read_pda(active_mm);
-#else
- active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
-#endif
+ active_mm = this_cpu_read(cpu_tlbstate.active_mm);
- if (active_mm == mm)
+ if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+ if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
load_cr3(swapper_pg_dir);
- arch_flush_lazy_cpu_mode();
- }
}
static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1090,20 +1111,19 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
- arch_flush_lazy_cpu_mode();
}
/* Get the "official" set of cpus referring to our pagetable. */
if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+ if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
&& per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
continue;
smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
}
return;
}
- cpumask_copy(mask, &mm->cpu_vm_mask);
+ cpumask_copy(mask, mm_cpumask(mm));
/* It's possible that a vcpu may have a stale reference to our
cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1141,7 +1161,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
* pagetable because of lazy tlb flushing. This means we need need to
* switch all CPUs off this pagetable before we can unpin it.
*/
-void xen_exit_mmap(struct mm_struct *mm)
+static void xen_exit_mmap(struct mm_struct *mm)
{
get_cpu(); /* make sure we don't move around */
xen_drop_mm_ref(mm);
@@ -1156,65 +1176,1529 @@ void xen_exit_mmap(struct mm_struct *mm)
spin_unlock(&mm->page_table_lock);
}
-#ifdef CONFIG_XEN_DEBUG_FS
+static void xen_post_allocator_init(void);
+
+#ifdef CONFIG_X86_64
+static void __init xen_cleanhighmap(unsigned long vaddr,
+ unsigned long vaddr_end)
+{
+ unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
+ pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
+
+ /* NOTE: The loop is more greedy than the cleanup_highmap variant.
+ * We include the PMD passed in on _both_ boundaries. */
+ for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
+ pmd++, vaddr += PMD_SIZE) {
+ if (pmd_none(*pmd))
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > kernel_end)
+ set_pmd(pmd, __pmd(0));
+ }
+ /* In case we did something silly, we should crash in this function
+ * instead of somewhere later and be confusing. */
+ xen_mc_flush();
+}
+static void __init xen_pagetable_p2m_copy(void)
+{
+ unsigned long size;
+ unsigned long addr;
+ unsigned long new_mfn_list;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+
+ new_mfn_list = xen_revector_p2m_tree();
+ /* No memory or already called. */
+ if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
+ return;
+
+ /* using __ka address and sticking INVALID_P2M_ENTRY! */
+ memset((void *)xen_start_info->mfn_list, 0xff, size);
+
+ /* We should be in __ka space. */
+ BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
+ addr = xen_start_info->mfn_list;
+ /* We roundup to the PMD, which means that if anybody at this stage is
+ * using the __ka address of xen_start_info or xen_start_info->shared_info
+ * they are in going to crash. Fortunatly we have already revectored
+ * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+ size = roundup(size, PMD_SIZE);
+ xen_cleanhighmap(addr, addr + size);
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ memblock_free(__pa(xen_start_info->mfn_list), size);
+ /* And revector! Bye bye old array */
+ xen_start_info->mfn_list = new_mfn_list;
+
+ /* At this stage, cleanup_highmap has already cleaned __ka space
+ * from _brk_limit way up to the max_pfn_mapped (which is the end of
+ * the ramdisk). We continue on, erasing PMD entries that point to page
+ * tables - do note that they are accessible at this stage via __va.
+ * For good measure we also round up to the PMD - which means that if
+ * anybody is using __ka address to the initial boot-stack - and try
+ * to use it - they are going to crash. The xen_start_info has been
+ * taken care of already in xen_setup_kernel_pagetable. */
+ addr = xen_start_info->pt_base;
+ size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
+
+ xen_cleanhighmap(addr, addr + size);
+ xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
+#ifdef DEBUG
+ /* This is superflous and is not neccessary, but you know what
+ * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
+ * anything at this stage. */
+ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
+#endif
+}
+#endif
+
+static void __init xen_pagetable_init(void)
+{
+ paging_init();
+ xen_setup_shared_info();
+#ifdef CONFIG_X86_64
+ xen_pagetable_p2m_copy();
+#endif
+ xen_post_allocator_init();
+}
+static void xen_write_cr2(unsigned long cr2)
+{
+ this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+ return this_cpu_read(xen_vcpu)->arch.cr2;
+}
+
+unsigned long xen_read_cr2_direct(void)
+{
+ return this_cpu_read(xen_vcpu_info.arch.cr2);
+}
+
+void xen_flush_tlb_all(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb_all(0);
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_ALL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+static void xen_flush_tlb(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb(0);
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
-static struct dentry *d_mmu_debug;
+ preempt_enable();
+}
-static int __init xen_mmu_debugfs(void)
+static void xen_flush_tlb_single(unsigned long addr)
{
- struct dentry *d_xen = xen_init_debugfs();
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb_single(addr);
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = MMUEXT_INVLPG_LOCAL;
+ op->arg1.linear_addr = addr & PAGE_MASK;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ struct {
+ struct mmuext_op op;
+#ifdef CONFIG_SMP
+ DECLARE_BITMAP(mask, num_processors);
+#else
+ DECLARE_BITMAP(mask, NR_CPUS);
+#endif
+ } *args;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
+
+ if (cpumask_empty(cpus))
+ return; /* nothing to do */
+
+ mcs = xen_mc_entry(sizeof(*args));
+ args = mcs.args;
+ args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+ /* Remove us, and any offline CPUS. */
+ cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+
+ args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
+ args->op.cmd = MMUEXT_INVLPG_MULTI;
+ args->op.arg1.linear_addr = start;
+ }
+
+ MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+ return this_cpu_read(xen_cr3);
+}
+
+static void set_current_cr3(void *v)
+{
+ this_cpu_write(xen_current_cr3, (unsigned long)v);
+}
+
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+ struct mmuext_op op;
+ unsigned long mfn;
+
+ trace_xen_mmu_write_cr3(kernel, cr3);
+
+ if (cr3)
+ mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ else
+ mfn = 0;
+
+ WARN_ON(mfn == 0 && kernel);
+
+ op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+ op.arg1.mfn = mfn;
+
+ xen_extend_mmuext_op(&op);
+
+ if (kernel) {
+ this_cpu_write(xen_cr3, cr3);
+
+ /* Update xen_current_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
+ }
+}
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ this_cpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+ if (user_pgd)
+ __xen_write_cr3(false, __pa(user_pgd));
+ else
+ __xen_write_cr3(false, 0);
+ }
+#endif
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * At the start of the day - when Xen launches a guest, it has already
+ * built pagetables for the guest. We diligently look over them
+ * in xen_setup_kernel_pagetable and graft as appropiate them in the
+ * init_level4_pgt and its friends. Then when we are happy we load
+ * the new init_level4_pgt - and continue on.
+ *
+ * The generic code starts (start_kernel) and 'init_mem_mapping' sets
+ * up the rest of the pagetables. When it has completed it loads the cr3.
+ * N.B. that baremetal would start at 'start_kernel' (and the early
+ * #PF handler would create bootstrap pagetables) - so we are running
+ * with the same assumptions as what to do when write_cr3 is executed
+ * at this point.
+ *
+ * Since there are no user-page tables at all, we have two variants
+ * of xen_write_cr3 - the early bootup (this one), and the late one
+ * (xen_write_cr3). The reason we have to do that is that in 64-bit
+ * the Linux kernel and user-space are both in ring 3 while the
+ * hypervisor is in ring 0.
+ */
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ this_cpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+}
+#endif
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = mm->pgd;
+ int ret = 0;
+
+ BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+ {
+ struct page *page = virt_to_page(pgd);
+ pgd_t *user_pgd;
+
+ BUG_ON(page->private != 0);
+
+ ret = -ENOMEM;
+
+ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ page->private = (unsigned long)user_pgd;
+
+ if (user_pgd != NULL) {
+ user_pgd[pgd_index(VSYSCALL_ADDR)] =
+ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+ ret = 0;
+ }
+
+ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+ }
+#endif
+
+ return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd)
+ free_page((unsigned long)user_pgd);
+#endif
+}
+
+#ifdef CONFIG_X86_32
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+ if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
+
+ return pte;
+}
+#else /* CONFIG_X86_64 */
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ return pte;
+}
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive. At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+ if (pte_mfn(pte) != INVALID_P2M_ENTRY)
+ pte = mask_rw_pte(ptep, pte);
+ else
+ pte = __pte_ma(0);
+
+ native_set_pte(ptep, pte);
+}
+
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
+/* Early in boot, while setting up the initial pagetable, assume
+ everything is pinned. */
+static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+
+/* Used for pmd and pud */
+static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* Early release_pte assumes that all pts are pinned, since there's
+ only init_mm and anything attached to that is pinned. */
+static void __init xen_release_pte_init(unsigned long pfn)
+{
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
- if (d_xen == NULL)
+static void __init xen_release_pmd_init(unsigned long pfn)
+{
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *op;
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = cmd;
+ op->arg1.mfn = pfn_to_mfn(pfn);
+
+ MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+}
+
+static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
+{
+ struct multicall_space mcs;
+ unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
+
+ mcs = __xen_mc_entry(0);
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
+ pfn_pte(pfn, prot), 0);
+}
+
+/* This needs to make sure the new pte page is pinned iff its being
+ attached to a pinned pagetable. */
+static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
+ unsigned level)
+{
+ bool pinned = PagePinned(virt_to_page(mm->pgd));
+
+ trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
+
+ if (pinned) {
+ struct page *page = pfn_to_page(pfn);
+
+ SetPagePinned(page);
+
+ if (!PageHighMem(page)) {
+ xen_mc_batch();
+
+ __set_pfn_prot(pfn, PAGE_KERNEL_RO);
+
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ } else {
+ /* make sure there are no stray mappings of
+ this page */
+ kmap_flush_unused();
+ }
+ }
+}
+
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+/* This should never happen until we're OK to use struct page */
+static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+ struct page *page = pfn_to_page(pfn);
+ bool pinned = PagePinned(page);
+
+ trace_xen_mmu_release_ptpage(pfn, level, pinned);
+
+ if (pinned) {
+ if (!PageHighMem(page)) {
+ xen_mc_batch();
+
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+
+ __set_pfn_prot(pfn, PAGE_KERNEL);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ }
+ ClearPagePinned(page);
+ }
+}
+
+static void xen_release_pte(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pmd(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PMD);
+}
+
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+ unsigned long top = HYPERVISOR_VIRT_START;
+ struct xen_platform_parameters pp;
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+ top = pp.virt_start;
+
+ reserve_top_address(-top);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up.
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+ return (void *)(paddr + __START_KERNEL_map);
+#else
+ return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+ phys_addr_t paddr;
+
+ maddr &= PTE_PFN_MASK;
+ paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+ return __ka(m2p(maddr));
+}
+
+/* Set the page permissions on an identity-mapped pages */
+static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
+{
+ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+ pte_t pte = pfn_pte(pfn, prot);
+
+ /* For PVH no need to set R/O or R/W to pin them or unpin them. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
+ BUG();
+}
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+ return set_page_prot_flags(addr, prot, UVMF_NONE);
+}
+#ifdef CONFIG_X86_32
+static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+ unsigned pmdidx, pteidx;
+ unsigned ident_pte;
+ unsigned long pfn;
+
+ level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
+ PAGE_SIZE);
+
+ ident_pte = 0;
+ pfn = 0;
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+ pte_t *pte_page;
+
+ /* Reuse or allocate a page of ptes */
+ if (pmd_present(pmd[pmdidx]))
+ pte_page = m2v(pmd[pmdidx].pmd);
+ else {
+ /* Check for free pte pages */
+ if (ident_pte == LEVEL1_IDENT_ENTRIES)
+ break;
+
+ pte_page = &level1_ident_pgt[ident_pte];
+ ident_pte += PTRS_PER_PTE;
+
+ pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+ }
+
+ /* Install mappings */
+ for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+ pte_t pte;
+
+#ifdef CONFIG_X86_32
+ if (pfn > max_pfn_mapped)
+ max_pfn_mapped = pfn;
+#endif
+
+ if (!pte_none(pte_page[pteidx]))
+ continue;
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+ pte_page[pteidx] = pte;
+ }
+ }
+
+ for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+ set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+ set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+#endif
+void __init xen_setup_machphys_mapping(void)
+{
+ struct xen_machphys_mapping mapping;
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr = mapping.max_mfn + 1;
+ } else {
+ machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
+ }
+#ifdef CONFIG_X86_32
+ WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
+ < machine_to_phys_mapping);
+#endif
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+ pte_t *pte = v;
+ int i;
+
+ /* All levels are converted the same way, so just treat them
+ as ptes. */
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ pte[i] = xen_make_pte(pte[i].pte);
+}
+static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
+ unsigned long addr)
+{
+ if (*pt_base == PFN_DOWN(__pa(addr))) {
+ set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
+ clear_page((void *)addr);
+ (*pt_base)++;
+ }
+ if (*pt_end == PFN_DOWN(__pa(addr))) {
+ set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
+ clear_page((void *)addr);
+ (*pt_end)--;
+ }
+}
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables. We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working. We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ * NOTE: for PVH, the page tables are native.
+ */
+void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+ pud_t *l3;
+ pmd_t *l2;
+ unsigned long addr[3];
+ unsigned long pt_base, pt_end;
+ unsigned i;
+
+ /* max_pfn_mapped is the last pfn mapped in the initial memory
+ * mappings. Considering that on Xen after the kernel mappings we
+ * have the mappings of some pages that don't exist in pfn space, we
+ * set max_pfn_mapped to the last real pfn mapped. */
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+
+ pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
+ pt_end = pt_base + xen_start_info->nr_pt_frames;
+
+ /* Zap identity mapping */
+ init_level4_pgt[0] = __pgd(0);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ /* L4[272] -> level3_ident_pgt
+ * L4[511] -> level3_kernel_pgt */
+ convert_pfn_mfn(init_level4_pgt);
+
+ /* L3_i[0] -> level2_ident_pgt */
+ convert_pfn_mfn(level3_ident_pgt);
+ /* L3_k[510] -> level2_kernel_pgt
+ * L3_i[511] -> level2_fixmap_pgt */
+ convert_pfn_mfn(level3_kernel_pgt);
+ }
+ /* We get [511][511] and have Xen's version of level2_kernel_pgt */
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+ addr[0] = (unsigned long)pgd;
+ addr[1] = (unsigned long)l3;
+ addr[2] = (unsigned long)l2;
+ /* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
+ * Both L4[272][0] and L4[511][511] have entries that point to the same
+ * L2 (PMD) tables. Meaning that if you modify it in __va space
+ * it will be also modified in the __ka space! (But if you just
+ * modify the PMD table to point to other PTE's or none, then you
+ * are OK - which is what cleanup_highmap does) */
+ copy_page(level2_ident_pgt, l2);
+ /* Graft it onto L4[511][511] */
+ copy_page(level2_kernel_pgt, l2);
+
+ /* Get [511][510] and graft that in level2_fixmap_pgt */
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+ copy_page(level2_fixmap_pgt, l2);
+ /* Note that we don't do anything with level1_fixmap_pgt which
+ * we don't need. */
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Make pagetable pieces RO */
+ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ /*
+ * At this stage there can be no user pgd, and no page
+ * structure to attach it to, so make sure we just set kernel
+ * pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(init_level4_pgt));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+ } else
+ native_write_cr3(__pa(init_level4_pgt));
+
+ /* We can't that easily rip out L3 and L2, as the Xen pagetables are
+ * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
+ * the initial domain. For guests using the toolstack, they are in:
+ * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
+ * rip out the [L4] (pgd), but for guests we shave off three pages.
+ */
+ for (i = 0; i < ARRAY_SIZE(addr); i++)
+ check_pt_base(&pt_base, &pt_end, addr[i]);
+
+ /* Our (by three pages) smaller Xen pagetable that we are using */
+ memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+ /* Revector the xen_start_info */
+ xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
+}
+#else /* !CONFIG_X86_64 */
+static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
+
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+ unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
+
+ BUG_ON(read_cr3() != __pa(initial_page_table));
+ BUG_ON(cr3 != __pa(swapper_pg_dir));
+
+ /*
+ * We are switching to swapper_pg_dir for the first time (from
+ * initial_page_table) and therefore need to mark that page
+ * read-only and then pin it.
+ *
+ * Xen disallows sharing of kernel PMDs for PAE
+ * guests. Therefore we must copy the kernel PMD from
+ * initial_page_table into a new kernel PMD to be used in
+ * swapper_pg_dir.
+ */
+ swapper_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+ copy_page(swapper_kernel_pmd, initial_kernel_pmd);
+ swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
+ set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
+
+ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ xen_write_cr3(cr3);
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ set_page_prot(initial_page_table, PAGE_KERNEL);
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
+
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+}
+
+void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+ pmd_t *kernel_pmd;
+
+ initial_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+ xen_start_info->nr_pt_frames * PAGE_SIZE +
+ 512*1024);
+
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+ copy_page(initial_kernel_pmd, kernel_pmd);
+
+ xen_map_identity_early(initial_kernel_pmd, max_pfn);
+
+ copy_page(initial_page_table, pgd);
+ initial_page_table[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
+
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
+ set_page_prot(initial_page_table, PAGE_KERNEL_RO);
+ set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ xen_write_cr3(__pa(initial_page_table));
+
+ memblock_reserve(__pa(xen_start_info->pt_base),
+ xen_start_info->nr_pt_frames * PAGE_SIZE);
+}
+#endif /* CONFIG_X86_64 */
+
+static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
+
+static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+{
+ pte_t pte;
+
+ phys >>= PAGE_SHIFT;
+
+ switch (idx) {
+ case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+ case FIX_RO_IDT:
+#ifdef CONFIG_X86_32
+ case FIX_WP_TEST:
+# ifdef CONFIG_HIGHMEM
+ case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+ case VSYSCALL_PAGE:
+#endif
+ case FIX_TEXT_POKE0:
+ case FIX_TEXT_POKE1:
+ /* All local page mappings */
+ pte = pfn_pte(phys, prot);
+ break;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ case FIX_APIC_BASE: /* maps dummy local APIC */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
+ /*
+ * We just don't map the IO APIC - all access is via
+ * hypercalls. Keep the address in the pte for reference.
+ */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+ case FIX_PARAVIRT_BOOTMAP:
+ /* This is an MFN, but it isn't an IO mapping from the
+ IO domain */
+ pte = mfn_pte(phys, prot);
+ break;
+
+ default:
+ /* By default, set_fixmap is used for hardware mappings */
+ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+ break;
+ }
+
+ __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+ /* Replicate changes to map the vsyscall page into the user
+ pagetable vsyscall mapping. */
+ if (idx == VSYSCALL_PAGE) {
+ unsigned long vaddr = __fix_to_virt(idx);
+ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+ }
+#endif
+}
+
+static void __init xen_post_allocator_init(void)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ pv_mmu_ops.set_pte = xen_set_pte;
+ pv_mmu_ops.set_pmd = xen_set_pmd;
+ pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ pv_mmu_ops.alloc_pte = xen_alloc_pte;
+ pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+ pv_mmu_ops.release_pte = xen_release_pte;
+ pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.alloc_pud = xen_alloc_pud;
+ pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+
+#ifdef CONFIG_X86_64
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+ xen_mark_init_mm_pinned();
+}
+
+static void xen_leave_lazy_mmu(void)
+{
+ preempt_disable();
+ xen_mc_flush();
+ paravirt_leave_lazy_mmu();
+ preempt_enable();
+}
+
+static const struct pv_mmu_ops xen_mmu_ops __initconst = {
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3_init,
+
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_single = xen_flush_tlb_single,
+ .flush_tlb_others = xen_flush_tlb_others,
+
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,
+
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
+
+ .alloc_pte = xen_alloc_pte_init,
+ .release_pte = xen_release_pte_init,
+ .alloc_pmd = xen_alloc_pmd_init,
+ .release_pmd = xen_release_pmd_init,
+
+ .set_pte = xen_set_pte_init,
+ .set_pte_at = xen_set_pte_at,
+ .set_pmd = xen_set_pmd_hyper,
+
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+ .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+ .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+
+ .make_pte = PV_CALLEE_SAVE(xen_make_pte),
+ .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = xen_set_pte_atomic,
+ .pte_clear = xen_pte_clear,
+ .pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+ .set_pud = xen_set_pud_hyper,
+
+ .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+ .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+
+#if PAGETABLE_LEVELS == 4
+ .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+ .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+ .set_pgd = xen_set_pgd_hyper,
+
+ .alloc_pud = xen_alloc_pmd_init,
+ .release_pud = xen_release_pmd_init,
+#endif /* PAGETABLE_LEVELS == 4 */
+
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
+
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_mmu,
+ .leave = xen_leave_lazy_mmu,
+ .flush = paravirt_flush_lazy_mmu,
+ },
+
+ .set_fixmap = xen_set_fixmap,
+};
+
+void __init xen_init_mmu_ops(void)
+{
+ x86_init.paging.pagetable_init = xen_pagetable_init;
+
+ /* Optimization - we can use the HVM one but it has no idea which
+ * VCPUs are descheduled - which means that it will needlessly IPI
+ * them. Xen knows so let it do the job.
+ */
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+ return;
+ }
+ pv_mmu_ops = xen_mmu_ops;
+
+ memset(dummy_mapping, 0xff, PAGE_SIZE);
+}
+
+/* Protected by xen_reservation_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
+
+#define VOID_PTE (mfn_pte(0, __pgprot(0)))
+static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
+ unsigned long *in_frames,
+ unsigned long *out_frames)
+{
+ int i;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
+ mcs = __xen_mc_entry(0);
+
+ if (in_frames)
+ in_frames[i] = virt_to_mfn(vaddr);
+
+ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
+ __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+
+ if (out_frames)
+ out_frames[i] = virt_to_pfn(vaddr);
+ }
+ xen_mc_issue(0);
+}
+
+/*
+ * Update the pfn-to-mfn mappings for a virtual address range, either to
+ * point to an array of mfns, or contiguously from a single starting
+ * mfn.
+ */
+static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
+ unsigned long *mfns,
+ unsigned long first_mfn)
+{
+ unsigned i, limit;
+ unsigned long mfn;
+
+ xen_mc_batch();
+
+ limit = 1u << order;
+ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
+ struct multicall_space mcs;
+ unsigned flags;
+
+ mcs = __xen_mc_entry(0);
+ if (mfns)
+ mfn = mfns[i];
+ else
+ mfn = first_mfn + i;
+
+ if (i < (limit - 1))
+ flags = 0;
+ else {
+ if (order == 0)
+ flags = UVMF_INVLPG | UVMF_ALL;
+ else
+ flags = UVMF_TLB_FLUSH | UVMF_ALL;
+ }
+
+ MULTI_update_va_mapping(mcs.mc, vaddr,
+ mfn_pte(mfn, PAGE_KERNEL), flags);
+
+ set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ }
+
+ xen_mc_issue(0);
+}
+
+/*
+ * Perform the hypercall to exchange a region of our pfns to point to
+ * memory with the required contiguous alignment. Takes the pfns as
+ * input, and populates mfns as output.
+ *
+ * Returns a success code indicating whether the hypervisor was able to
+ * satisfy the request or not.
+ */
+static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
+ unsigned long *pfns_in,
+ unsigned long extents_out,
+ unsigned int order_out,
+ unsigned long *mfns_out,
+ unsigned int address_bits)
+{
+ long rc;
+ int success;
+
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = extents_in,
+ .extent_order = order_in,
+ .extent_start = pfns_in,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = extents_out,
+ .extent_order = order_out,
+ .extent_start = mfns_out,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ BUG_ON(extents_in << order_in != extents_out << order_out);
+
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == extents_in);
+
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+
+ return success;
+}
+
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+ unsigned int address_bits,
+ dma_addr_t *dma_handle)
+{
+ unsigned long *in_frames = discontig_frames, out_frame;
+ unsigned long flags;
+ int success;
+ unsigned long vstart = (unsigned long)phys_to_virt(pstart);
+
+ /*
+ * Currently an auto-translated guest will not perform I/O, nor will
+ * it require PAE page directories below 4GB. Therefore any calls to
+ * this function are redundant and can be ignored.
+ */
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
return -ENOMEM;
- d_mmu_debug = debugfs_create_dir("mmu", d_xen);
-
- debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
-
- debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
- debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pgd_update_pinned);
- debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pgd_update_pinned);
-
- debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
- debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pud_update_pinned);
- debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pud_update_pinned);
-
- debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
- debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pmd_update_pinned);
- debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pmd_update_pinned);
-
- debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
-// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
-// &mmu_stats.pte_update_pinned);
- debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pte_update_pinned);
-
- debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
- debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
- &mmu_stats.mmu_update_extended);
- xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
- mmu_stats.mmu_update_histo, 20);
-
- debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
- debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_batched);
- debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_current);
- debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_kernel);
-
- debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
- debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
- &mmu_stats.prot_commit_batched);
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Zap current PTEs, remembering MFNs. */
+ xen_zap_pfn_range(vstart, order, in_frames, NULL);
+
+ /* 2. Get a new contiguous memory extent. */
+ out_frame = virt_to_pfn(vstart);
+ success = xen_exchange_memory(1UL << order, 0, in_frames,
+ 1, order, &out_frame,
+ address_bits);
+
+ /* 3. Map the new extent in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
+ else
+ xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ *dma_handle = virt_to_machine(vstart).maddr;
+ return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
+
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
+{
+ unsigned long *out_frames = discontig_frames, in_frame;
+ unsigned long flags;
+ int success;
+ unsigned long vstart;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return;
+
+ vstart = (unsigned long)phys_to_virt(pstart);
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Find start MFN of contiguous extent. */
+ in_frame = virt_to_mfn(vstart);
+
+ /* 2. Zap current PTEs. */
+ xen_zap_pfn_range(vstart, order, NULL, out_frames);
+
+ /* 3. Do the exchange for non-contiguous MFNs. */
+ success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
+ 0, out_frames, 0);
+
+ /* 4. Map new pages in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
+ else
+ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+#ifdef CONFIG_XEN_PVHVM
+#ifdef CONFIG_PROC_VMCORE
+/*
+ * This function is used in two contexts:
+ * - the kdump kernel has to check whether a pfn of the crashed kernel
+ * was a ballooned page. vmcore is using this function to decide
+ * whether to access a pfn of the crashed kernel.
+ * - the kexec kernel has to check whether a pfn was ballooned by the
+ * previous kernel. If the pfn is ballooned, handle it properly.
+ * Returns 0 if the pfn is not backed by a RAM page, the caller may
+ * handle the pfn special in this case.
+ */
+static int xen_oldmem_pfn_is_ram(unsigned long pfn)
+{
+ struct xen_hvm_get_mem_type a = {
+ .domid = DOMID_SELF,
+ .pfn = pfn,
+ };
+ int ram;
+
+ if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
+ return -ENXIO;
+
+ switch (a.mem_type) {
+ case HVMMEM_mmio_dm:
+ ram = 0;
+ break;
+ case HVMMEM_ram_rw:
+ case HVMMEM_ram_ro:
+ default:
+ ram = 1;
+ break;
+ }
+
+ return ram;
+}
+#endif
+
+static void xen_hvm_exit_mmap(struct mm_struct *mm)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc;
+
+ a.domid = DOMID_SELF;
+ a.gpa = __pa(mm->pgd);
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ WARN_ON_ONCE(rc < 0);
+}
+
+static int is_pagetable_dying_supported(void)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc = 0;
+
+ a.domid = DOMID_SELF;
+ a.gpa = 0x00;
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ if (rc < 0) {
+ printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
+ return 0;
+ }
+ return 1;
+}
+
+void __init xen_hvm_init_mmu_ops(void)
+{
+ if (is_pagetable_dying_supported())
+ pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+#ifdef CONFIG_PROC_VMCORE
+ register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
+#endif
+}
+#endif
+
+#ifdef CONFIG_XEN_PVH
+/*
+ * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
+ * space creating new guest on pvh dom0 and needing to map domU pages.
+ */
+static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
+ unsigned int domid)
+{
+ int rc, err = 0;
+ xen_pfn_t gpfn = lpfn;
+ xen_ulong_t idx = fgfn;
+
+ struct xen_add_to_physmap_range xatp = {
+ .domid = DOMID_SELF,
+ .foreign_domid = domid,
+ .size = 1,
+ .space = XENMAPSPACE_gmfn_foreign,
+ };
+ set_xen_guest_handle(xatp.idxs, &idx);
+ set_xen_guest_handle(xatp.gpfns, &gpfn);
+ set_xen_guest_handle(xatp.errs, &err);
+
+ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+ if (rc < 0)
+ return rc;
+ return err;
+}
+
+static int xlate_remove_from_p2m(unsigned long spfn, int count)
+{
+ struct xen_remove_from_physmap xrp;
+ int i, rc;
+
+ for (i = 0; i < count; i++) {
+ xrp.domid = DOMID_SELF;
+ xrp.gpfn = spfn+i;
+ rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+ if (rc)
+ break;
+ }
+ return rc;
+}
+
+struct xlate_remap_data {
+ unsigned long fgfn; /* foreign domain's gfn */
+ pgprot_t prot;
+ domid_t domid;
+ int index;
+ struct page **pages;
+};
+
+static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+ void *data)
+{
+ int rc;
+ struct xlate_remap_data *remap = data;
+ unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
+ pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
+
+ rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
+ if (rc)
+ return rc;
+ native_set_pte(ptep, pteval);
return 0;
}
-fs_initcall(xen_mmu_debugfs);
-#endif /* CONFIG_XEN_DEBUG_FS */
+static int xlate_remap_gfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long mfn,
+ int nr, pgprot_t prot, unsigned domid,
+ struct page **pages)
+{
+ int err;
+ struct xlate_remap_data pvhdata;
+
+ BUG_ON(!pages);
+
+ pvhdata.fgfn = mfn;
+ pvhdata.prot = prot;
+ pvhdata.domid = domid;
+ pvhdata.index = 0;
+ pvhdata.pages = pages;
+ err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
+ xlate_map_pte_fn, &pvhdata);
+ flush_tlb_all();
+ return err;
+}
+#endif
+
+#define REMAP_BATCH_SIZE 16
+
+struct remap_data {
+ unsigned long mfn;
+ pgprot_t prot;
+ struct mmu_update *mmu_update;
+};
+
+static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct remap_data *rmd = data;
+ pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
+
+ rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
+ rmd->mmu_update->val = pte_val_ma(pte);
+ rmd->mmu_update++;
+
+ return 0;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t mfn, int nr,
+ pgprot_t prot, unsigned domid,
+ struct page **pages)
+
+{
+ struct remap_data rmd;
+ struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+ int batch;
+ unsigned long range;
+ int err = 0;
+
+ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef CONFIG_XEN_PVH
+ /* We need to update the local page tables and the xen HAP */
+ return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
+ domid, pages);
+#else
+ return -EINVAL;
+#endif
+ }
+
+ rmd.mfn = mfn;
+ rmd.prot = prot;
+
+ while (nr) {
+ batch = min(REMAP_BATCH_SIZE, nr);
+ range = (unsigned long)batch << PAGE_SHIFT;
+
+ rmd.mmu_update = mmu_update;
+ err = apply_to_page_range(vma->vm_mm, addr, range,
+ remap_area_mfn_pte_fn, &rmd);
+ if (err)
+ goto out;
+
+ err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
+ if (err < 0)
+ goto out;
+
+ nr -= batch;
+ addr += range;
+ }
+
+ err = 0;
+out:
+
+ xen_flush_tlb_all();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+
+/* Returns: 0 success */
+int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+ int numpgs, struct page **pages)
+{
+ if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+#ifdef CONFIG_XEN_PVH
+ while (numpgs--) {
+ /*
+ * The mmu has already cleaned up the process mmu
+ * resources at this point (lookup_address will return
+ * NULL).
+ */
+ unsigned long pfn = page_to_pfn(pages[numpgs]);
+
+ xlate_remove_from_p2m(pfn, 1);
+ }
+ /*
+ * We don't need to flush tlbs because as part of
+ * xlate_remove_from_p2m, the hypervisor will do tlb flushes
+ * after removing the p2m entries from the EPT/NPT
+ */
+ return 0;
+#else
+ return -EINVAL;
+#endif
+}
+EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5..73809bb951b 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -11,47 +11,16 @@ enum pt_level {
};
-void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-void xen_exit_mmap(struct mm_struct *mm);
-
-pteval_t xen_pte_val(pte_t);
-pmdval_t xen_pmd_val(pmd_t);
-pgdval_t xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(pteval_t);
-pmd_t xen_make_pmd(pmdval_t);
-pgd_t xen_make_pgd(pgdval_t);
-
-void xen_set_pte(pte_t *ptep, pte_t pteval);
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval);
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-#ifdef CONFIG_X86_PAE
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
-#endif /* CONFIG_X86_PAE */
-
-void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
-void xen_set_pud(pud_t *ptr, pud_t val);
-void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
-void xen_set_pud_hyper(pud_t *ptr, pud_t val);
-
-#if PAGETABLE_LEVELS == 4
-pudval_t xen_pud_val(pud_t pud);
-pud_t xen_make_pud(pudval_t pudval);
-void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
-void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
-#endif
-
-pgd_t *xen_get_user_pgd(pgd_t *pgd);
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
+unsigned long xen_read_cr2_direct(void);
+
+extern void xen_init_mmu_ops(void);
+extern void xen_hvm_init_mmu_ops(void);
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c738644b543..0d82003e76a 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -30,100 +30,32 @@
#define MC_BATCH 32
-#define MC_DEBUG 1
+#define MC_DEBUG 0
#define MC_ARGS (MC_BATCH * 16)
struct mc_buffer {
+ unsigned mcidx, argidx, cbidx;
struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
struct multicall_entry debug[MC_BATCH];
+ void *caller[MC_BATCH];
#endif
unsigned char args[MC_ARGS];
struct callback {
void (*fn)(void *);
void *data;
} callbacks[MC_BATCH];
- unsigned mcidx, argidx, cbidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
-/* flush reasons 0- slots, 1- args, 2- callbacks */
-enum flush_reasons
-{
- FL_SLOTS,
- FL_ARGS,
- FL_CALLBACKS,
-
- FL_N_REASONS
-};
-
-#ifdef CONFIG_XEN_DEBUG_FS
-#define NHYPERCALLS 40 /* not really */
-
-static struct {
- unsigned histo[MC_BATCH+1];
-
- unsigned issued;
- unsigned arg_total;
- unsigned hypercalls;
- unsigned histo_hypercalls[NHYPERCALLS];
-
- unsigned flush[FL_N_REASONS];
-} mc_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- if (unlikely(zero_stats)) {
- memset(&mc_stats, 0, sizeof(mc_stats));
- zero_stats = 0;
- }
-}
-
-static void mc_add_stats(const struct mc_buffer *mc)
-{
- int i;
-
- check_zero();
-
- mc_stats.issued++;
- mc_stats.hypercalls += mc->mcidx;
- mc_stats.arg_total += mc->argidx;
-
- mc_stats.histo[mc->mcidx]++;
- for(i = 0; i < mc->mcidx; i++) {
- unsigned op = mc->entries[i].op;
- if (op < NHYPERCALLS)
- mc_stats.histo_hypercalls[op]++;
- }
-}
-
-static void mc_stats_flush(enum flush_reasons idx)
-{
- check_zero();
-
- mc_stats.flush[idx]++;
-}
-
-#else /* !CONFIG_XEN_DEBUG_FS */
-
-static inline void mc_add_stats(const struct mc_buffer *mc)
-{
-}
-
-static inline void mc_stats_flush(enum flush_reasons idx)
-{
-}
-#endif /* CONFIG_XEN_DEBUG_FS */
-
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct multicall_entry *mc;
int ret = 0;
unsigned long flags;
int i;
@@ -134,9 +66,26 @@ void xen_mc_flush(void)
something in the middle */
local_irq_save(flags);
- mc_add_stats(b);
+ trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
+
+ switch (b->mcidx) {
+ case 0:
+ /* no-op */
+ BUG_ON(b->argidx != 0);
+ break;
+
+ case 1:
+ /* Singleton multicall - bypass multicall machinery
+ and just do the call directly. */
+ mc = &b->entries[0];
+
+ mc->result = privcmd_call(mc->op,
+ mc->args[0], mc->args[1], mc->args[2],
+ mc->args[3], mc->args[4]);
+ ret = mc->result < 0;
+ break;
- if (b->mcidx) {
+ default:
#if MC_DEBUG
memcpy(b->debug, b->entries,
b->mcidx * sizeof(struct multicall_entry));
@@ -154,21 +103,19 @@ void xen_mc_flush(void)
ret, smp_processor_id());
dump_stack();
for (i = 0; i < b->mcidx; i++) {
- printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+ printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
- b->entries[i].result);
+ b->entries[i].result,
+ b->caller[i]);
}
}
#endif
+ }
- b->mcidx = 0;
- b->argidx = 0;
- } else
- BUG_ON(b->argidx != 0);
-
- local_irq_restore(flags);
+ b->mcidx = 0;
+ b->argidx = 0;
for (i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
@@ -177,7 +124,9 @@ void xen_mc_flush(void)
}
b->cbidx = 0;
- BUG_ON(ret);
+ local_irq_restore(flags);
+
+ WARN_ON(ret);
}
struct multicall_space __xen_mc_entry(size_t args)
@@ -186,22 +135,28 @@ struct multicall_space __xen_mc_entry(size_t args)
struct multicall_space ret;
unsigned argidx = roundup(b->argidx, sizeof(u64));
+ trace_xen_mc_entry_alloc(args);
+
BUG_ON(preemptible());
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
- if (b->mcidx == MC_BATCH ||
- (argidx + args) > MC_ARGS) {
- mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
+ if (unlikely(b->mcidx == MC_BATCH ||
+ (argidx + args) >= MC_ARGS)) {
+ trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
+ XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
xen_mc_flush();
argidx = roundup(b->argidx, sizeof(u64));
}
ret.mc = &b->entries[b->mcidx];
+#if MC_DEBUG
+ b->caller[b->mcidx] = __builtin_return_address(0);
+#endif
b->mcidx++;
ret.args = &b->args[argidx];
b->argidx = argidx + args;
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
return ret;
}
@@ -211,22 +166,27 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
struct multicall_space ret = { NULL, NULL };
BUG_ON(preemptible());
- BUG_ON(b->argidx > MC_ARGS);
-
- if (b->mcidx == 0)
- return ret;
+ BUG_ON(b->argidx >= MC_ARGS);
- if (b->entries[b->mcidx - 1].op != op)
- return ret;
+ if (unlikely(b->mcidx == 0 ||
+ b->entries[b->mcidx - 1].op != op)) {
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
+ goto out;
+ }
- if ((b->argidx + size) > MC_ARGS)
- return ret;
+ if (unlikely((b->argidx + size) >= MC_ARGS)) {
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
+ goto out;
+ }
ret.mc = &b->entries[b->mcidx - 1];
ret.args = &b->args[b->argidx];
b->argidx += size;
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
+
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
+out:
return ret;
}
@@ -236,43 +196,13 @@ void xen_mc_callback(void (*fn)(void *), void *data)
struct callback *cb;
if (b->cbidx == MC_BATCH) {
- mc_stats_flush(FL_CALLBACKS);
+ trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
xen_mc_flush();
}
+ trace_xen_mc_callback(fn, data);
+
cb = &b->callbacks[b->cbidx++];
cb->fn = fn;
cb->data = data;
}
-
-#ifdef CONFIG_XEN_DEBUG_FS
-
-static struct dentry *d_mc_debug;
-
-static int __init xen_mc_debugfs(void)
-{
- struct dentry *d_xen = xen_init_debugfs();
-
- if (d_xen == NULL)
- return -ENOMEM;
-
- d_mc_debug = debugfs_create_dir("multicalls", d_xen);
-
- debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
-
- debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
- debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
- debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
-
- xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
- mc_stats.histo, MC_BATCH);
- xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
- mc_stats.histo_hypercalls, NHYPERCALLS);
- xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
- mc_stats.flush, FL_N_REASONS);
-
- return 0;
-}
-fs_initcall(xen_mc_debugfs);
-
-#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index fa3e10725d9..9c2e74f9096 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -1,6 +1,8 @@
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
+#include <trace/events/xen.h>
+
#include "xen-ops.h"
/* Multicalls */
@@ -20,9 +22,11 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
static inline void xen_mc_batch(void)
{
unsigned long flags;
+
/* need to disable interrupts until this entry is complete */
local_irq_save(flags);
- __get_cpu_var(xen_mc_irq_flags) = flags;
+ trace_xen_mc_batch(paravirt_get_lazy_mode());
+ __this_cpu_write(xen_mc_irq_flags, flags);
}
static inline struct multicall_space xen_mc_entry(size_t args)
@@ -37,11 +41,13 @@ void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
+ trace_xen_mc_issue(mode);
+
if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
- local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+ local_irq_restore(this_cpu_read(xen_mc_irq_flags));
}
/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 00000000000..9bb3d82ffec
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,1340 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top p2m_top_mfn
+ * / \ / \
+ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
+ * / \ / \ / /
+ * p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ *
+ * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
+ *
+ * However not all entries are filled with MFNs. Specifically for all other
+ * leaf entries, or for the top root, or middle one, for which there is a void
+ * entry, we assume it is "missing". So (for example)
+ * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
+ *
+ * We also have the possibility of setting 1-1 mappings on certain regions, so
+ * that:
+ * pfn_to_mfn(0xc0000)=0xc0000
+ *
+ * The benefit of this is, that we can assume for non-RAM regions (think
+ * PCI BARs, or ACPI spaces), we can create mappings easily because we
+ * get the PFN value to match the MFN.
+ *
+ * For this to work efficiently we have one new page p2m_identity and
+ * allocate (via reserved_brk) any other pages we need to cover the sides
+ * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
+ * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
+ * no other fancy value).
+ *
+ * On lookup we spot that the entry points to p2m_identity and return the
+ * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
+ * If the entry points to an allocated page, we just proceed as before and
+ * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
+ * appropriate functions (pfn_to_mfn).
+ *
+ * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
+ * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
+ * non-identity pfn. To protect ourselves against we elect to set (and get) the
+ * IDENTITY_FRAME_BIT on all identity mapped PFNs.
+ *
+ * This simplistic diagram is used to explain the more subtle piece of code.
+ * There is also a digram of the P2M at the end that can help.
+ * Imagine your E820 looking as so:
+ *
+ * 1GB 2GB 4GB
+ * /-------------------+---------\/----\ /----------\ /---+-----\
+ * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
+ * \-------------------+---------/\----/ \----------/ \---+-----/
+ * ^- 1029MB ^- 2001MB
+ *
+ * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
+ * 2048MB = 524288 (0x80000)]
+ *
+ * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
+ * is actually not present (would have to kick the balloon driver to put it in).
+ *
+ * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
+ * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
+ * of the PFN and the end PFN (263424 and 512256 respectively). The first step
+ * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
+ * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
+ * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
+ * required to split any existing p2m_mid_missing middle pages.
+ *
+ * With the E820 example above, 263424 is not 1GB aligned so we allocate a
+ * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
+ * Each entry in the allocate page is "missing" (points to p2m_missing).
+ *
+ * Next stage is to determine if we need to do a more granular boundary check
+ * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
+ * We check if the start pfn and end pfn violate that boundary check, and if
+ * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
+ * granularity of setting which PFNs are missing and which ones are identity.
+ * In our example 263424 and 512256 both fail the check so we reserve_brk two
+ * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
+ * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
+ *
+ * At this point we would at minimum reserve_brk one page, but could be up to
+ * three. Each call to set_phys_range_identity has at maximum a three page
+ * cost. If we were to query the P2M at this stage, all those entries from
+ * start PFN through end PFN (so 1029MB -> 2001MB) would return
+ * INVALID_P2M_ENTRY ("missing").
+ *
+ * The next step is to walk from the start pfn to the end pfn setting
+ * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
+ * If we find that the middle entry is pointing to p2m_missing we can swap it
+ * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
+ * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
+ * At this point we do not need to worry about boundary aligment (so no need to
+ * reserve_brk a middle page, figure out which PFNs are "missing" and which
+ * ones are identity), as that has been done earlier. If we find that the
+ * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
+ * that page (which covers 512 PFNs) and set the appropriate PFN with
+ * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
+ * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
+ * IDENTITY_FRAME_BIT set.
+ *
+ * All other regions that are void (or not filled) either point to p2m_missing
+ * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
+ * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
+ * contain the INVALID_P2M_ENTRY value and are considered "missing."
+ *
+ * Finally, the region beyond the end of of the E820 (4 GB in this example)
+ * is set to be identity (in case there are MMIO regions placed here).
+ *
+ * This is what the p2m ends up looking (for the E820 above) with this
+ * fabulous drawing:
+ *
+ * p2m /--------------\
+ * /-----\ | &mfn_list[0],| /-----------------\
+ * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
+ * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
+ * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
+ * |-----| \ | [p2m_identity]+\\ | .... |
+ * | 2 |--\ \-------------------->| ... | \\ \----------------/
+ * |-----| \ \---------------/ \\
+ * | 3 |-\ \ \\ p2m_identity [1]
+ * |-----| \ \-------------------->/---------------\ /-----------------\
+ * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
+ * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
+ * | | | .... | \-----------------/
+ * | | +-[x], ~0, ~0.. +\
+ * | | \---------------/ \
+ * | | \-> /---------------\
+ * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
+ * | /-----------------\ /------------\ | IDENTITY[@256]|
+ * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
+ * | | [p2m_missing] +---->| ..., ~0 | \---------------/
+ * | | ... | \------------/
+ * | \-----------------/
+ * |
+ * | p2m_mid_identity
+ * | /-----------------\
+ * \-->| [p2m_identity] +---->[1]
+ * | [p2m_identity] +---->[1]
+ * | ... |
+ * \-----------------/
+ *
+ * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+
+#include <asm/cache.h>
+#include <asm/setup.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/grant_table.h>
+
+#include "multicalls.h"
+#include "xen-ops.h"
+
+static void __init m2p_override_init(void);
+
+unsigned long xen_max_p2m_pfn __read_mostly;
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+
+/* We might hit two boundary violations at the start and end, at max each
+ * boundary violation will require three middle nodes. */
+RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
+
+/* When we populate back during bootup, the amount of pages can vary. The
+ * max we have is seen is 395979, but that does not mean it can't be more.
+ * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
+ * it can re-use Xen provided mfn_list array, so we only need to allocate at
+ * most three P2M top nodes. */
+RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+ return pfn % P2M_PER_PAGE;
+}
+
+static void p2m_top_init(unsigned long ***top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid, unsigned long *leaf)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = leaf;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(leaf);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ * tree should alreay be completely allocated.
+ */
+void __ref xen_build_mfn_list_list(void)
+{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
+
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise, mfn's all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
+ }
+
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned long **mid;
+ unsigned long *mid_mfn_p;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+
+ /* Don't bother allocating any mfn mid levels if
+ * they're just missing, just update the stored mfn,
+ * since all could have changed over a migrate.
+ */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ /*
+ * XXX boot-time only! We should never find
+ * missing parts of the mfn tree after
+ * runtime. extend_brk() will BUG if we call
+ * it too late.
+ */
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ }
+}
+
+void xen_setup_mfn_list_list(void)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+ unsigned long *mfn_list;
+ unsigned long max_pfn;
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ mfn_list = (unsigned long *)xen_start_info->mfn_list;
+ max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+ xen_max_p2m_pfn = max_pfn;
+
+ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_missing);
+ p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_identity);
+
+ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_missing, p2m_missing);
+ p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_identity, p2m_identity);
+
+ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_init(p2m_top);
+
+ /*
+ * The domain builder gives us a pre-constructed p2m array in
+ * mfn_list for all the pages initially given to us, so we just
+ * need to graft that into our tree structure.
+ */
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(mid, p2m_missing);
+
+ p2m_top[topidx] = mid;
+ }
+
+ /*
+ * As long as the mfn_list has enough entries to completely
+ * fill a p2m page, pointing into the array is ok. But if
+ * not the entries beyond the last pfn will be undefined.
+ */
+ if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
+ unsigned long p2midx;
+
+ p2midx = max_pfn % P2M_PER_PAGE;
+ for ( ; p2midx < P2M_PER_PAGE; p2midx++)
+ mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
+ }
+ p2m_top[topidx][mididx] = &mfn_list[pfn];
+ }
+
+ m2p_override_init();
+}
+#ifdef CONFIG_X86_64
+#include <linux/bootmem.h>
+unsigned long __init xen_revector_p2m_tree(void)
+{
+ unsigned long va_start;
+ unsigned long va_end;
+ unsigned long pfn;
+ unsigned long pfn_free = 0;
+ unsigned long *mfn_list = NULL;
+ unsigned long size;
+
+ va_start = xen_start_info->mfn_list;
+ /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long),
+ * so make sure it is rounded up to that */
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ va_end = va_start + size;
+
+ /* If we were revectored already, don't do it again. */
+ if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET)
+ return 0;
+
+ mfn_list = alloc_bootmem_align(size, PAGE_SIZE);
+ if (!mfn_list) {
+ pr_warn("Could not allocate space for a new P2M tree!\n");
+ return xen_start_info->mfn_list;
+ }
+ /* Fill it out with INVALID_P2M_ENTRY value */
+ memset(mfn_list, 0xFF, size);
+
+ for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx;
+ unsigned long *mid_p;
+
+ if (!p2m_top[topidx])
+ continue;
+
+ if (p2m_top[topidx] == p2m_mid_missing)
+ continue;
+
+ mididx = p2m_mid_index(pfn);
+ mid_p = p2m_top[topidx][mididx];
+ if (!mid_p)
+ continue;
+ if ((mid_p == p2m_missing) || (mid_p == p2m_identity))
+ continue;
+
+ if ((unsigned long)mid_p == INVALID_P2M_ENTRY)
+ continue;
+
+ /* The old va. Rebase it on mfn_list */
+ if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) {
+ unsigned long *new;
+
+ if (pfn_free > (size / sizeof(unsigned long))) {
+ WARN(1, "Only allocated for %ld pages, but we want %ld!\n",
+ size / sizeof(unsigned long), pfn_free);
+ return 0;
+ }
+ new = &mfn_list[pfn_free];
+
+ copy_page(new, mid_p);
+ p2m_top[topidx][mididx] = &mfn_list[pfn_free];
+ p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]);
+
+ pfn_free += P2M_PER_PAGE;
+
+ }
+ /* This should be the leafs allocated for identity from _brk. */
+ }
+ return (unsigned long)mfn_list;
+
+}
+#else
+unsigned long __init xen_revector_p2m_tree(void)
+{
+ return 0;
+}
+#endif
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN))
+ return IDENTITY_FRAME(pfn);
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /*
+ * The INVALID_P2M_ENTRY is filled in both p2m_*identity
+ * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
+ * would be wrong.
+ */
+ if (p2m_top[topidx][mididx] == p2m_identity)
+ return IDENTITY_FRAME(pfn);
+
+ return p2m_top[topidx][mididx][idx];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
+
+static void *alloc_p2m_page(void)
+{
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+static void free_p2m_page(void *p)
+{
+ free_page((unsigned long)p);
+}
+
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx;
+ unsigned long ***top_p, **mid;
+ unsigned long *top_mfn_p, *mid_mfn;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ top_p = &p2m_top[topidx];
+ mid = *top_p;
+
+ if (mid == p2m_mid_missing) {
+ /* Mid level is missing, allocate a new one */
+ mid = alloc_p2m_page();
+ if (!mid)
+ return false;
+
+ p2m_mid_init(mid, p2m_missing);
+
+ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+ free_p2m_page(mid);
+ }
+
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = p2m_top_mfn_p[topidx];
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
+
+ p2m_mid_mfn_init(mid_mfn, p2m_missing);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_identity ||
+ p2m_top[topidx][mididx] == p2m_missing) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+ unsigned long *p2m_orig = p2m_top[topidx][mididx];
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return false;
+
+ p2m_init(p2m);
+
+ if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
+ free_p2m_page(p2m);
+ else
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ }
+
+ return true;
+}
+
+static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
+{
+ unsigned topidx, mididx, idx;
+ unsigned long *p2m;
+ unsigned long *mid_mfn_p;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /* Pfff.. No boundary cross-over, lets get out. */
+ if (!idx && check_boundary)
+ return false;
+
+ WARN(p2m_top[topidx][mididx] == p2m_identity,
+ "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
+ topidx, mididx);
+
+ /*
+ * Could be done by xen_build_dynamic_phys_to_machine..
+ */
+ if (p2m_top[topidx][mididx] != p2m_missing)
+ return false;
+
+ /* Boundary cross-over for the edges: */
+ p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+ p2m_init(p2m);
+
+ p2m_top[topidx][mididx] = p2m;
+
+ /* For save/restore we need to MFN of the P2M saved */
+
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
+ "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
+ topidx, mididx);
+ mid_mfn_p[mididx] = virt_to_mfn(p2m);
+
+ return true;
+}
+
+static bool __init early_alloc_p2m_middle(unsigned long pfn)
+{
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned long *mid_mfn_p;
+ unsigned long **mid;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ if (mid == p2m_mid_missing) {
+ mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+ p2m_mid_init(mid, p2m_missing);
+
+ p2m_top[topidx] = mid;
+
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ }
+ /* And the save/restore P2M tables.. */
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ /* Note: we don't set mid_mfn_p[midix] here,
+ * look in early_alloc_p2m() */
+ }
+ return true;
+}
+
+/*
+ * Skim over the P2M tree looking at pages that are either filled with
+ * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
+ * replace the P2M leaf with a p2m_missing or p2m_identity.
+ * Stick the old page in the new P2M tree location.
+ */
+bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn)
+{
+ unsigned topidx;
+ unsigned mididx;
+ unsigned ident_pfns;
+ unsigned inv_pfns;
+ unsigned long *p2m;
+ unsigned long *mid_mfn_p;
+ unsigned idx;
+ unsigned long pfn;
+
+ /* We only look when this entails a P2M middle layer */
+ if (p2m_index(set_pfn))
+ return false;
+
+ for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
+ topidx = p2m_top_index(pfn);
+
+ if (!p2m_top[topidx])
+ continue;
+
+ if (p2m_top[topidx] == p2m_mid_missing)
+ continue;
+
+ mididx = p2m_mid_index(pfn);
+ p2m = p2m_top[topidx][mididx];
+ if (!p2m)
+ continue;
+
+ if ((p2m == p2m_missing) || (p2m == p2m_identity))
+ continue;
+
+ if ((unsigned long)p2m == INVALID_P2M_ENTRY)
+ continue;
+
+ ident_pfns = 0;
+ inv_pfns = 0;
+ for (idx = 0; idx < P2M_PER_PAGE; idx++) {
+ /* IDENTITY_PFNs are 1:1 */
+ if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
+ ident_pfns++;
+ else if (p2m[idx] == INVALID_P2M_ENTRY)
+ inv_pfns++;
+ else
+ break;
+ }
+ if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
+ goto found;
+ }
+ return false;
+found:
+ /* Found one, replace old with p2m_identity or p2m_missing */
+ p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
+ /* And the other for save/restore.. */
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ /* NOTE: Even if it is a p2m_identity it should still be point to
+ * a page filled with INVALID_P2M_ENTRY entries. */
+ mid_mfn_p[mididx] = virt_to_mfn(p2m_missing);
+
+ /* Reset where we want to stick the old page in. */
+ topidx = p2m_top_index(set_pfn);
+ mididx = p2m_mid_index(set_pfn);
+
+ /* This shouldn't happen */
+ if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
+ early_alloc_p2m_middle(set_pfn);
+
+ if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
+ return false;
+
+ p2m_init(p2m);
+ p2m_top[topidx][mididx] = p2m;
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ mid_mfn_p[mididx] = virt_to_mfn(p2m);
+
+ return true;
+}
+bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (!early_alloc_p2m_middle(pfn))
+ return false;
+
+ if (early_can_reuse_p2m_middle(pfn, mfn))
+ return __set_phys_to_machine(pfn, mfn);
+
+ if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
+ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ return false;
+ }
+
+ return true;
+}
+
+static void __init early_split_p2m(unsigned long pfn)
+{
+ unsigned long mididx, idx;
+
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /*
+ * Allocate new middle and leaf pages if this pfn lies in the
+ * middle of one.
+ */
+ if (mididx || idx)
+ early_alloc_p2m_middle(pfn);
+ if (idx)
+ early_alloc_p2m(pfn, false);
+}
+
+unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e)
+{
+ unsigned long pfn;
+
+ if (unlikely(pfn_s >= MAX_P2M_PFN))
+ return 0;
+
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+ return pfn_e - pfn_s;
+
+ if (pfn_s > pfn_e)
+ return 0;
+
+ if (pfn_e > MAX_P2M_PFN)
+ pfn_e = MAX_P2M_PFN;
+
+ early_split_p2m(pfn_s);
+ early_split_p2m(pfn_e);
+
+ for (pfn = pfn_s; pfn < pfn_e;) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+
+ if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
+ break;
+ pfn++;
+
+ /*
+ * If the PFN was set to a middle or leaf identity
+ * page the remainder must also be identity, so skip
+ * ahead to the next middle or leaf entry.
+ */
+ if (p2m_top[topidx] == p2m_mid_identity)
+ pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
+ else if (p2m_top[topidx][mididx] == p2m_identity)
+ pfn = ALIGN(pfn, P2M_PER_PAGE);
+ }
+
+ if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+ "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
+ (pfn_e - pfn_s) - (pfn - pfn_s)))
+ printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+
+ return pfn - pfn_s;
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ unsigned topidx, mididx, idx;
+
+ /* don't track P2M changes in autotranslate guests */
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+ return true;
+
+ if (unlikely(pfn >= MAX_P2M_PFN)) {
+ BUG_ON(mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /* For sparse holes were the p2m leaf has real PFN along with
+ * PCI holes, stick in the PFN as the MFN value.
+ *
+ * set_phys_range_identity() will have allocated new middle
+ * and leaf pages as required so an existing p2m_mid_missing
+ * or p2m_missing mean that whole range will be identity so
+ * these can be switched to p2m_mid_identity or p2m_identity.
+ */
+ if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+ if (p2m_top[topidx] == p2m_mid_identity)
+ return true;
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
+ p2m_mid_identity) != p2m_mid_missing);
+ return true;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_identity)
+ return true;
+
+ /* Swap over from MISSING to IDENTITY if needed. */
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
+ p2m_identity) != p2m_missing);
+ return true;
+ }
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_missing)
+ return mfn == INVALID_P2M_ENTRY;
+
+ p2m_top[topidx][mididx][idx] = mfn;
+
+ return true;
+}
+
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (!alloc_p2m(pfn))
+ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ return false;
+ }
+
+ return true;
+}
+
+#define M2P_OVERRIDE_HASH_SHIFT 10
+#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
+
+static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static DEFINE_SPINLOCK(m2p_override_lock);
+
+static void __init m2p_override_init(void)
+{
+ unsigned i;
+
+ m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+ sizeof(unsigned long));
+
+ for (i = 0; i < M2P_OVERRIDE_HASH; i++)
+ INIT_LIST_HEAD(&m2p_overrides[i]);
+}
+
+static unsigned long mfn_hash(unsigned long mfn)
+{
+ return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
+}
+
+int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret = 0;
+ bool lazy = false;
+ pte_t *pte;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (kmap_ops &&
+ !in_interrupt() &&
+ paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+ arch_enter_lazy_mmu_mode();
+ lazy = true;
+ }
+
+ for (i = 0; i < count; i++) {
+ unsigned long mfn, pfn;
+
+ /* Do not add to override if the map failed. */
+ if (map_ops[i].status)
+ continue;
+
+ if (map_ops[i].flags & GNTMAP_contains_pte) {
+ pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ mfn = pte_mfn(*pte);
+ } else {
+ mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
+ }
+ pfn = page_to_pfn(pages[i]);
+
+ WARN_ON(PagePrivate(pages[i]));
+ SetPagePrivate(pages[i]);
+ set_page_private(pages[i], mfn);
+ pages[i]->index = pfn_to_mfn(pfn);
+
+ if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (kmap_ops) {
+ ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
+ if (ret)
+ goto out;
+ }
+ }
+
+out:
+ if (lazy)
+ arch_leave_lazy_mmu_mode();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
+
+/* Add an MFN override for a particular page */
+int m2p_add_override(unsigned long mfn, struct page *page,
+ struct gnttab_map_grant_ref *kmap_op)
+{
+ unsigned long flags;
+ unsigned long pfn;
+ unsigned long uninitialized_var(address);
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_add_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ if (kmap_op != NULL) {
+ if (!PageHighMem(page)) {
+ struct multicall_space mcs =
+ xen_mc_entry(sizeof(*kmap_op));
+
+ MULTI_grant_table_op(mcs.mc,
+ GNTTABOP_map_grant_ref, kmap_op, 1);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ }
+ }
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
+ * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
+ * pfn so that the following mfn_to_pfn(mfn) calls will return the
+ * pfn from the m2p_override (the backend pfn) instead.
+ * We need to do this because the pages shared by the frontend
+ * (xen-blkfront) can be already locked (lock_page, called by
+ * do_read_cache_page); when the userspace backend tries to use them
+ * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
+ * do_blockdev_direct_IO is going to try to lock the same pages
+ * again resulting in a deadlock.
+ * As a side effect get_user_pages_fast might not be safe on the
+ * frontend pages while they are being shared with the backend,
+ * because mfn_to_pfn (that ends up being called by GUPF) will
+ * return the backend pfn rather than the frontend pfn. */
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) == mfn)
+ set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(m2p_add_override);
+
+int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret = 0;
+ bool lazy = false;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (kmap_ops &&
+ !in_interrupt() &&
+ paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+ arch_enter_lazy_mmu_mode();
+ lazy = true;
+ }
+
+ for (i = 0; i < count; i++) {
+ unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i]));
+ unsigned long pfn = page_to_pfn(pages[i]);
+
+ if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ set_page_private(pages[i], INVALID_P2M_ENTRY);
+ WARN_ON(!PagePrivate(pages[i]));
+ ClearPagePrivate(pages[i]);
+ set_phys_to_machine(pfn, pages[i]->index);
+
+ if (kmap_ops)
+ ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
+ if (ret)
+ goto out;
+ }
+
+out:
+ if (lazy)
+ arch_leave_lazy_mmu_mode();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
+
+int m2p_remove_override(struct page *page,
+ struct gnttab_map_grant_ref *kmap_op,
+ unsigned long mfn)
+{
+ unsigned long flags;
+ unsigned long pfn;
+ unsigned long uninitialized_var(address);
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_remove_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_del(&page->lru);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ if (kmap_op != NULL) {
+ if (!PageHighMem(page)) {
+ struct multicall_space mcs;
+ struct gnttab_unmap_and_replace *unmap_op;
+ struct page *scratch_page = get_balloon_scratch_page();
+ unsigned long scratch_page_address = (unsigned long)
+ __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
+
+ /*
+ * It might be that we queued all the m2p grant table
+ * hypercalls in a multicall, then m2p_remove_override
+ * get called before the multicall has actually been
+ * issued. In this case handle is going to -1 because
+ * it hasn't been modified yet.
+ */
+ if (kmap_op->handle == -1)
+ xen_mc_flush();
+ /*
+ * Now if kmap_op->handle is negative it means that the
+ * hypercall actually returned an error.
+ */
+ if (kmap_op->handle == GNTST_general_error) {
+ printk(KERN_WARNING "m2p_remove_override: "
+ "pfn %lx mfn %lx, failed to modify kernel mappings",
+ pfn, mfn);
+ put_balloon_scratch_page();
+ return -1;
+ }
+
+ xen_mc_batch();
+
+ mcs = __xen_mc_entry(
+ sizeof(struct gnttab_unmap_and_replace));
+ unmap_op = mcs.args;
+ unmap_op->host_addr = kmap_op->host_addr;
+ unmap_op->new_addr = scratch_page_address;
+ unmap_op->handle = kmap_op->handle;
+
+ MULTI_grant_table_op(mcs.mc,
+ GNTTABOP_unmap_and_replace, unmap_op, 1);
+
+ mcs = __xen_mc_entry(0);
+ MULTI_update_va_mapping(mcs.mc, scratch_page_address,
+ pfn_pte(page_to_pfn(scratch_page),
+ PAGE_KERNEL_RO), 0);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ kmap_op->host_addr = 0;
+ put_balloon_scratch_page();
+ }
+ }
+
+ /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
+ * somewhere in this domain, even before being added to the
+ * m2p_override (see comment above in m2p_add_override).
+ * If there are no other entries in the m2p_override corresponding
+ * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
+ * the original pfn (the one shared by the frontend): the backend
+ * cannot do any IO on this page anymore because it has been
+ * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
+ * the original pfn causes mfn_to_pfn(mfn) to return the frontend
+ * pfn again. */
+ mfn &= ~FOREIGN_FRAME_BIT;
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
+ m2p_find_override(mfn) == NULL)
+ set_phys_to_machine(pfn, mfn);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(m2p_remove_override);
+
+struct page *m2p_find_override(unsigned long mfn)
+{
+ unsigned long flags;
+ struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
+ struct page *p, *ret;
+
+ ret = NULL;
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+
+ list_for_each_entry(p, bucket, lru) {
+ if (page_private(p) == mfn) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return ret;
+}
+
+unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
+{
+ struct page *p = m2p_find_override(mfn);
+ unsigned long ret = pfn;
+
+ if (p)
+ ret = page_to_pfn(p);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+#include <linux/debugfs.h>
+#include "debugfs.h"
+static int p2m_dump_show(struct seq_file *m, void *v)
+{
+ static const char * const level_name[] = { "top", "middle",
+ "entry", "abnormal", "error"};
+#define TYPE_IDENTITY 0
+#define TYPE_MISSING 1
+#define TYPE_PFN 2
+#define TYPE_UNKNOWN 3
+ static const char * const type_name[] = {
+ [TYPE_IDENTITY] = "identity",
+ [TYPE_MISSING] = "missing",
+ [TYPE_PFN] = "pfn",
+ [TYPE_UNKNOWN] = "abnormal"};
+ unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
+ unsigned int uninitialized_var(prev_level);
+ unsigned int uninitialized_var(prev_type);
+
+ if (!p2m_top)
+ return 0;
+
+ for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned idx = p2m_index(pfn);
+ unsigned lvl, type;
+
+ lvl = 4;
+ type = TYPE_UNKNOWN;
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ lvl = 0; type = TYPE_MISSING;
+ } else if (p2m_top[topidx] == NULL) {
+ lvl = 0; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx] == NULL) {
+ lvl = 1; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx] == p2m_identity) {
+ lvl = 1; type = TYPE_IDENTITY;
+ } else if (p2m_top[topidx][mididx] == p2m_missing) {
+ lvl = 1; type = TYPE_MISSING;
+ } else if (p2m_top[topidx][mididx][idx] == 0) {
+ lvl = 2; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
+ lvl = 2; type = TYPE_IDENTITY;
+ } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
+ lvl = 2; type = TYPE_MISSING;
+ } else if (p2m_top[topidx][mididx][idx] == pfn) {
+ lvl = 2; type = TYPE_PFN;
+ } else if (p2m_top[topidx][mididx][idx] != pfn) {
+ lvl = 2; type = TYPE_PFN;
+ }
+ if (pfn == 0) {
+ prev_level = lvl;
+ prev_type = type;
+ }
+ if (pfn == MAX_DOMAIN_PAGES-1) {
+ lvl = 3;
+ type = TYPE_UNKNOWN;
+ }
+ if (prev_type != type) {
+ seq_printf(m, " [0x%lx->0x%lx] %s\n",
+ prev_pfn_type, pfn, type_name[prev_type]);
+ prev_pfn_type = pfn;
+ prev_type = type;
+ }
+ if (prev_level != lvl) {
+ seq_printf(m, " [0x%lx->0x%lx] level %s\n",
+ prev_pfn_level, pfn, level_name[prev_level]);
+ prev_pfn_level = pfn;
+ prev_level = lvl;
+ }
+ }
+ return 0;
+#undef TYPE_IDENTITY
+#undef TYPE_MISSING
+#undef TYPE_PFN
+#undef TYPE_UNKNOWN
+}
+
+static int p2m_dump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, p2m_dump_show, NULL);
+}
+
+static const struct file_operations p2m_dump_fops = {
+ .open = p2m_dump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *d_mmu_debug;
+
+static int __init xen_p2m_debugfs(void)
+{
+ struct dentry *d_xen = xen_init_debugfs();
+
+ if (d_xen == NULL)
+ return -ENOMEM;
+
+ d_mmu_debug = debugfs_create_dir("mmu", d_xen);
+
+ debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
+ return 0;
+}
+fs_initcall(xen_p2m_debugfs);
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 00000000000..0e98e5d241d
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,109 @@
+/* Glue code to lib/swiotlb-xen.c */
+
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <xen/swiotlb-xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+#include <asm/iommu_table.h>
+
+
+#include <asm/xen/swiotlb-xen.h>
+#ifdef CONFIG_X86_64
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#endif
+#include <linux/export.h>
+
+int xen_swiotlb __read_mostly;
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+ .mapping_error = xen_swiotlb_dma_mapping_error,
+ .alloc = xen_swiotlb_alloc_coherent,
+ .free = xen_swiotlb_free_coherent,
+ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = xen_swiotlb_sync_single_for_device,
+ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
+ .map_sg = xen_swiotlb_map_sg_attrs,
+ .unmap_sg = xen_swiotlb_unmap_sg_attrs,
+ .map_page = xen_swiotlb_map_page,
+ .unmap_page = xen_swiotlb_unmap_page,
+ .dma_supported = xen_swiotlb_dma_supported,
+};
+
+/*
+ * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use xen_swiotlb (by the boot
+ * option).
+ */
+int __init pci_xen_swiotlb_detect(void)
+{
+
+ if (!xen_pv_domain())
+ return 0;
+
+ /* If running as PV guest, either iommu=soft, or swiotlb=force will
+ * activate this IOMMU. If running as PV privileged, activate it
+ * irregardless.
+ */
+ if ((xen_initial_domain() || swiotlb || swiotlb_force))
+ xen_swiotlb = 1;
+
+ /* If we are running under Xen, we MUST disable the native SWIOTLB.
+ * Don't worry about swiotlb_force flag activating the native, as
+ * the 'swiotlb' flag is the only one turning it on. */
+ swiotlb = 0;
+
+#ifdef CONFIG_X86_64
+ /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0
+ * (so no iommu=X command line over-writes).
+ * Considering that PV guests do not want the *native SWIOTLB* but
+ * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here.
+ */
+ if (max_pfn > MAX_DMA32_PFN)
+ no_iommu = 1;
+#endif
+ return xen_swiotlb;
+}
+
+void __init pci_xen_swiotlb_init(void)
+{
+ if (xen_swiotlb) {
+ xen_swiotlb_init(1, true /* early */);
+ dma_ops = &xen_swiotlb_dma_ops;
+
+#ifdef CONFIG_PCI
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+#endif
+ }
+}
+
+int pci_xen_swiotlb_init_late(void)
+{
+ int rc;
+
+ if (xen_swiotlb)
+ return 0;
+
+ rc = xen_swiotlb_init(1, false /* late */);
+ if (rc)
+ return rc;
+
+ dma_ops = &xen_swiotlb_dma_ops;
+#ifdef CONFIG_PCI
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
+
+IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
+ NULL,
+ pci_xen_swiotlb_init,
+ NULL);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 00000000000..a8261716d58
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,217 @@
+/******************************************************************************
+ * platform-pci-unplug.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <xen/platform_pci.h>
+#include "xen-ops.h"
+
+#define XEN_PLATFORM_ERR_MAGIC -1
+#define XEN_PLATFORM_ERR_PROTOCOL -2
+#define XEN_PLATFORM_ERR_BLACKLIST -3
+
+#ifdef CONFIG_XEN_PVHVM
+/* store the value of xen_emul_unplug after the unplug is done */
+static int xen_platform_pci_unplug;
+static int xen_emul_unplug;
+
+static int check_platform_magic(void)
+{
+ short magic;
+ char protocol;
+
+ magic = inw(XEN_IOPORT_MAGIC);
+ if (magic != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+ return XEN_PLATFORM_ERR_MAGIC;
+ }
+
+ protocol = inb(XEN_IOPORT_PROTOVER);
+
+ printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+ protocol);
+
+ switch (protocol) {
+ case 1:
+ outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
+ outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
+ if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+ return XEN_PLATFORM_ERR_BLACKLIST;
+ }
+ break;
+ default:
+ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+ return XEN_PLATFORM_ERR_PROTOCOL;
+ }
+
+ return 0;
+}
+
+bool xen_has_pv_devices()
+{
+ if (!xen_domain())
+ return false;
+
+ /* PV domains always have them. */
+ if (xen_pv_domain())
+ return true;
+
+ /* And user has xen_platform_pci=0 set in guest config as
+ * driver did not modify the value. */
+ if (xen_platform_pci_unplug == 0)
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
+ return true;
+
+ /* This is an odd one - we are going to run legacy
+ * and PV drivers at the same time. */
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+ return true;
+
+ /* And the caller has to follow with xen_pv_{disk,nic}_devices
+ * to be certain which driver can load. */
+ return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_devices);
+
+static bool __xen_has_pv_device(int state)
+{
+ /* HVM domains might or might not */
+ if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
+ return true;
+
+ return xen_has_pv_devices();
+}
+
+bool xen_has_pv_nic_devices(void)
+{
+ return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
+
+bool xen_has_pv_disk_devices(void)
+{
+ return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
+ XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
+
+/*
+ * This one is odd - it determines whether you want to run PV _and_
+ * legacy (IDE) drivers together. This combination is only possible
+ * under HVM.
+ */
+bool xen_has_pv_and_legacy_disk_devices(void)
+{
+ if (!xen_domain())
+ return false;
+
+ /* N.B. This is only ever used in HVM mode */
+ if (xen_pv_domain())
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
+
+void xen_unplug_emulated_devices(void)
+{
+ int r;
+
+ /* user explicitly requested no unplug */
+ if (xen_emul_unplug & XEN_UNPLUG_NEVER)
+ return;
+ /* check the version of the xen platform PCI device */
+ r = check_platform_magic();
+ /* If the version matches enable the Xen platform PCI driver.
+ * Also enable the Xen platform PCI driver if the host does
+ * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
+ * but the user told us that unplugging is unnecessary. */
+ if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
+ (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
+ return;
+ /* Set the default value of xen_emul_unplug depending on whether or
+ * not the Xen PV frontends and the Xen platform PCI driver have
+ * been compiled for this kernel (modules or built-in are both OK). */
+ if (!xen_emul_unplug) {
+ if (xen_must_unplug_nics()) {
+ printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated NICs.\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ }
+ if (xen_must_unplug_disks()) {
+ printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated disks.\n"
+ "You might have to change the root device\n"
+ "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
+ "in your root= kernel command line option\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ }
+ }
+ /* Now unplug the emulated devices */
+ if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
+ outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
+ xen_platform_pci_unplug = xen_emul_unplug;
+}
+
+static int __init parse_xen_emul_unplug(char *arg)
+{
+ char *p, *q;
+ int l;
+
+ for (p = arg; p; p = q) {
+ q = strchr(p, ',');
+ if (q) {
+ l = q - p;
+ q++;
+ } else {
+ l = strlen(p);
+ }
+ if (!strncmp(p, "all", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL;
+ else if (!strncmp(p, "ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ else if (!strncmp(p, "aux-ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
+ else if (!strncmp(p, "nics", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ else if (!strncmp(p, "unnecessary", l))
+ xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
+ else if (!strncmp(p, "never", l))
+ xen_emul_unplug |= XEN_UNPLUG_NEVER;
+ else
+ printk(KERN_WARNING "unrecognised option '%s' "
+ "in parameter 'xen_emul_unplug'\n", p);
+ }
+ return 0;
+}
+early_param("xen_emul_unplug", parse_xen_emul_unplug);
+#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a..2e555163c2f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -8,48 +8,460 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
+#include <linux/memblock.h>
+#include <linux/cpuidle.h>
+#include <linux/cpufreq.h>
#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
+#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
-
#include "xen-ops.h"
#include "vdso.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+#ifdef CONFIG_X86_64
+extern asmlinkage void nmi(void);
+#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
+/* Amount of extra memory space we add to the e820 ranges */
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+/* Number of pages released from the initial allocation. */
+unsigned long xen_released_pages;
+
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+static void __init xen_add_extra_mem(u64 start, u64 size)
+{
+ unsigned long pfn;
+ int i;
+
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ /* Add new region. */
+ if (xen_extra_mem[i].size == 0) {
+ xen_extra_mem[i].start = start;
+ xen_extra_mem[i].size = size;
+ break;
+ }
+ /* Append to existing region. */
+ if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
+ xen_extra_mem[i].size += size;
+ break;
+ }
+ }
+ if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+ printk(KERN_WARNING "Warning: not enough extra memory regions\n");
+
+ memblock_reserve(start, size);
+
+ xen_max_p2m_pfn = PFN_DOWN(start + size);
+ for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
+ continue;
+ WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
+ pfn, mfn);
+
+ __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ }
+}
+
+static unsigned long __init xen_do_chunk(unsigned long start,
+ unsigned long end, bool release)
+{
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned long len = 0;
+ unsigned long pfn;
+ int ret;
+
+ for (pfn = start; pfn < end; pfn++) {
+ unsigned long frame;
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ if (release) {
+ /* Make sure pfn exists to start with */
+ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+ continue;
+ frame = mfn;
+ } else {
+ if (mfn != INVALID_P2M_ENTRY)
+ continue;
+ frame = pfn;
+ }
+ set_xen_guest_handle(reservation.extent_start, &frame);
+ reservation.nr_extents = 1;
+
+ ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
+ &reservation);
+ WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
+ release ? "release" : "populate", pfn, ret);
+
+ if (ret == 1) {
+ if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
+ if (release)
+ break;
+ set_xen_guest_handle(reservation.extent_start, &frame);
+ reservation.nr_extents = 1;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ break;
+ }
+ len++;
+ } else
+ break;
+ }
+ if (len)
+ printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
+ release ? "Freeing" : "Populating",
+ start, end, len,
+ release ? "freed" : "added");
+
+ return len;
+}
+
+static unsigned long __init xen_release_chunk(unsigned long start,
+ unsigned long end)
+{
+ return xen_do_chunk(start, end, true);
+}
+
+static unsigned long __init xen_populate_chunk(
+ const struct e820entry *list, size_t map_size,
+ unsigned long max_pfn, unsigned long *last_pfn,
+ unsigned long credits_left)
+{
+ const struct e820entry *entry;
+ unsigned int i;
+ unsigned long done = 0;
+ unsigned long dest_pfn;
+
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ unsigned long s_pfn;
+ unsigned long e_pfn;
+ unsigned long pfns;
+ long capacity;
+
+ if (credits_left <= 0)
+ break;
+
+ if (entry->type != E820_RAM)
+ continue;
+
+ e_pfn = PFN_DOWN(entry->addr + entry->size);
+
+ /* We only care about E820 after the xen_start_info->nr_pages */
+ if (e_pfn <= max_pfn)
+ continue;
+
+ s_pfn = PFN_UP(entry->addr);
+ /* If the E820 falls within the nr_pages, we want to start
+ * at the nr_pages PFN.
+ * If that would mean going past the E820 entry, skip it
+ */
+ if (s_pfn <= max_pfn) {
+ capacity = e_pfn - max_pfn;
+ dest_pfn = max_pfn;
+ } else {
+ capacity = e_pfn - s_pfn;
+ dest_pfn = s_pfn;
+ }
+
+ if (credits_left < capacity)
+ capacity = credits_left;
+
+ pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
+ done += pfns;
+ *last_pfn = (dest_pfn + pfns);
+ if (pfns < capacity)
+ break;
+ credits_left -= pfns;
+ }
+ return done;
+}
+
+static void __init xen_set_identity_and_release_chunk(
+ unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+ unsigned long *released, unsigned long *identity)
+{
+ unsigned long pfn;
+
+ /*
+ * If the PFNs are currently mapped, clear the mappings
+ * (except for the ISA region which must be 1:1 mapped) to
+ * release the refcounts (in Xen) on the original frames.
+ */
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
+ pte_t pte = __pte_ma(0);
+
+ if (pfn < PFN_UP(ISA_END_ADDRESS))
+ pte = mfn_pte(pfn, PAGE_KERNEL_IO);
+
+ (void)HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
+ }
+
+ if (start_pfn < nr_pages)
+ *released += xen_release_chunk(
+ start_pfn, min(end_pfn, nr_pages));
+
+ *identity += set_phys_range_identity(start_pfn, end_pfn);
+}
+
+static unsigned long __init xen_set_identity_and_release(
+ const struct e820entry *list, size_t map_size, unsigned long nr_pages)
+{
+ phys_addr_t start = 0;
+ unsigned long released = 0;
+ unsigned long identity = 0;
+ const struct e820entry *entry;
+ int i;
+
+ /*
+ * Combine non-RAM regions and gaps until a RAM region (or the
+ * end of the map) is reached, then set the 1:1 map and
+ * release the pages (if available) in those non-RAM regions.
+ *
+ * The combined non-RAM regions are rounded to a whole number
+ * of pages so any partial pages are accessible via the 1:1
+ * mapping. This is needed for some BIOSes that put (for
+ * example) the DMI tables in a reserved region that begins on
+ * a non-page boundary.
+ */
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ phys_addr_t end = entry->addr + entry->size;
+ if (entry->type == E820_RAM || i == map_size - 1) {
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ if (entry->type == E820_RAM)
+ end_pfn = PFN_UP(entry->addr);
+
+ if (start_pfn < end_pfn)
+ xen_set_identity_and_release_chunk(
+ start_pfn, end_pfn, nr_pages,
+ &released, &identity);
+
+ start = end;
+ }
+ }
+
+ if (released)
+ printk(KERN_INFO "Released %lu pages of unused memory\n", released);
+ if (identity)
+ printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
+
+ return released;
+}
+
+static unsigned long __init xen_get_max_pages(void)
+{
+ unsigned long max_pages = MAX_DOMAIN_PAGES;
+ domid_t domid = DOMID_SELF;
+ int ret;
+
+ /*
+ * For the initial domain we use the maximum reservation as
+ * the maximum page.
+ *
+ * For guest domains the current maximum reservation reflects
+ * the current maximum rather than the static maximum. In this
+ * case the e820 map provided to us will cover the static
+ * maximum region.
+ */
+ if (xen_initial_domain()) {
+ ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
+ if (ret > 0)
+ max_pages = ret;
+ }
+
+ return min(max_pages, MAX_DOMAIN_PAGES);
+}
+
+static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
+{
+ u64 end = start + size;
+
+ /* Align RAM regions to page boundaries. */
+ if (type == E820_RAM) {
+ start = PAGE_ALIGN(start);
+ end &= ~((u64)PAGE_SIZE - 1);
+ }
+
+ e820_add_region(start, end - start, type);
+}
+
+void xen_ignore_unusable(struct e820entry *list, size_t map_size)
+{
+ struct e820entry *entry;
+ unsigned int i;
+
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ if (entry->type == E820_UNUSABLE)
+ entry->type = E820_RAM;
+ }
+}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
-
char * __init xen_memory_setup(void)
{
+ static struct e820entry map[E820MAX] __initdata;
+
unsigned long max_pfn = xen_start_info->nr_pages;
+ unsigned long long mem_end;
+ int rc;
+ struct xen_memory_map memmap;
+ unsigned long max_pages;
+ unsigned long last_pfn = 0;
+ unsigned long extra_pages = 0;
+ unsigned long populated;
+ int i;
+ int op;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+ mem_end = PFN_PHYS(max_pfn);
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ op = xen_initial_domain() ?
+ XENMEM_machine_memory_map :
+ XENMEM_memory_map;
+ rc = HYPERVISOR_memory_op(op, &memmap);
+ if (rc == -ENOSYS) {
+ BUG_ON(xen_initial_domain());
+ memmap.nr_entries = 1;
+ map[0].addr = 0ULL;
+ map[0].size = mem_end;
+ /* 8MB slack (to balance backend allocations). */
+ map[0].size += 8ULL << 20;
+ map[0].type = E820_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
- e820.nr_map = 0;
+ /*
+ * Xen won't allow a 1:1 mapping to be created to UNUSABLE
+ * regions, so if we're using the machine memory map leave the
+ * region as RAM as it is in the pseudo-physical map.
+ *
+ * UNUSABLE regions in domUs are not handled and will need
+ * a patch in the future.
+ */
+ if (xen_initial_domain())
+ xen_ignore_unusable(map, memmap.nr_entries);
+
+ /* Make sure the Xen-supplied memory map is well-ordered. */
+ sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+
+ max_pages = xen_get_max_pages();
+ if (max_pages > max_pfn)
+ extra_pages += max_pages - max_pfn;
+
+ /*
+ * Set P2M for all non-RAM pages and E820 gaps to be identity
+ * type PFNs. Any RAM pages that would be made inaccesible by
+ * this are first released.
+ */
+ xen_released_pages = xen_set_identity_and_release(
+ map, memmap.nr_entries, max_pfn);
- e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
+ /*
+ * Populate back the non-RAM pages and E820 gaps that had been
+ * released. */
+ populated = xen_populate_chunk(map, memmap.nr_entries,
+ max_pfn, &last_pfn, xen_released_pages);
+
+ xen_released_pages -= populated;
+ extra_pages += xen_released_pages;
+
+ if (last_pfn > max_pfn) {
+ max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
+ mem_end = PFN_PHYS(max_pfn);
+ }
+ /*
+ * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
+ * factor the base size. On non-highmem systems, the base
+ * size is the full initial memory allocation; on highmem it
+ * is limited to the max size of lowmem, so that it doesn't
+ * get completely filled.
+ *
+ * In principle there could be a problem in lowmem systems if
+ * the initial memory is also very large with respect to
+ * lowmem, but we won't try to deal with that here.
+ */
+ extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ extra_pages);
+ i = 0;
+ while (i < memmap.nr_entries) {
+ u64 addr = map[i].addr;
+ u64 size = map[i].size;
+ u32 type = map[i].type;
+
+ if (type == E820_RAM) {
+ if (addr < mem_end) {
+ size = min(size, mem_end - addr);
+ } else if (extra_pages) {
+ size = min(size, (u64)extra_pages * PAGE_SIZE);
+ extra_pages -= size / PAGE_SIZE;
+ xen_add_extra_mem(addr, size);
+ } else
+ type = E820_UNUSABLE;
+ }
+
+ xen_align_and_add_e820_region(addr, size, type);
+
+ map[i].addr += size;
+ map[i].size -= size;
+ if (map[i].size == 0)
+ i++;
+ }
/*
- * Even though this is normal, usable memory under Xen, reserve
- * ISA memory anyway because too many things think they can poke
+ * Set the rest as identity mapped, in case PCI BARs are
+ * located here.
+ *
+ * PFNs above MAX_P2M_PFN are considered identity mapped as
+ * well.
+ */
+ set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+
+ /*
+ * In domU, the ISA region is normal, usable memory, but we
+ * reserve ISA memory anyway because too many things poke
* about in there.
*/
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
@@ -60,28 +472,60 @@ char * __init xen_memory_setup(void)
* - mfn_list
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
+ * We tried to make the the memblock_reserve more selective so
+ * that it would be clear what region is reserved. Sadly we ran
+ * in the problem wherein on a 64-bit hypervisor with a 32-bit
+ * initial domain, the pt_base has the cr3 value which is not
+ * neccessarily where the pagetable starts! As Jan put it: "
+ * Actually, the adjustment turns out to be correct: The page
+ * tables for a 32-on-64 dom0 get allocated in the order "first L1",
+ * "first L2", "first L3", so the offset to the page table base is
+ * indeed 2. When reading xen/include/public/xen.h's comment
+ * very strictly, this is not a violation (since there nothing is said
+ * that the first thing in the page table space is pointed to by
+ * pt_base; I admit that this seems to be implied though, namely
+ * do I think that it is implied that the page table space is the
+ * range [pt_base, pt_base + nt_pt_frames), whereas that
+ * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
+ * which - without a priori knowledge - the kernel would have
+ * difficulty to figure out)." - so lets just fall back to the
+ * easy way and reserve the whole region.
*/
- e820_add_region(__pa(xen_start_info->mfn_list),
- xen_start_info->pt_base - xen_start_info->mfn_list,
- E820_RESERVED);
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
return "Xen";
}
-static void xen_idle(void)
+/*
+ * Machine specific memory setup for auto-translated guests.
+ */
+char * __init xen_auto_xlated_memory_setup(void)
{
- local_irq_disable();
+ static struct e820entry map[E820MAX] __initdata;
- if (need_resched())
- local_irq_enable();
- else {
- current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
- safe_halt();
- current_thread_info()->status |= TS_POLLING;
- }
+ struct xen_memory_map memmap;
+ int i;
+ int rc;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc < 0)
+ panic("No memory map (%d)\n", rc);
+
+ sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+
+ for (i = 0; i < memmap.nr_entries; i++)
+ e820_add_region(map[i].addr, map[i].size, map[i].type);
+
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
+
+ return "Xen";
}
/*
@@ -92,15 +536,22 @@ static void xen_idle(void)
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
+ /*
+ * This could be called before selected_vdso32 is initialized, so
+ * just fiddle with both possible images. vdso_image_32_syscall
+ * can't be selected, since it only exists on 64-bit systems.
+ */
u32 *mask;
- mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
+ mask = vdso_image_32_int80.data +
+ vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
- mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
+ mask = vdso_image_32_sysenter.data +
+ vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
-static __cpuinit int register_callback(unsigned type, const void *func)
+static int register_callback(unsigned type, const void *func)
{
struct callback_register callback = {
.type = type,
@@ -111,7 +562,7 @@ static __cpuinit int register_callback(unsigned type, const void *func)
return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
-void __cpuinit xen_enable_sysenter(void)
+void xen_enable_sysenter(void)
{
int ret;
unsigned sysenter_feature;
@@ -130,7 +581,7 @@ void __cpuinit xen_enable_sysenter(void)
setup_clear_cpu_cap(sysenter_feature);
}
-void __cpuinit xen_enable_syscall(void)
+void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
int ret;
@@ -151,17 +602,13 @@ void __cpuinit xen_enable_syscall(void)
#endif /* CONFIG_X86_64 */
}
-void __init xen_arch_setup(void)
+void __init xen_pvmmu_arch_setup(void)
{
- struct physdev_set_iopl set_iopl;
- int rc;
-
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_pae_extended_cr3);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_pae_extended_cr3);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
@@ -169,11 +616,14 @@ void __init xen_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
+}
- set_iopl.iopl = 1;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
- if (rc != 0)
- printk(KERN_INFO "physdev_op failed %d\n", rc);
+/* This function is not called for HVM domains */
+void __init xen_arch_setup(void)
+{
+ xen_panic_handler_init();
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ xen_pvmmu_arch_setup();
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
@@ -186,9 +636,12 @@ void __init xen_arch_setup(void)
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
- pm_idle = xen_idle;
-
- paravirt_disable_iospace();
-
+ /* Set up idle, making sure it calls safe_halt() pvop */
+ disable_cpuidle();
+ disable_cpufreq();
+ WARN_ON(xen_set_default_idle());
fiddle_vdso();
+#ifdef CONFIG_NUMA
+ numa_off = 1;
+#endif
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e2069c7c..7005974c3ff 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -14,7 +14,10 @@
*/
#include <linux/sched.h>
#include <linux/err.h>
+#include <linux/slab.h>
#include <linux/smp.h>
+#include <linux/irq_work.h>
+#include <linux/tick.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
@@ -27,49 +30,54 @@
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/events.h>
+#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "mmu.h"
cpumask_var_t xen_cpu_initialized_map;
-static DEFINE_PER_CPU(int, resched_irq);
-static DEFINE_PER_CPU(int, callfunc_irq);
-static DEFINE_PER_CPU(int, callfuncsingle_irq);
-static DEFINE_PER_CPU(int, debug_irq) = -1;
+struct xen_common_irq {
+ int irq;
+ char *name;
+};
+static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
return IRQ_HANDLED;
}
-static __cpuinit void cpu_bringup(void)
+static void cpu_bringup(void)
{
- int cpu = smp_processor_id();
+ int cpu;
cpu_init();
touch_softlockup_watchdog();
preempt_disable();
- xen_enable_sysenter();
- xen_enable_syscall();
-
+ /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
+ if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
+ xen_enable_sysenter();
+ xen_enable_syscall();
+ }
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
cpu_data(cpu).x86_max_cores = 1;
@@ -77,8 +85,12 @@ static __cpuinit void cpu_bringup(void)
xen_setup_cpu_clockevents();
- cpu_set(cpu, cpu_online_map);
- x86_write_percpu(cpu_state, CPU_ONLINE);
+ notify_cpu_starting(cpu);
+
+ set_cpu_online(cpu, true);
+
+ this_cpu_write(cpu_state, CPU_ONLINE);
+
wmb();
/* We can take interrupts now: we're officially "up". */
@@ -87,70 +99,128 @@ static __cpuinit void cpu_bringup(void)
wmb(); /* make sure everything is out */
}
-static __cpuinit void cpu_bringup_and_idle(void)
+/* Note: cpu parameter is only relevant for PVH */
+static void cpu_bringup_and_idle(int cpu)
{
+#ifdef CONFIG_X86_64
+ if (xen_feature(XENFEAT_auto_translated_physmap) &&
+ xen_feature(XENFEAT_supervisor_mode_kernel))
+ xen_pvh_secondary_vcpu_init(cpu);
+#endif
cpu_bringup();
- cpu_idle();
+ cpu_startup_entry(CPUHP_ONLINE);
}
+static void xen_smp_intr_free(unsigned int cpu)
+{
+ if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
+ per_cpu(xen_resched_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_resched_irq, cpu).name);
+ per_cpu(xen_resched_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL);
+ per_cpu(xen_callfunc_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_callfunc_irq, cpu).name);
+ per_cpu(xen_callfunc_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_debug_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL);
+ per_cpu(xen_debug_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_debug_irq, cpu).name);
+ per_cpu(xen_debug_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq,
+ NULL);
+ per_cpu(xen_callfuncsingle_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
+ }
+ if (xen_hvm_domain())
+ return;
+
+ if (per_cpu(xen_irq_work, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
+ per_cpu(xen_irq_work, cpu).irq = -1;
+ kfree(per_cpu(xen_irq_work, cpu).name);
+ per_cpu(xen_irq_work, cpu).name = NULL;
+ }
+};
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
- const char *resched_name, *callfunc_name, *debug_name;
+ char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(resched_irq, cpu) = rc;
+ per_cpu(xen_resched_irq, cpu).irq = rc;
+ per_cpu(xen_resched_irq, cpu).name = resched_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(callfunc_irq, cpu) = rc;
+ per_cpu(xen_callfunc_irq, cpu).irq = rc;
+ per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
- IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
+ IRQF_PERCPU | IRQF_NOBALANCING,
debug_name, NULL);
if (rc < 0)
goto fail;
- per_cpu(debug_irq, cpu) = rc;
+ per_cpu(xen_debug_irq, cpu).irq = rc;
+ per_cpu(xen_debug_irq, cpu).name = debug_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
+ per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
+
+ /*
+ * The IRQ worker on PVHVM goes through the native path and uses the
+ * IPI mechanism.
+ */
+ if (xen_hvm_domain())
+ return 0;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
+ cpu,
+ xen_irq_work_interrupt,
+ IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(callfuncsingle_irq, cpu) = rc;
+ per_cpu(xen_irq_work, cpu).irq = rc;
+ per_cpu(xen_irq_work, cpu).name = callfunc_name;
return 0;
fail:
- if (per_cpu(resched_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
- if (per_cpu(callfunc_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
- if (per_cpu(debug_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
- if (per_cpu(callfuncsingle_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
-
+ xen_smp_intr_free(cpu);
return rc;
}
@@ -158,35 +228,110 @@ static void __init xen_fill_possible_map(void)
{
int i, rc;
+ if (xen_initial_domain())
+ return;
+
for (i = 0; i < nr_cpu_ids; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
num_processors++;
- cpu_set(i, cpu_possible_map);
+ set_cpu_possible(i, true);
}
}
}
+static void __init xen_filter_cpu_maps(void)
+{
+ int i, rc;
+ unsigned int subtract = 0;
+
+ if (!xen_initial_domain())
+ return;
+
+ num_processors = 0;
+ disabled_cpus = 0;
+ for (i = 0; i < nr_cpu_ids; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0) {
+ num_processors++;
+ set_cpu_possible(i, true);
+ } else {
+ set_cpu_possible(i, false);
+ set_cpu_present(i, false);
+ subtract++;
+ }
+ }
+#ifdef CONFIG_HOTPLUG_CPU
+ /* This is akin to using 'nr_cpus' on the Linux command line.
+ * Which is OK as when we use 'dom0_max_vcpus=X' we can only
+ * have up to X, while nr_cpu_ids is greater than X. This
+ * normally is not a problem, except when CPU hotplugging
+ * is involved and then there might be more than X CPUs
+ * in the guest - which will not work as there is no
+ * hypercall to expand the max number of VCPUs an already
+ * running guest has. So cap it up to X. */
+ if (subtract)
+ nr_cpu_ids = nr_cpu_ids - subtract;
+#endif
+
+}
+
static void __init xen_smp_prepare_boot_cpu(void)
{
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
- /* We've switched to the "real" per-cpu gdt, so make sure the
- old memory can be recycled */
- make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
+ if (xen_pv_domain()) {
+ if (!xen_feature(XENFEAT_writable_page_tables))
+ /* We've switched to the "real" per-cpu gdt, so make
+ * sure the old memory can be recycled. */
+ make_lowmem_page_readwrite(xen_initial_gdt);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Xen starts us with XEN_FLAT_RING1_DS, but linux code
+ * expects __USER_DS
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
- xen_setup_vcpu_info_placement();
+ xen_filter_cpu_maps();
+ xen_setup_vcpu_info_placement();
+ }
+ /*
+ * The alternative logic (which patches the unlock/lock) runs before
+ * the smp bootup up code is activated. Hence we need to set this up
+ * the core kernel is being patched. Otherwise we will have only
+ * modules patched but not core code.
+ */
+ xen_init_spinlocks();
}
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
+ unsigned int i;
+
+ if (skip_ioapic_setup) {
+ char *m = (max_cpus == 0) ?
+ "The nosmp parameter is incompatible with Xen; " \
+ "use Xen dom0_max_vcpus=1 parameter" :
+ "The noapic parameter is incompatible with Xen";
+ xen_raw_printk(m);
+ panic(m);
+ }
xen_init_lock_cpu(0);
- smp_store_cpu_info(0);
+ smp_store_boot_cpu_info();
cpu_data(0).x86_max_cores = 1;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ }
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
@@ -201,28 +346,19 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
continue;
- cpu_clear(cpu, cpu_possible_map);
+ set_cpu_possible(cpu, false);
}
- for_each_possible_cpu (cpu) {
- struct task_struct *idle;
-
- if (cpu == 0)
- continue;
-
- idle = fork_idle(cpu);
- if (IS_ERR(idle))
- panic("failed fork for CPU %d", cpu);
-
- cpu_set(cpu, cpu_present_map);
- }
+ for_each_possible_cpu(cpu)
+ set_cpu_present(cpu, true);
}
-static __cpuinit int
+static int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct desc_struct *gdt;
+ unsigned long gdt_mfn;
if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
@@ -233,44 +369,62 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
gdt = get_cpu_gdt_table(cpu);
- ctxt->flags = VGCF_IN_KERNEL;
- ctxt->user_regs.ds = __USER_DS;
- ctxt->user_regs.es = __USER_DS;
- ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
+ /* Note: PVH is not yet supported on x86_32. */
ctxt->user_regs.fs = __KERNEL_PERCPU;
+ ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
- ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
- xen_copy_trap_info(ctxt->trap_ctxt);
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.ss = __KERNEL_DS;
- ctxt->ldt_ents = 0;
+ xen_copy_trap_info(ctxt->trap_ctxt);
- BUG_ON((unsigned long)gdt & ~PAGE_MASK);
- make_lowmem_page_readonly(gdt);
+ ctxt->ldt_ents = 0;
- ctxt->gdt_frames[0] = virt_to_mfn(gdt);
- ctxt->gdt_ents = GDT_ENTRIES;
+ BUG_ON((unsigned long)gdt & ~PAGE_MASK);
- ctxt->user_regs.cs = __KERNEL_CS;
- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+ gdt_mfn = arbitrary_virt_to_mfn(gdt);
+ make_lowmem_page_readonly(gdt);
+ make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
- ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.sp0;
+ ctxt->gdt_frames[0] = gdt_mfn;
+ ctxt->gdt_ents = GDT_ENTRIES;
+
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
- ctxt->event_callback_cs = __KERNEL_CS;
- ctxt->failsafe_callback_cs = __KERNEL_CS;
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
+#else
+ ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
- ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
- ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
-
- per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+ ctxt->event_callback_eip =
+ (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_eip =
+ (unsigned long)xen_failsafe_callback;
+ ctxt->user_regs.cs = __KERNEL_CS;
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+#ifdef CONFIG_X86_32
+ }
+#else
+ } else
+ /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
+ * %rdi having the cpu number - which means are passing in
+ * as the first parameter the cpu. Subtle!
+ */
+ ctxt->user_regs.rdi = cpu;
+#endif
+ ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
-
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
@@ -278,29 +432,21 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
}
-static int __cpuinit xen_cpu_up(unsigned int cpu)
+static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
{
- struct task_struct *idle = idle_task(cpu);
int rc;
-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- WARN_ON(cpu == 0);
- if (cpu > 0) {
- rc = get_local_pda(cpu);
- if (rc)
- return rc;
- }
-#endif
-
-#ifdef CONFIG_X86_32
- init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
+#ifdef CONFIG_X86_32
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
#endif
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(idle) -
+ KERNEL_STACK_OFFSET + THREAD_SIZE;
+
+ xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
@@ -314,7 +460,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
return rc;
if (num_online_cpus() == 1)
- alternatives_smp_switch(1);
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
rc = xen_smp_intr_init(cpu);
if (rc)
@@ -324,7 +471,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
BUG_ON(rc);
while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
barrier();
}
@@ -350,26 +497,27 @@ static int xen_cpu_disable(void)
static void xen_cpu_die(unsigned int cpu)
{
- while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
+ while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
- unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
+ xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
-
- if (num_online_cpus() == 1)
- alternatives_smp_switch(0);
}
-static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */
+static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
cpu_bringup();
+ /*
+ * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
+ * clears certain data that the cpu_idle loop (which called us
+ * and that we return from) expects. The only way to get that
+ * data back is to call:
+ */
+ tick_nohz_idle_enter();
}
#else /* !CONFIG_HOTPLUG_CPU */
@@ -397,13 +545,15 @@ static void stop_self(void *v)
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
+ set_cpu_online(cpu, false);
+
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
-static void xen_smp_send_stop(void)
+static void xen_stop_other_cpus(int wait)
{
- smp_call_function(stop_self, NULL, 0);
+ smp_call_function(stop_self, NULL, wait);
}
static void xen_smp_send_reschedule(int cpu)
@@ -411,8 +561,8 @@ static void xen_smp_send_reschedule(int cpu)
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
-static void xen_send_IPI_mask(const struct cpumask *mask,
- enum ipi_vector vector)
+static void __xen_send_IPI_mask(const struct cpumask *mask,
+ int vector)
{
unsigned cpu;
@@ -424,12 +574,12 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
{
int cpu;
- xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+ __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
/* Make sure other vcpus get a chance to run if they need to. */
for_each_cpu(cpu, mask) {
if (xen_vcpu_stolen(cpu)) {
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
break;
}
}
@@ -437,19 +587,95 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
static void xen_smp_send_call_function_single_ipi(int cpu)
{
- xen_send_IPI_mask(cpumask_of(cpu),
+ __xen_send_IPI_mask(cpumask_of(cpu),
XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
+static inline int xen_map_vector(int vector)
+{
+ int xen_vector;
+
+ switch (vector) {
+ case RESCHEDULE_VECTOR:
+ xen_vector = XEN_RESCHEDULE_VECTOR;
+ break;
+ case CALL_FUNCTION_VECTOR:
+ xen_vector = XEN_CALL_FUNCTION_VECTOR;
+ break;
+ case CALL_FUNCTION_SINGLE_VECTOR:
+ xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
+ break;
+ case IRQ_WORK_VECTOR:
+ xen_vector = XEN_IRQ_WORK_VECTOR;
+ break;
+#ifdef CONFIG_X86_64
+ case NMI_VECTOR:
+ case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */
+ xen_vector = XEN_NMI_VECTOR;
+ break;
+#endif
+ default:
+ xen_vector = -1;
+ printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
+ vector);
+ }
+
+ return xen_vector;
+}
+
+void xen_send_IPI_mask(const struct cpumask *mask,
+ int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ __xen_send_IPI_mask(mask, xen_vector);
+}
+
+void xen_send_IPI_all(int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ __xen_send_IPI_mask(cpu_online_mask, xen_vector);
+}
+
+void xen_send_IPI_self(int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ xen_send_IPI_one(smp_processor_id(), xen_vector);
+}
+
+void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
+{
+ unsigned cpu;
+ unsigned int this_cpu = smp_processor_id();
+ int xen_vector = xen_map_vector(vector);
+
+ if (!(num_online_cpus() > 1) || (xen_vector < 0))
+ return;
+
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ xen_send_IPI_one(cpu, xen_vector);
+ }
+}
+
+void xen_send_IPI_allbutself(int vector)
+{
+ xen_send_IPI_mask_allbutself(cpu_online_mask, vector);
+}
+
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
@@ -459,17 +685,23 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
}
-static const struct smp_ops xen_smp_ops __initdata = {
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
+{
+ irq_enter();
+ irq_work_run();
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_exit();
+
+ return IRQ_HANDLED;
+}
+
+static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
.smp_cpus_done = xen_smp_cpus_done,
@@ -479,7 +711,7 @@ static const struct smp_ops xen_smp_ops __initdata = {
.cpu_disable = xen_cpu_disable,
.play_dead = xen_play_dead,
- .smp_send_stop = xen_smp_send_stop,
+ .stop_other_cpus = xen_stop_other_cpus,
.smp_send_reschedule = xen_smp_send_reschedule,
.send_call_func_ipi = xen_smp_send_call_function_ipi,
@@ -490,5 +722,55 @@ void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
xen_fill_possible_map();
- xen_init_spinlocks();
+}
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ native_smp_prepare_cpus(max_cpus);
+ WARN_ON(xen_smp_intr_init(0));
+
+ xen_init_lock_cpu(0);
+}
+
+static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+ int rc;
+ /*
+ * xen_smp_intr_init() needs to run before native_cpu_up()
+ * so that IPI vectors are set up on the booting CPU before
+ * it is marked online in native_cpu_up().
+ */
+ rc = xen_smp_intr_init(cpu);
+ WARN_ON(rc);
+ if (!rc)
+ rc = native_cpu_up(cpu, tidle);
+
+ /*
+ * We must initialize the slowpath CPU kicker _after_ the native
+ * path has executed. If we initialized it before none of the
+ * unlocker IPI kicks would reach the booting CPU as the booting
+ * CPU had not set itself 'online' in cpu_online_mask. That mask
+ * is checked when IPIs are sent (on HVM at least).
+ */
+ xen_init_lock_cpu(cpu);
+ return rc;
+}
+
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+ xen_cpu_die(cpu);
+ native_cpu_die(cpu);
+}
+
+void __init xen_hvm_smp_init(void)
+{
+ if (!xen_have_vector_callback)
+ return;
+ smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+ smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+ smp_ops.cpu_up = xen_hvm_cpu_up;
+ smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+ smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+ smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
}
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
new file mode 100644
index 00000000000..c7c2d89efd7
--- /dev/null
+++ b/arch/x86/xen/smp.h
@@ -0,0 +1,11 @@
+#ifndef _XEN_SMP_H
+
+extern void xen_send_IPI_mask(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_allbutself(int vector);
+extern void xen_send_IPI_all(int vector);
+extern void xen_send_IPI_self(int vector);
+
+#endif
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 5601506f2dd..0ba5f3b967f 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -6,6 +6,8 @@
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
#include <asm/paravirt.h>
@@ -15,45 +17,44 @@
#include "xen-ops.h"
#include "debugfs.h"
-#ifdef CONFIG_XEN_DEBUG_FS
-static struct xen_spinlock_stats
-{
- u64 taken;
- u32 taken_slow;
- u32 taken_slow_nested;
- u32 taken_slow_pickup;
- u32 taken_slow_spurious;
- u32 taken_slow_irqenable;
+enum xen_contention_stat {
+ TAKEN_SLOW,
+ TAKEN_SLOW_PICKUP,
+ TAKEN_SLOW_SPURIOUS,
+ RELEASED_SLOW,
+ RELEASED_SLOW_KICKED,
+ NR_CONTENTION_STATS
+};
- u64 released;
- u32 released_slow;
- u32 released_slow_kicked;
+#ifdef CONFIG_XEN_DEBUG_FS
#define HISTO_BUCKETS 30
- u32 histo_spin_total[HISTO_BUCKETS+1];
- u32 histo_spin_spinning[HISTO_BUCKETS+1];
+static struct xen_spinlock_stats
+{
+ u32 contention_stats[NR_CONTENTION_STATS];
u32 histo_spin_blocked[HISTO_BUCKETS+1];
-
- u64 time_total;
- u64 time_spinning;
u64 time_blocked;
} spinlock_stats;
static u8 zero_stats;
-static unsigned lock_timeout = 1 << 10;
-#define TIMEOUT lock_timeout
-
static inline void check_zero(void)
{
- if (unlikely(zero_stats)) {
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- zero_stats = 0;
+ u8 ret;
+ u8 old = ACCESS_ONCE(zero_stats);
+ if (unlikely(old)) {
+ ret = cmpxchg(&zero_stats, old, 0);
+ /* This ensures only one fellow resets the stat */
+ if (ret == old)
+ memset(&spinlock_stats, 0, sizeof(spinlock_stats));
}
}
-#define ADD_STATS(elem, val) \
- do { check_zero(); spinlock_stats.elem += (val); } while(0)
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+ check_zero();
+ spinlock_stats.contention_stats[var] += val;
+}
static inline u64 spin_time_start(void)
{
@@ -72,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array)
array[HISTO_BUCKETS]++;
}
-static inline void spin_time_accum_spinning(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
- spinlock_stats.time_spinning += delta;
-}
-
-static inline void spin_time_accum_total(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_total);
- spinlock_stats.time_total += delta;
-}
-
static inline void spin_time_accum_blocked(u64 start)
{
u32 delta = xen_clocksource_read() - start;
@@ -96,262 +81,167 @@ static inline void spin_time_accum_blocked(u64 start)
spinlock_stats.time_blocked += delta;
}
#else /* !CONFIG_XEN_DEBUG_FS */
-#define TIMEOUT (1 << 10)
-#define ADD_STATS(elem, val) do { (void)(val); } while(0)
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+}
static inline u64 spin_time_start(void)
{
return 0;
}
-static inline void spin_time_accum_total(u64 start)
-{
-}
-static inline void spin_time_accum_spinning(u64 start)
-{
-}
static inline void spin_time_accum_blocked(u64 start)
{
}
#endif /* CONFIG_XEN_DEBUG_FS */
-struct xen_spinlock {
- unsigned char lock; /* 0 -> free; 1 -> locked */
- unsigned short spinners; /* count of waiting cpus */
+struct xen_lock_waiting {
+ struct arch_spinlock *lock;
+ __ticket_t want;
};
-static int xen_spin_is_locked(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- return xl->lock != 0;
-}
-
-static int xen_spin_is_contended(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- /* Not strictly true; this is only the count of contended
- lock-takers entering the slow path. */
- return xl->spinners != 0;
-}
-
-static int xen_spin_trylock(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- u8 old = 1;
-
- asm("xchgb %b0,%1"
- : "+q" (old), "+m" (xl->lock) : : "memory");
-
- return old == 0;
-}
-
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
-
-/*
- * Mark a cpu as interested in a lock. Returns the CPU's previous
- * lock of interest, in case we got preempted by an interrupt.
- */
-static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
-{
- struct xen_spinlock *prev;
-
- prev = __get_cpu_var(lock_spinners);
- __get_cpu_var(lock_spinners) = xl;
-
- wmb(); /* set lock of interest before count */
-
- asm(LOCK_PREFIX " incw %0"
- : "+m" (xl->spinners) : : "memory");
-
- return prev;
-}
+static DEFINE_PER_CPU(char *, irq_name);
+static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
+static cpumask_t waiting_cpus;
-/*
- * Mark a cpu as no longer interested in a lock. Restores previous
- * lock of interest (NULL for none).
- */
-static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
+static bool xen_pvspin = true;
+__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
- asm(LOCK_PREFIX " decw %0"
- : "+m" (xl->spinners) : : "memory");
- wmb(); /* decrement count before restoring lock */
- __get_cpu_var(lock_spinners) = prev;
-}
-
-static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- struct xen_spinlock *prev;
- int irq = __get_cpu_var(lock_kicker_irq);
- int ret;
- unsigned long flags;
+ int irq = __this_cpu_read(lock_kicker_irq);
+ struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
+ int cpu = smp_processor_id();
u64 start;
+ unsigned long flags;
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1)
- return 0;
+ return;
start = spin_time_start();
- /* announce we're spinning */
- prev = spinning_lock(xl);
+ /*
+ * Make sure an interrupt handler can't upset things in a
+ * partially setup state.
+ */
+ local_irq_save(flags);
+ /*
+ * We don't really care if we're overwriting some other
+ * (lock,want) pair, as that would mean that we're currently
+ * in an interrupt context, and the outer context had
+ * interrupts enabled. That has already kicked the VCPU out
+ * of xen_poll_irq(), so it will just return spuriously and
+ * retry with newly setup (lock,want).
+ *
+ * The ordering protocol on this is that the "lock" pointer
+ * may only be set non-NULL if the "want" ticket is correct.
+ * If we're updating "want", we must first clear "lock".
+ */
+ w->lock = NULL;
+ smp_wmb();
+ w->want = want;
+ smp_wmb();
+ w->lock = lock;
+
+ /* This uses set_bit, which atomic and therefore a barrier */
+ cpumask_set_cpu(cpu, &waiting_cpus);
+ add_stats(TAKEN_SLOW, 1);
+
+ /* clear pending */
+ xen_clear_irq_pending(irq);
+
+ /* Only check lock once pending cleared */
+ barrier();
- flags = __raw_local_save_flags();
- if (irq_enable) {
- ADD_STATS(taken_slow_irqenable, 1);
- raw_local_irq_enable();
+ /*
+ * Mark entry to slowpath before doing the pickup test to make
+ * sure we don't deadlock with an unlocker.
+ */
+ __ticket_enter_slowpath(lock);
+
+ /*
+ * check again make sure it didn't become free while
+ * we weren't looking
+ */
+ if (ACCESS_ONCE(lock->tickets.head) == want) {
+ add_stats(TAKEN_SLOW_PICKUP, 1);
+ goto out;
}
- ADD_STATS(taken_slow, 1);
- ADD_STATS(taken_slow_nested, prev != NULL);
-
- do {
- /* clear pending */
- xen_clear_irq_pending(irq);
-
- /* check again make sure it didn't become free while
- we weren't looking */
- ret = xen_spin_trylock(lock);
- if (ret) {
- ADD_STATS(taken_slow_pickup, 1);
-
- /*
- * If we interrupted another spinlock while it
- * was blocking, make sure it doesn't block
- * without rechecking the lock.
- */
- if (prev != NULL)
- xen_set_irq_pending(irq);
- goto out;
- }
-
- /*
- * Block until irq becomes pending. If we're
- * interrupted at this point (after the trylock but
- * before entering the block), then the nested lock
- * handler guarantees that the irq will be left
- * pending if there's any chance the lock became free;
- * xen_poll_irq() returns immediately if the irq is
- * pending.
- */
- xen_poll_irq(irq);
- ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
- } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
-
- kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
-
-out:
- raw_local_irq_restore(flags);
- unspinning_lock(xl, prev);
- spin_time_accum_blocked(start);
-
- return ret;
-}
-
-static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- unsigned timeout;
- u8 oldval;
- u64 start_spin;
-
- ADD_STATS(taken, 1);
-
- start_spin = spin_time_start();
-
- do {
- u64 start_spin_fast = spin_time_start();
+ /* Allow interrupts while blocked */
+ local_irq_restore(flags);
- timeout = TIMEOUT;
+ /*
+ * If an interrupt happens here, it will leave the wakeup irq
+ * pending, which will cause xen_poll_irq() to return
+ * immediately.
+ */
- asm("1: xchgb %1,%0\n"
- " testb %1,%1\n"
- " jz 3f\n"
- "2: rep;nop\n"
- " cmpb $0,%0\n"
- " je 1b\n"
- " dec %2\n"
- " jnz 2b\n"
- "3:\n"
- : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
- : "1" (1)
- : "memory");
+ /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+ xen_poll_irq(irq);
+ add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
- spin_time_accum_spinning(start_spin_fast);
+ local_irq_save(flags);
- } while (unlikely(oldval != 0 &&
- (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
+ kstat_incr_irq_this_cpu(irq);
+out:
+ cpumask_clear_cpu(cpu, &waiting_cpus);
+ w->lock = NULL;
- spin_time_accum_total(start_spin);
-}
+ local_irq_restore(flags);
-static void xen_spin_lock(struct raw_spinlock *lock)
-{
- __xen_spin_lock(lock, false);
-}
-
-static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
-{
- __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
+ spin_time_accum_blocked(start);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
-static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
{
int cpu;
- ADD_STATS(released_slow, 1);
+ add_stats(RELEASED_SLOW, 1);
- for_each_online_cpu(cpu) {
- /* XXX should mix up next cpu selection */
- if (per_cpu(lock_spinners, cpu) == xl) {
- ADD_STATS(released_slow_kicked, 1);
+ for_each_cpu(cpu, &waiting_cpus) {
+ const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
+
+ /* Make sure we read lock before want */
+ if (ACCESS_ONCE(w->lock) == lock &&
+ ACCESS_ONCE(w->want) == next) {
+ add_stats(RELEASED_SLOW_KICKED, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
break;
}
}
}
-static void xen_spin_unlock(struct raw_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- ADD_STATS(released, 1);
-
- smp_wmb(); /* make sure no writes get moved after unlock */
- xl->lock = 0; /* release lock */
-
- /* make sure unlock happens before kick */
- barrier();
-
- if (unlikely(xl->spinners))
- xen_spin_unlock_slow(xl);
-}
-
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
return IRQ_HANDLED;
}
-void __cpuinit xen_init_lock_cpu(int cpu)
+void xen_init_lock_cpu(int cpu)
{
int irq;
- const char *name;
+ char *name;
+
+ if (!xen_pvspin)
+ return;
+
+ WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
+ cpu, per_cpu(lock_kicker_irq, cpu));
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
name,
NULL);
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
+ per_cpu(irq_name, cpu) = name;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
@@ -359,18 +249,61 @@ void __cpuinit xen_init_lock_cpu(int cpu)
void xen_uninit_lock_cpu(int cpu)
{
+ if (!xen_pvspin)
+ return;
+
unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+ per_cpu(lock_kicker_irq, cpu) = -1;
+ kfree(per_cpu(irq_name, cpu));
+ per_cpu(irq_name, cpu) = NULL;
}
+
+/*
+ * Our init of PV spinlocks is split in two init functions due to us
+ * using paravirt patching and jump labels patching and having to do
+ * all of this before SMP code is invoked.
+ *
+ * The paravirt patching needs to be done _before_ the alternative asm code
+ * is started, otherwise we would not patch the core kernel code.
+ */
void __init xen_init_spinlocks(void)
{
- pv_lock_ops.spin_is_locked = xen_spin_is_locked;
- pv_lock_ops.spin_is_contended = xen_spin_is_contended;
- pv_lock_ops.spin_lock = xen_spin_lock;
- pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
- pv_lock_ops.spin_trylock = xen_spin_trylock;
- pv_lock_ops.spin_unlock = xen_spin_unlock;
+
+ if (!xen_pvspin) {
+ printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
+ return;
+ }
+ printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
+ pv_lock_ops.unlock_kick = xen_unlock_kick;
+}
+
+/*
+ * While the jump_label init code needs to happend _after_ the jump labels are
+ * enabled and before SMP is started. Hence we use pre-SMP initcall level
+ * init. We cannot do it in xen_init_spinlocks as that is done before
+ * jump labels are activated.
+ */
+static __init int xen_init_spinlocks_jump(void)
+{
+ if (!xen_pvspin)
+ return 0;
+
+ if (!xen_domain())
+ return 0;
+
+ static_key_slow_inc(&paravirt_ticketlocks_enabled);
+ return 0;
+}
+early_initcall(xen_init_spinlocks_jump);
+
+static __init int xen_parse_nopvspin(char *arg)
+{
+ xen_pvspin = false;
+ return 0;
}
+early_param("xen_nopvspin", xen_parse_nopvspin);
#ifdef CONFIG_XEN_DEBUG_FS
@@ -383,43 +316,30 @@ static int __init xen_spinlock_debugfs(void)
if (d_xen == NULL)
return -ENOMEM;
+ if (!xen_pvspin)
+ return 0;
+
d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
- debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
-
- debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.taken_slow);
- debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_nested);
+ &spinlock_stats.contention_stats[TAKEN_SLOW]);
debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_pickup);
+ &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_spurious);
- debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_irqenable);
+ &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
- debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.released_slow);
+ &spinlock_stats.contention_stats[RELEASED_SLOW]);
debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.released_slow_kicked);
+ &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
- debugfs_create_u64("time_spinning", 0444, d_spin_debug,
- &spinlock_stats.time_spinning);
debugfs_create_u64("time_blocked", 0444, d_spin_debug,
&spinlock_stats.time_blocked);
- debugfs_create_u64("time_total", 0444, d_spin_debug,
- &spinlock_stats.time_total);
-
- xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
- spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
- xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
- spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
- xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
- spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+
+ debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+ spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
return 0;
}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 212ffe012b7..c4df9dbd63b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,4 +1,5 @@
#include <linux/types.h>
+#include <linux/clockchips.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
@@ -6,12 +7,15 @@
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
+#include <asm/fixmap.h>
#include "xen-ops.h"
#include "mmu.h"
-void xen_pre_suspend(void)
+static void xen_pv_pre_suspend(void)
{
+ xen_mm_pin_all();
+
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
mfn_to_pfn(xen_start_info->console.domU.mfn);
@@ -24,8 +28,25 @@ void xen_pre_suspend(void)
BUG();
}
-void xen_post_suspend(int suspend_cancelled)
+static void xen_hvm_post_suspend(int suspend_cancelled)
{
+#ifdef CONFIG_XEN_PVHVM
+ int cpu;
+ xen_hvm_init_shared_info();
+ xen_callback_vector();
+ xen_unplug_emulated_devices();
+ if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ for_each_online_cpu(cpu) {
+ xen_setup_runstate_info(cpu);
+ }
+ }
+#endif
+}
+
+static void xen_pv_post_suspend(int suspend_cancelled)
+{
+ xen_build_mfn_list_list();
+
xen_setup_shared_info();
if (suspend_cancelled) {
@@ -41,9 +62,36 @@ void xen_post_suspend(int suspend_cancelled)
xen_vcpu_restore();
}
+ xen_mm_unpin_all();
+}
+
+void xen_arch_pre_suspend(void)
+{
+ if (xen_pv_domain())
+ xen_pv_pre_suspend();
+}
+
+void xen_arch_post_suspend(int cancelled)
+{
+ if (xen_pv_domain())
+ xen_pv_post_suspend(cancelled);
+ else
+ xen_hvm_post_suspend(cancelled);
+}
+
+static void xen_vcpu_notify_restore(void *data)
+{
+ unsigned long reason = (unsigned long)data;
+
+ /* Boot processor notified via generic timekeeping_resume() */
+ if ( smp_processor_id() == 0)
+ return;
+
+ clockevents_notify(reason, NULL);
}
void xen_arch_resume(void)
{
- /* nothing */
+ on_each_cpu(xen_vcpu_notify_restore,
+ (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 14f24062349..7b78f88c170 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -13,32 +13,33 @@
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/pvclock_gtod.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
+#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include "xen-ops.h"
-#define XEN_SHIFT 22
-
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
/* runstate info updated by Xen */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
/* snapshots of runstate info */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
-/* unused ns of stolen and blocked time */
-static DEFINE_PER_CPU(u64, residual_stolen);
-static DEFINE_PER_CPU(u64, residual_blocked);
+/* unused ns of stolen time */
+static DEFINE_PER_CPU(u64, xen_residual_stolen);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
@@ -79,7 +80,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
BUG_ON(preemptible());
- state = &__get_cpu_var(runstate);
+ state = &__get_cpu_var(xen_runstate);
/*
* The runstate info is always updated by the hypervisor on
@@ -97,14 +98,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
- return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
+ return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
}
-static void setup_runstate_info(int cpu)
+void xen_setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
- area.addr.v = &per_cpu(runstate, cpu);
+ area.addr.v = &per_cpu(xen_runstate, cpu);
if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
cpu, &area))
@@ -115,17 +116,16 @@ static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
- s64 blocked, runnable, offline, stolen;
+ s64 runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
- snap = &__get_cpu_var(runstate_snapshot);
+ snap = &__get_cpu_var(xen_runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
- blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
@@ -133,68 +133,18 @@ static void do_stolen_accounting(void)
/* Add the appropriate number of ticks of stolen time,
including any left-overs from last time. */
- stolen = runnable + offline + __get_cpu_var(residual_stolen);
+ stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
if (stolen < 0)
stolen = 0;
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
- __get_cpu_var(residual_stolen) = stolen;
+ __this_cpu_write(xen_residual_stolen, stolen);
account_steal_ticks(ticks);
-
- /* Add the appropriate number of ticks of blocked time,
- including any left-overs from last time. */
- blocked += __get_cpu_var(residual_blocked);
-
- if (blocked < 0)
- blocked = 0;
-
- ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
- __get_cpu_var(residual_blocked) = blocked;
- account_idle_ticks(ticks);
-}
-
-/*
- * Xen sched_clock implementation. Returns the number of unstolen
- * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
- * states.
- */
-unsigned long long xen_sched_clock(void)
-{
- struct vcpu_runstate_info state;
- cycle_t now;
- u64 ret;
- s64 offset;
-
- /*
- * Ideally sched_clock should be called on a per-cpu basis
- * anyway, so preempt should already be disabled, but that's
- * not current practice at the moment.
- */
- preempt_disable();
-
- now = xen_clocksource_read();
-
- get_runstate_snapshot(&state);
-
- WARN_ON(state.state != RUNSTATE_running);
-
- offset = now - state.state_entry_time;
- if (offset < 0)
- offset = 0;
-
- ret = state.time[RUNSTATE_blocked] +
- state.time[RUNSTATE_running] +
- offset;
-
- preempt_enable();
-
- return ret;
}
-
/* Get the TSC speed from Xen */
-unsigned long xen_tsc_khz(void)
+static unsigned long xen_tsc_khz(void)
{
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -207,12 +157,18 @@ cycle_t xen_clocksource_read(void)
struct pvclock_vcpu_time_info *src;
cycle_t ret;
- src = &get_cpu_var(xen_vcpu)->time;
+ preempt_disable_notrace();
+ src = &__get_cpu_var(xen_vcpu)->time;
ret = pvclock_clocksource_read(src);
- put_cpu_var(xen_vcpu);
+ preempt_enable_notrace();
return ret;
}
+static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
+{
+ return xen_clocksource_read();
+}
+
static void xen_read_wallclock(struct timespec *ts)
{
struct shared_info *s = HYPERVISOR_shared_info;
@@ -224,27 +180,61 @@ static void xen_read_wallclock(struct timespec *ts)
put_cpu_var(xen_vcpu);
}
-unsigned long xen_get_wallclock(void)
+static void xen_get_wallclock(struct timespec *now)
{
- struct timespec ts;
-
- xen_read_wallclock(&ts);
- return ts.tv_sec;
+ xen_read_wallclock(now);
}
-int xen_set_wallclock(unsigned long now)
+static int xen_set_wallclock(const struct timespec *now)
{
- /* do nothing for domU */
return -1;
}
+static int xen_pvclock_gtod_notify(struct notifier_block *nb,
+ unsigned long was_set, void *priv)
+{
+ /* Protected by the calling core code serialization */
+ static struct timespec next_sync;
+
+ struct xen_platform_op op;
+ struct timespec now;
+
+ now = __current_kernel_time();
+
+ /*
+ * We only take the expensive HV call when the clock was set
+ * or when the 11 minutes RTC synchronization time elapsed.
+ */
+ if (!was_set && timespec_compare(&now, &next_sync) < 0)
+ return NOTIFY_OK;
+
+ op.cmd = XENPF_settime;
+ op.u.settime.secs = now.tv_sec;
+ op.u.settime.nsecs = now.tv_nsec;
+ op.u.settime.system_time = xen_clocksource_read();
+
+ (void)HYPERVISOR_dom0_op(&op);
+
+ /*
+ * Move the next drift compensation time 11 minutes
+ * ahead. That's emulating the sync_cmos_clock() update for
+ * the hardware RTC.
+ */
+ next_sync = now;
+ next_sync.tv_sec += 11 * 60;
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block xen_pvclock_gtod_notifier = {
+ .notifier_call = xen_pvclock_gtod_notify,
+};
+
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
- .read = xen_clocksource_read,
+ .read = xen_clocksource_get_cycles,
.mask = ~0,
- .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
- .shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -398,11 +388,16 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
static const struct clock_event_device *xen_clockevent =
&xen_timerop_clockevent;
-static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
+
+struct xen_clock_event_device {
+ struct clock_event_device evt;
+ char *name;
+};
+static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
- struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
+ struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt;
irqreturn_t ret;
ret = IRQ_NONE;
@@ -416,12 +411,31 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
return ret;
}
+void xen_teardown_timer(int cpu)
+{
+ struct clock_event_device *evt;
+ BUG_ON(cpu == 0);
+ evt = &per_cpu(xen_clock_events, cpu).evt;
+
+ if (evt->irq >= 0) {
+ unbind_from_irqhandler(evt->irq, NULL);
+ evt->irq = -1;
+ kfree(per_cpu(xen_clock_events, cpu).name);
+ per_cpu(xen_clock_events, cpu).name = NULL;
+ }
+}
+
void xen_setup_timer(int cpu)
{
- const char *name;
+ char *name;
struct clock_event_device *evt;
int irq;
+ evt = &per_cpu(xen_clock_events, cpu).evt;
+ WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
+ if (evt->irq >= 0)
+ xen_teardown_timer(cpu);
+
printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
name = kasprintf(GFP_KERNEL, "timer%d", cpu);
@@ -429,37 +443,32 @@ void xen_setup_timer(int cpu)
name = "<timer kasprintf failed>";
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
+ IRQF_FORCE_RESUME,
name, NULL);
+ (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
- evt = &per_cpu(xen_clock_events, cpu);
memcpy(evt, xen_clockevent, sizeof(*evt));
evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
-
- setup_runstate_info(cpu);
+ per_cpu(xen_clock_events, cpu).name = name;
}
-void xen_teardown_timer(int cpu)
-{
- struct clock_event_device *evt;
- BUG_ON(cpu == 0);
- evt = &per_cpu(xen_clock_events, cpu);
- unbind_from_irqhandler(evt->irq, NULL);
-}
void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
- clockevents_register_device(&__get_cpu_var(xen_clock_events));
+ clockevents_register_device(&__get_cpu_var(xen_clock_events).evt);
}
void xen_timer_resume(void)
{
int cpu;
+ pvclock_resume();
+
if (xen_clockevent != &xen_vcpuop_clockevent)
return;
@@ -469,11 +478,16 @@ void xen_timer_resume(void)
}
}
-__init void xen_time_init(void)
+static const struct pv_time_ops xen_time_ops __initconst = {
+ .sched_clock = xen_clocksource_read,
+};
+
+static void __init xen_time_init(void)
{
int cpu = smp_processor_id();
+ struct timespec tp;
- clocksource_register(&xen_clocksource);
+ clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
@@ -483,12 +497,66 @@ __init void xen_time_init(void)
}
/* Set initial system time with full resolution */
- xen_read_wallclock(&xtime);
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
+ xen_read_wallclock(&tp);
+ do_settimeofday(&tp);
setup_force_cpu_cap(X86_FEATURE_TSC);
+ xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
+
+ if (xen_initial_domain())
+ pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
+}
+
+void __init xen_init_time_ops(void)
+{
+ pv_time_ops = xen_time_ops;
+
+ x86_init.timers.timer_init = xen_time_init;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
+ x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ /* Dom0 uses the native method to set the hardware RTC. */
+ if (!xen_initial_domain())
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_setup_cpu_clockevents(void)
+{
+ int cpu = smp_processor_id();
+ xen_setup_runstate_info(cpu);
+ /*
+ * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
+ * doing it xen_hvm_cpu_notify (which gets called by smp_init during
+ * early bootup and also during CPU hotplug events).
+ */
+ xen_setup_cpu_clockevents();
+}
+
+void __init xen_hvm_init_time_ops(void)
+{
+ /* vector callback is needed otherwise we cannot receive interrupts
+ * on cpu > 0 and at this point we don't know how many cpus are
+ * available */
+ if (!xen_have_vector_callback)
+ return;
+ if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
+ "disable pv timer\n");
+ return;
+ }
+
+ pv_time_ops = xen_time_ops;
+ x86_init.timers.setup_percpu_clockev = xen_time_init;
+ x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
}
+#endif
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
new file mode 100644
index 00000000000..520022d1a18
--- /dev/null
+++ b/arch/x86/xen/trace.c
@@ -0,0 +1,62 @@
+#include <linux/ftrace.h>
+#include <xen/interface/xen.h>
+
+#define N(x) [__HYPERVISOR_##x] = "("#x")"
+static const char *xen_hypercall_names[] = {
+ N(set_trap_table),
+ N(mmu_update),
+ N(set_gdt),
+ N(stack_switch),
+ N(set_callbacks),
+ N(fpu_taskswitch),
+ N(sched_op_compat),
+ N(dom0_op),
+ N(set_debugreg),
+ N(get_debugreg),
+ N(update_descriptor),
+ N(memory_op),
+ N(multicall),
+ N(update_va_mapping),
+ N(set_timer_op),
+ N(event_channel_op_compat),
+ N(xen_version),
+ N(console_io),
+ N(physdev_op_compat),
+ N(grant_table_op),
+ N(vm_assist),
+ N(update_va_mapping_otherdomain),
+ N(iret),
+ N(vcpu_op),
+ N(set_segment_base),
+ N(mmuext_op),
+ N(acm_op),
+ N(nmi_op),
+ N(sched_op),
+ N(callback_op),
+ N(xenoprof_op),
+ N(event_channel_op),
+ N(physdev_op),
+ N(hvm_op),
+
+/* Architecture-specific hypercall definitions. */
+ N(arch_0),
+ N(arch_1),
+ N(arch_2),
+ N(arch_3),
+ N(arch_4),
+ N(arch_5),
+ N(arch_6),
+ N(arch_7),
+};
+#undef N
+
+static const char *xen_hypercall_name(unsigned op)
+{
+ if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL)
+ return xen_hypercall_names[op];
+
+ return "";
+}
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/xen.h>
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 00000000000..6722e3733f0
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,74 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+ struct screen_info *screen_info = &boot_params.screen_info;
+
+ /* This is drawn from a dump from vgacon:startup in
+ * standard Linux. */
+ screen_info->orig_video_mode = 3;
+ screen_info->orig_video_isVGA = 1;
+ screen_info->orig_video_lines = 25;
+ screen_info->orig_video_cols = 80;
+ screen_info->orig_video_ega_bx = 3;
+ screen_info->orig_video_points = 16;
+ screen_info->orig_y = screen_info->orig_video_lines - 1;
+
+ switch (info->video_type) {
+ case XEN_VGATYPE_TEXT_MODE_3:
+ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ + sizeof(info->u.text_mode_3))
+ break;
+ screen_info->orig_video_lines = info->u.text_mode_3.rows;
+ screen_info->orig_video_cols = info->u.text_mode_3.columns;
+ screen_info->orig_x = info->u.text_mode_3.cursor_x;
+ screen_info->orig_y = info->u.text_mode_3.cursor_y;
+ screen_info->orig_video_points =
+ info->u.text_mode_3.font_height;
+ break;
+
+ case XEN_VGATYPE_EFI_LFB:
+ case XEN_VGATYPE_VESA_LFB:
+ if (size < offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps))
+ break;
+ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+ screen_info->lfb_width = info->u.vesa_lfb.width;
+ screen_info->lfb_height = info->u.vesa_lfb.height;
+ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+ screen_info->red_size = info->u.vesa_lfb.red_size;
+ screen_info->red_pos = info->u.vesa_lfb.red_pos;
+ screen_info->green_size = info->u.vesa_lfb.green_size;
+ screen_info->green_pos = info->u.vesa_lfb.green_pos;
+ screen_info->blue_size = info->u.vesa_lfb.blue_size;
+ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+
+ if (info->video_type == XEN_VGATYPE_EFI_LFB) {
+ screen_info->orig_video_isVGA = VIDEO_TYPE_EFI;
+ break;
+ }
+
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps)
+ + sizeof(info->u.vesa_lfb.gbl_caps))
+ screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.mode_attrs)
+ + sizeof(info->u.vesa_lfb.mode_attrs))
+ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+ break;
+ }
+}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 00000000000..3e45aa00071
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,142 @@
+/*
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+
+#include "xen-asm.h"
+
+/*
+ * Enable events. This clears the event mask and tests the pending
+ * event status with one and operation. If there are pending events,
+ * then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
+
+ /* Test for pending */
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ jz 1f
+
+2: call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+ ret
+ ENDPROC(xen_irq_enable_direct)
+ RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+ * Disabling events is simply a matter of making the event mask
+ * non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+ ret
+ ENDPROC(xen_irq_disable_direct)
+ RELOC(xen_irq_disable_direct, 0)
+
+/*
+ * (xen_)save_fl is used to get the current interrupt enable status.
+ * Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ * may be set in the return value. We take advantage of this by
+ * making sure that X86_EFLAGS_IF has the right value (and other bits
+ * in that byte are 0), but other bits in the return value are
+ * undefined. We need to toggle the state of the bit, because Xen and
+ * x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ setz %ah
+ addb %ah, %ah
+ENDPATCH(xen_save_fl_direct)
+ ret
+ ENDPROC(xen_save_fl_direct)
+ RELOC(xen_save_fl_direct, 0)
+
+
+/*
+ * In principle the caller should be passing us a value return from
+ * xen_save_fl_direct, but for robustness sake we test only the
+ * X86_EFLAGS_IF flag rather than the whole byte. After setting the
+ * interrupt mask state, it checks for unmasked pending events and
+ * enters the hypervisor to get them delivered if so.
+ */
+ENTRY(xen_restore_fl_direct)
+#ifdef CONFIG_X86_64
+ testw $X86_EFLAGS_IF, %di
+#else
+ testb $X86_EFLAGS_IF>>8, %ah
+#endif
+ setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ jnz 1f
+2: call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+ ret
+ ENDPROC(xen_restore_fl_direct)
+ RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
+ */
+check_events:
+#ifdef CONFIG_X86_32
+ push %eax
+ push %ecx
+ push %edx
+ call xen_force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+#else
+ push %rax
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ call xen_force_evtchn_callback
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %rax
+#endif
+ ret
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 00000000000..465276467a4
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
+#ifndef _XEN_XEN_ASM_H
+#define _XEN_XEN_ASM_H
+
+#include <linux/linkage.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c..fd92a64d748 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,117 +1,44 @@
/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in pda) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
*/
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
-#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
+#include <asm/asm.h>
#include <xen/interface/xen.h>
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-/*
- Enable events. This clears the event mask and tests the pending
- event status with one and operation. If there are pending
- events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
- /* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- jz 1f
-
-2: call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
- ret
- ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
-
+#include "xen-asm.h"
/*
- Disabling events is simply a matter of making the event mask
- non-zero.
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
*/
-ENTRY(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
- ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
-
-/*
- (xen_)save_fl is used to get the current interrupt enable status.
- Callers expect the status to be in X86_EFLAGS_IF, and other bits
- may be set in the return value. We take advantage of this by
- making sure that X86_EFLAGS_IF has the right value (and other bits
- in that byte are 0), but other bits in the return value are
- undefined. We need to toggle the state of the bit, because
- Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- setz %ah
- addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
- ret
- ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
-
-
-/*
- In principle the caller should be passing us a value return
- from xen_save_fl_direct, but for robustness sake we test only
- the X86_EFLAGS_IF flag rather than the whole byte. After
- setting the interrupt mask state, it checks for unmasked
- pending events and enters the hypervisor to get them delivered
- if so.
- */
-ENTRY(xen_restore_fl_direct)
- testb $X86_EFLAGS_IF>>8, %ah
- setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- jz 1f
-2: call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
+check_events:
+ push %eax
+ push %ecx
+ push %edx
+ call xen_force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
ret
- ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
/*
- We can't use sysexit directly, because we're not running in ring0.
- But we can easily fake it up using iret. Assuming xen_sysexit
- is jumped to with a standard stack frame, we can just strip it
- back to a standard iret frame and use iret.
+ * We can't use sysexit directly, because we're not running in ring0.
+ * But we can easily fake it up using iret. Assuming xen_sysexit is
+ * jumped to with a standard stack frame, we can just strip it back to
+ * a standard iret frame and use iret.
*/
ENTRY(xen_sysexit)
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
@@ -122,34 +49,43 @@ ENTRY(xen_sysexit)
ENDPROC(xen_sysexit)
/*
- This is run where a normal iret would be run, with the same stack setup:
- 8: eflags
- 4: cs
- esp-> 0: eip
-
- This attempts to make sure that any pending events are dealt
- with on return to usermode, but there is a small window in
- which an event can happen just before entering usermode. If
- the nested interrupt ends up setting one of the TIF_WORK_MASK
- pending work flags, they will not be tested again before
- returning to usermode. This means that a process can end up
- with pending work, which will be unprocessed until the process
- enters and leaves the kernel again, which could be an
- unbounded amount of time. This means that a pending signal or
- reschedule event could be indefinitely delayed.
+ * This is run where a normal iret would be run, with the same stack setup:
+ * 8: eflags
+ * 4: cs
+ * esp-> 0: eip
+ *
+ * This attempts to make sure that any pending events are dealt with
+ * on return to usermode, but there is a small window in which an
+ * event can happen just before entering usermode. If the nested
+ * interrupt ends up setting one of the TIF_WORK_MASK pending work
+ * flags, they will not be tested again before returning to
+ * usermode. This means that a process can end up with pending work,
+ * which will be unprocessed until the process enters and leaves the
+ * kernel again, which could be an unbounded amount of time. This
+ * means that a pending signal or reschedule event could be
+ * indefinitely delayed.
+ *
+ * The fix is to notice a nested interrupt in the critical window, and
+ * if one occurs, then fold the nested interrupt into the current
+ * interrupt stack frame, and re-process it iteratively rather than
+ * recursively. This means that it will exit via the normal path, and
+ * all pending work will be dealt with appropriately.
+ *
+ * Because the nested interrupt handler needs to deal with the current
+ * stack state in whatever form its in, we keep things simple by only
+ * using a single register which is pushed/popped on the stack.
+ */
- The fix is to notice a nested interrupt in the critical
- window, and if one occurs, then fold the nested interrupt into
- the current interrupt stack frame, and re-process it
- iteratively rather than recursively. This means that it will
- exit via the normal path, and all pending work will be dealt
- with appropriately.
+.macro POP_FS
+1:
+ popw %fs
+.pushsection .fixup, "ax"
+2: movw $0, (%esp)
+ jmp 1b
+.popsection
+ _ASM_EXTABLE(1b,2b)
+.endm
- Because the nested interrupt handler needs to deal with the
- current stack state in whatever form its in, we keep things
- simple by only using a single register which is pushed/popped
- on the stack.
- */
ENTRY(xen_iret)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
@@ -158,52 +94,60 @@ ENTRY(xen_iret)
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
- /* Store vcpu_info pointer for easy access. Do it this
- way to avoid having to reload %fs */
+ /* Store vcpu_info pointer for easy access */
#ifdef CONFIG_SMP
- GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax),%eax
- movl __per_cpu_offset(,%eax,4),%eax
- mov per_cpu__xen_vcpu(%eax),%eax
+ pushw %fs
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+ movl %fs:xen_vcpu, %eax
+ POP_FS
#else
- movl per_cpu__xen_vcpu, %eax
+ movl %ss:xen_vcpu, %eax
#endif
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
- /* Maybe enable events. Once this happens we could get a
- recursive event, so the critical region starts immediately
- afterwards. However, if that happens we don't end up
- resuming the code, so we don't have to be worried about
- being preempted to another CPU. */
- setz XEN_vcpu_info_mask(%eax)
+ /*
+ * Maybe enable events. Once this happens we could get a
+ * recursive event, so the critical region starts immediately
+ * afterwards. However, if that happens we don't end up
+ * resuming the code, so we don't have to be worried about
+ * being preempted to another CPU.
+ */
+ setz %ss:XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
- cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+ cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)
- /* If there's something pending, mask events again so we
- can jump back into xen_hypervisor_callback */
- sete XEN_vcpu_info_mask(%eax)
+ /*
+ * If there's something pending, mask events again so we can
+ * jump back into xen_hypervisor_callback. Otherwise do not
+ * touch XEN_vcpu_info_mask.
+ */
+ jne 1f
+ movb $1, %ss:XEN_vcpu_info_mask(%eax)
- popl %eax
+1: popl %eax
- /* From this point on the registers are restored and the stack
- updated, so we don't need to worry about it if we're preempted */
+ /*
+ * From this point on the registers are restored and the stack
+ * updated, so we don't need to worry about it if we're
+ * preempted
+ */
iret_restore_end:
- /* Jump to hypervisor_callback after fixing up the stack.
- Events are masked, so jumping out of the critical
- region is OK. */
+ /*
+ * Jump to hypervisor_callback after fixing up the stack.
+ * Events are masked, so jumping out of the critical region is
+ * OK.
+ */
je xen_hypervisor_callback
1: iret
xen_iret_end_crit:
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
+ _ASM_EXTABLE(1b, iret_exc)
hyper_iret:
/* put this out of line since its very rarely used */
@@ -212,55 +156,55 @@ hyper_iret:
.globl xen_iret_start_crit, xen_iret_end_crit
/*
- This is called by xen_hypervisor_callback in entry.S when it sees
- that the EIP at the time of interrupt was between xen_iret_start_crit
- and xen_iret_end_crit. We're passed the EIP in %eax so we can do
- a more refined determination of what to do.
-
- The stack format at this point is:
- ----------------
- ss : (ss/esp may be present if we came from usermode)
- esp :
- eflags } outer exception info
- cs }
- eip }
- ---------------- <- edi (copy dest)
- eax : outer eax if it hasn't been restored
- ----------------
- eflags } nested exception info
- cs } (no ss/esp because we're nested
- eip } from the same ring)
- orig_eax }<- esi (copy src)
- - - - - - - - -
- fs }
- es }
- ds } SAVE_ALL state
- eax }
- : :
- ebx }<- esp
- ----------------
-
- In order to deliver the nested exception properly, we need to shift
- everything from the return addr up to the error code so it
- sits just under the outer exception info. This means that when we
- handle the exception, we do it in the context of the outer exception
- rather than starting a new one.
-
- The only caveat is that if the outer eax hasn't been
- restored yet (ie, it's still on stack), we need to insert
- its value into the SAVE_ALL state before going on, since
- it's usermode state which we eventually need to restore.
+ * This is called by xen_hypervisor_callback in entry.S when it sees
+ * that the EIP at the time of interrupt was between
+ * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
+ * %eax so we can do a more refined determination of what to do.
+ *
+ * The stack format at this point is:
+ * ----------------
+ * ss : (ss/esp may be present if we came from usermode)
+ * esp :
+ * eflags } outer exception info
+ * cs }
+ * eip }
+ * ---------------- <- edi (copy dest)
+ * eax : outer eax if it hasn't been restored
+ * ----------------
+ * eflags } nested exception info
+ * cs } (no ss/esp because we're nested
+ * eip } from the same ring)
+ * orig_eax }<- esi (copy src)
+ * - - - - - - - -
+ * fs }
+ * es }
+ * ds } SAVE_ALL state
+ * eax }
+ * : :
+ * ebx }<- esp
+ * ----------------
+ *
+ * In order to deliver the nested exception properly, we need to shift
+ * everything from the return addr up to the error code so it sits
+ * just under the outer exception info. This means that when we
+ * handle the exception, we do it in the context of the outer
+ * exception rather than starting a new one.
+ *
+ * The only caveat is that if the outer eax hasn't been restored yet
+ * (ie, it's still on stack), we need to insert its value into the
+ * SAVE_ALL state before going on, since it's usermode state which we
+ * eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/*
- Paranoia: Make sure we're really coming from kernel space.
- One could imagine a case where userspace jumps into the
- critical range address, but just before the CPU delivers a GP,
- it decides to deliver an interrupt instead. Unlikely?
- Definitely. Easy to avoid? Yes. The Intel documents
- explicitly say that the reported EIP for a bad jump is the
- jump instruction itself, not the destination, but some virtual
- environments get this wrong.
+ * Paranoia: Make sure we're really coming from kernel space.
+ * One could imagine a case where userspace jumps into the
+ * critical range address, but just before the CPU delivers a
+ * GP, it decides to deliver an interrupt instead. Unlikely?
+ * Definitely. Easy to avoid? Yes. The Intel documents
+ * explicitly say that the reported EIP for a bad jump is the
+ * jump instruction itself, not the destination, but some
+ * virtual environments get this wrong.
*/
movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
@@ -270,15 +214,17 @@ ENTRY(xen_iret_crit_fixup)
lea PT_ORIG_EAX(%esp), %esi
lea PT_EFLAGS(%esp), %edi
- /* If eip is before iret_restore_end then stack
- hasn't been restored yet. */
+ /*
+ * If eip is before iret_restore_end then stack
+ * hasn't been restored yet.
+ */
cmp $iret_restore_end, %eax
jae 1f
- movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
+ movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
movl %eax, PT_EAX(%esp)
- lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
+ lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
/* set up the copy */
1: std
@@ -286,20 +232,6 @@ ENTRY(xen_iret_crit_fixup)
rep movsl
cld
- lea 4(%edi),%esp /* point esp to new frame */
+ lea 4(%edi), %esp /* point esp to new frame */
2: jmp xen_do_upcall
-
-/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
- */
-check_events:
- push %eax
- push %ecx
- push %edx
- call xen_force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
- ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 05794c566e8..53adefda427 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,174 +1,45 @@
/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining. The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in pda) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
*/
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
-#include <asm/processor-flags.h>
#include <asm/errno.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-#if 1
-/*
- x86-64 does not yet support direct access to percpu variables
- via a segment override, so we just need to make sure this code
- never gets used
- */
-#define BUG ud2a
-#define PER_CPU_VAR(var, off) 0xdeadbeef
-#endif
-
-/*
- Enable events. This clears the event mask and tests the pending
- event status with one and operation. If there are pending
- events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
- BUG
-
- /* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
- jz 1f
-
-2: call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
- ret
- ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
-
-/*
- Disabling events is simply a matter of making the event mask
- non-zero.
- */
-ENTRY(xen_irq_disable_direct)
- BUG
-
- movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-ENDPATCH(xen_irq_disable_direct)
- ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
-
-/*
- (xen_)save_fl is used to get the current interrupt enable status.
- Callers expect the status to be in X86_EFLAGS_IF, and other bits
- may be set in the return value. We take advantage of this by
- making sure that X86_EFLAGS_IF has the right value (and other bits
- in that byte are 0), but other bits in the return value are
- undefined. We need to toggle the state of the bit, because
- Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
- BUG
-
- testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
- setz %ah
- addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
- ret
- ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
-
-/*
- In principle the caller should be passing us a value return
- from xen_save_fl_direct, but for robustness sake we test only
- the X86_EFLAGS_IF flag rather than the whole byte. After
- setting the interrupt mask state, it checks for unmasked
- pending events and enters the hypervisor to get them delivered
- if so.
- */
-ENTRY(xen_restore_fl_direct)
- BUG
-
- testb $X86_EFLAGS_IF>>8, %ah
- setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
- jz 1f
-2: call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
- ret
- ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
-
-
-/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
- */
-check_events:
- push %rax
- push %rcx
- push %rdx
- push %rsi
- push %rdi
- push %r8
- push %r9
- push %r10
- push %r11
- call xen_force_evtchn_callback
- pop %r11
- pop %r10
- pop %r9
- pop %r8
- pop %rdi
- pop %rsi
- pop %rdx
- pop %rcx
- pop %rax
- ret
+#include "xen-asm.h"
ENTRY(xen_adjust_exception_frame)
- mov 8+0(%rsp),%rcx
- mov 8+8(%rsp),%r11
+ mov 8+0(%rsp), %rcx
+ mov 8+8(%rsp), %r11
ret $16
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
- Xen64 iret frame:
-
- ss
- rsp
- rflags
- cs
- rip <-- standard iret frame
-
- flags
-
- rcx }
- r11 }<-- pushed by hypercall page
-rsp -> rax }
+ * Xen64 iret frame:
+ *
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip <-- standard iret frame
+ *
+ * flags
+ *
+ * rcx }
+ * r11 }<-- pushed by hypercall page
+ * rsp->rax }
*/
ENTRY(xen_iret)
pushq $0
@@ -177,8 +48,8 @@ ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
- sysexit is not used for 64-bit processes, so it's
- only ever used to return to 32-bit compat userspace.
+ * sysexit is not used for 64-bit processes, so it's only ever used to
+ * return to 32-bit compat userspace.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
- /* We're already on the usermode stack at this point, but still
- with the kernel gs, so we can easily switch back */
- movq %rsp, %gs:pda_oldrsp
- movq %gs:pda_kernelstack,%rsp
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
+ movq %rsp, PER_CPU_VAR(old_rsp)
+ movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
- pushq %gs:pda_oldrsp
+ pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER_CS
pushq %rcx
@@ -210,45 +83,46 @@ ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
- /* We're already on the usermode stack at this point, but still
- with the kernel gs, so we can easily switch back */
- movq %rsp, %gs:pda_oldrsp
- movq %gs:pda_kernelstack, %rsp
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
+ movq %rsp, PER_CPU_VAR(old_rsp)
+ movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER32_DS
- pushq %gs:pda_oldrsp
+ pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER32_CS
pushq %rcx
- pushq $VGCF_in_syscall
+ pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
- Xen handles syscall callbacks much like ordinary exceptions,
- which means we have:
- - kernel gs
- - kernel rsp
- - an iret-like stack frame on the stack (including rcx and r11):
- ss
- rsp
- rflags
- cs
- rip
- r11
- rsp-> rcx
-
- In all the entrypoints, we undo all that to make it look
- like a CPU-generated syscall/sysenter and jump to the normal
- entrypoint.
+ * Xen handles syscall callbacks much like ordinary exceptions, which
+ * means we have:
+ * - kernel gs
+ * - kernel rsp
+ * - an iret-like stack frame on the stack (including rcx and r11):
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip
+ * r11
+ * rsp->rcx
+ *
+ * In all the entrypoints, we undo all that to make it look like a
+ * CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
.macro undo_xen_syscall
- mov 0*8(%rsp),%rcx
- mov 1*8(%rsp),%r11
- mov 5*8(%rsp),%rsp
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
.endm
/* Normal 64-bit system call target */
@@ -275,9 +149,9 @@ ENDPROC(xen_sysenter_target)
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
- lea 16(%rsp), %rsp /* strip %rcx,%r11 */
+ lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
- pushq $VGCF_in_syscall
+ pushq $0
jmp hypercall_iret
ENDPROC(xen_syscall32_target)
ENDPROC(xen_sysenter_target)
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 63d49a523ed..485b6958554 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -8,11 +8,31 @@
#include <asm/boot.h>
#include <asm/asm.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <xen/interface/elfnote.h>
+#include <xen/interface/features.h>
#include <asm/xen/interface.h>
+#ifdef CONFIG_XEN_PVH
+#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
+/* Note the lack of 'hvm_callback_vector'. Older hypervisor will
+ * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
+ * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
+ */
+#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
+ (1 << XENFEAT_auto_translated_physmap) | \
+ (1 << XENFEAT_supervisor_mode_kernel) | \
+ (1 << XENFEAT_hvm_callback_vector))
+/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that
+ * up regardless whether this CONFIG option is enabled or not, but it
+ * clarifies what the right flags need to be.
+ */
+#else
+#define PVH_FEATURES_STR ""
+#define PVH_FEATURES (0)
+#endif
+
__INIT
ENTRY(startup_xen)
cld
@@ -28,9 +48,61 @@ ENTRY(startup_xen)
__FINIT
.pushsection .text
- .align PAGE_SIZE_asm
+ .balign PAGE_SIZE
ENTRY(hypercall_page)
- .skip PAGE_SIZE_asm
+#define NEXT_HYPERCALL(x) \
+ ENTRY(xen_hypercall_##x) \
+ .skip 32
+
+NEXT_HYPERCALL(set_trap_table)
+NEXT_HYPERCALL(mmu_update)
+NEXT_HYPERCALL(set_gdt)
+NEXT_HYPERCALL(stack_switch)
+NEXT_HYPERCALL(set_callbacks)
+NEXT_HYPERCALL(fpu_taskswitch)
+NEXT_HYPERCALL(sched_op_compat)
+NEXT_HYPERCALL(platform_op)
+NEXT_HYPERCALL(set_debugreg)
+NEXT_HYPERCALL(get_debugreg)
+NEXT_HYPERCALL(update_descriptor)
+NEXT_HYPERCALL(ni)
+NEXT_HYPERCALL(memory_op)
+NEXT_HYPERCALL(multicall)
+NEXT_HYPERCALL(update_va_mapping)
+NEXT_HYPERCALL(set_timer_op)
+NEXT_HYPERCALL(event_channel_op_compat)
+NEXT_HYPERCALL(xen_version)
+NEXT_HYPERCALL(console_io)
+NEXT_HYPERCALL(physdev_op_compat)
+NEXT_HYPERCALL(grant_table_op)
+NEXT_HYPERCALL(vm_assist)
+NEXT_HYPERCALL(update_va_mapping_otherdomain)
+NEXT_HYPERCALL(iret)
+NEXT_HYPERCALL(vcpu_op)
+NEXT_HYPERCALL(set_segment_base)
+NEXT_HYPERCALL(mmuext_op)
+NEXT_HYPERCALL(xsm_op)
+NEXT_HYPERCALL(nmi_op)
+NEXT_HYPERCALL(sched_op)
+NEXT_HYPERCALL(callback_op)
+NEXT_HYPERCALL(xenoprof_op)
+NEXT_HYPERCALL(event_channel_op)
+NEXT_HYPERCALL(physdev_op)
+NEXT_HYPERCALL(hvm_op)
+NEXT_HYPERCALL(sysctl)
+NEXT_HYPERCALL(domctl)
+NEXT_HYPERCALL(kexec_op)
+NEXT_HYPERCALL(tmem_op) /* 38 */
+ENTRY(xen_hypercall_rsvr)
+ .skip 320
+NEXT_HYPERCALL(mca) /* 48 */
+NEXT_HYPERCALL(arch_1)
+NEXT_HYPERCALL(arch_2)
+NEXT_HYPERCALL(arch_3)
+NEXT_HYPERCALL(arch_4)
+NEXT_HYPERCALL(arch_5)
+NEXT_HYPERCALL(arch_6)
+ .balign PAGE_SIZE
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
@@ -43,7 +115,10 @@ ENTRY(hypercall_page)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
- ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
+ ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
+ (1 << XENFEAT_writable_page_tables) |
+ (1 << XENFEAT_dom0))
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c..97d87659f77 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+extern void *xen_initial_gdt;
+
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
+DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
@@ -22,54 +25,92 @@ extern struct shared_info *HYPERVISOR_shared_info;
void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
+void xen_build_mfn_list_list(void);
+void xen_setup_machphys_mapping(void);
+void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_reserve_top(void);
+extern unsigned long xen_max_p2m_pfn;
+
+void xen_mm_pin_all(void);
+void xen_mm_unpin_all(void);
+void xen_set_pat(u64);
char * __init xen_memory_setup(void);
+char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
-void __init xen_init_IRQ(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
+void xen_callback_vector(void);
+void xen_hvm_init_shared_info(void);
+void xen_unplug_emulated_devices(void);
+
void __init xen_build_dynamic_phys_to_machine(void);
+unsigned long __init xen_revector_p2m_tree(void);
void xen_init_irq_ops(void);
void xen_setup_timer(int cpu);
+void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
-unsigned long xen_tsc_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
+void __init xen_init_time_ops(void);
+void __init xen_hvm_init_time_ops(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
bool xen_vcpu_stolen(int vcpu);
-void xen_mark_init_mm_pinned(void);
-
void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP
void xen_smp_init(void);
-
-void __init xen_init_spinlocks(void);
-__cpuinit void xen_init_lock_cpu(int cpu);
-void xen_uninit_lock_cpu(int cpu);
+void __init xen_hvm_smp_init(void);
extern cpumask_var_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
+#endif
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init xen_init_spinlocks(void);
+void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
+#else
+static inline void xen_init_spinlocks(void)
+{
+}
+static inline void xen_init_lock_cpu(int cpu)
+{
+}
+static inline void xen_uninit_lock_cpu(int cpu)
+{
+}
#endif
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+void __init xen_init_apic(void);
+#else
+static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
+ size_t size)
+{
+}
+static inline void __init xen_init_apic(void)
+{
+}
+#endif
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \
- ret name(__VA_ARGS__); \
- extern char name##_end[]; \
- extern char name##_reloc[] \
+ __visible ret name(__VA_ARGS__); \
+ extern char name##_end[] __visible; \
+ extern char name##_reloc[] __visible
DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
@@ -77,10 +118,13 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
-void xen_iret(void);
-void xen_sysexit(void);
-void xen_sysret32(void);
-void xen_sysret64(void);
-void xen_adjust_exception_frame(void);
+__visible void xen_iret(void);
+__visible void xen_sysexit(void);
+__visible void xen_sysret32(void);
+__visible void xen_sysret64(void);
+__visible void xen_adjust_exception_frame(void);
+
+extern int xen_panic_handler_init(void);
+void xen_pvh_secondary_vcpu_init(int cpu);
#endif /* XEN_OPS_H */