aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/xen
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig13
-rw-r--r--arch/x86/xen/enlighten.c373
-rw-r--r--arch/x86/xen/grant-table.c204
-rw-r--r--arch/x86/xen/irq.c38
-rw-r--r--arch/x86/xen/mmu.c427
-rw-r--r--arch/x86/xen/p2m.c352
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c4
-rw-r--r--arch/x86/xen/platform-pci-unplug.c79
-rw-r--r--arch/x86/xen/setup.c124
-rw-r--r--arch/x86/xen/smp.c282
-rw-r--r--arch/x86/xen/smp.h1
-rw-r--r--arch/x86/xen/spinlock.c456
-rw-r--r--arch/x86/xen/suspend.c25
-rw-r--r--arch/x86/xen/time.c127
-rw-r--r--arch/x86/xen/xen-asm_32.S33
-rw-r--r--arch/x86/xen/xen-head.S25
-rw-r--r--arch/x86/xen/xen-ops.h24
17 files changed, 1712 insertions, 875 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 131dacd2748..e88fda867a3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -4,10 +4,10 @@
config XEN
bool "Xen guest support"
- select PARAVIRT
+ depends on PARAVIRT
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
- depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
+ depends on X86_64 || (X86_32 && X86_PAE)
depends on X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
@@ -19,11 +19,6 @@ config XEN_DOM0
depends on XEN && PCI_XEN && SWIOTLB_XEN
depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
-# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
-# name in tools.
-config XEN_PRIVILEGED_GUEST
- def_bool XEN_DOM0
-
config XEN_PVHVM
def_bool y
depends on XEN && PCI && X86_LOCAL_APIC
@@ -51,3 +46,7 @@ config XEN_DEBUG_FS
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
+config XEN_PVH
+ bool "Support for running as a PVH guest"
+ depends on X86_64 && XEN && XEN_PVHVM
+ def_bool n
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 138e5667409..ffb101e4573 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
+#include <linux/edd.h>
#include <xen/xen.h>
#include <xen/events.h>
@@ -67,6 +68,7 @@
#include <asm/hypervisor.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>
+#include <asm/pat.h>
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
@@ -83,7 +85,29 @@
EXPORT_SYMBOL_GPL(hypercall_page);
+/*
+ * Pointer to the xen_vcpu_info structure or
+ * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
+ * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
+ * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
+ * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
+ * acknowledge pending events.
+ * Also more subtly it is used by the patched version of irq enable/disable
+ * e.g. xen_irq_enable_direct and xen_iret in PV mode.
+ *
+ * The desire to be able to do those mask/unmask operations as a single
+ * instruction by using the per-cpu offset held in %gs is the real reason
+ * vcpu info is in a per-cpu pointer and the original reason for this
+ * hypercall.
+ *
+ */
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+
+/*
+ * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
+ * hypercall. This can be used both in PV and PVHVM mode. The structure
+ * overrides the default per_cpu(xen_vcpu, cpu) value.
+ */
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
enum xen_domain_type xen_domain_type = XEN_NATIVE;
@@ -155,6 +179,21 @@ static void xen_vcpu_setup(int cpu)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+ /*
+ * This path is called twice on PVHVM - first during bootup via
+ * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
+ * hotplugged: cpu_up -> xen_hvm_cpu_notify.
+ * As we can only do the VCPUOP_register_vcpu_info once lets
+ * not over-write its result.
+ *
+ * For PV it is called during restore (xen_vcpu_restore) and bootup
+ * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
+ * use this function.
+ */
+ if (xen_hvm_domain()) {
+ if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
+ return;
+ }
if (cpu < MAX_VIRT_CPUS)
per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
@@ -170,7 +209,12 @@ static void xen_vcpu_setup(int cpu)
/* Check to see if the hypervisor will put the vcpu_info
structure where we want it, which allows direct access via
- a percpu-variable. */
+ a percpu-variable.
+ N.B. This hypercall can _only_ be called once per CPU. Subsequent
+ calls will error out with -EINVAL. This is due to the fact that
+ hypervisor has no unregister variant and this hypercall does not
+ allow to over-write info.mfn and info.offset.
+ */
err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
if (err) {
@@ -218,8 +262,9 @@ static void __init xen_banner(void)
struct xen_extraversion extra;
HYPERVISOR_xen_version(XENVER_extraversion, &extra);
- printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
- pv_info.name);
+ pr_info("Booting paravirtualized kernel %son %s\n",
+ xen_feature(XENFEAT_auto_translated_physmap) ?
+ "with PVH extensions " : "", pv_info.name);
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
@@ -383,11 +428,13 @@ static void __init xen_init_cpuid_mask(void)
if (!xen_initial_domain())
cpuid_leaf1_edx_mask &=
- ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
- (1 << X86_FEATURE_ACPI)); /* disable ACPI */
+ ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
+
+ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));
+
ax = 1;
cx = 0;
- xen_cpuid(&ax, &bx, &cx, &dx);
+ cpuid(1, &ax, &bx, &cx, &dx);
xsave_mask =
(1 << (X86_FEATURE_XSAVE % 32)) |
@@ -688,8 +735,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
addr = (unsigned long)xen_int3;
else if (addr == (unsigned long)stack_segment)
addr = (unsigned long)xen_stack_segment;
- else if (addr == (unsigned long)double_fault ||
- addr == (unsigned long)nmi) {
+ else if (addr == (unsigned long)double_fault) {
/* Don't need to handle these */
return 0;
#ifdef CONFIG_X86_MCE
@@ -700,7 +746,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
*/
;
#endif
- } else {
+ } else if (addr == (unsigned long)nmi)
+ /*
+ * Use the native version as well.
+ */
+ ;
+ else {
/* Some other trap using IST? */
if (WARN_ON(val->ist != 0))
return 0;
@@ -1092,8 +1143,9 @@ void xen_setup_vcpu_info_placement(void)
xen_vcpu_setup(cpu);
/* xen_vcpu_setup managed to place the vcpu_info within the
- percpu area for all cpus, so make use of it */
- if (have_vcpu_info_placement) {
+ * percpu area for all cpus, so make use of it. Note that for
+ * PVH we want to use native IRQ mechanism. */
+ if (have_vcpu_info_placement && !xen_pvh_domain()) {
pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1219,7 +1271,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.alloc_ldt = xen_alloc_ldt,
.free_ldt = xen_free_ldt,
- .store_gdt = native_store_gdt,
.store_idt = native_store_idt,
.store_tr = xen_store_tr,
@@ -1288,6 +1339,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
static struct notifier_block xen_panic_block = {
.notifier_call= xen_panic_event,
+ .priority = INT_MIN
};
int xen_panic_handler_init(void)
@@ -1305,13 +1357,102 @@ static const struct machine_ops xen_machine_ops __initconst = {
.emergency_restart = xen_emergency_restart,
};
+static void __init xen_boot_params_init_edd(void)
+{
+#if IS_ENABLED(CONFIG_EDD)
+ struct xen_platform_op op;
+ struct edd_info *edd_info;
+ u32 *mbr_signature;
+ unsigned nr;
+ int ret;
+
+ edd_info = boot_params.eddbuf;
+ mbr_signature = boot_params.edd_mbr_sig_buffer;
+
+ op.cmd = XENPF_firmware_info;
+
+ op.u.firmware_info.type = XEN_FW_DISK_INFO;
+ for (nr = 0; nr < EDDMAXNR; nr++) {
+ struct edd_info *info = edd_info + nr;
+
+ op.u.firmware_info.index = nr;
+ info->params.length = sizeof(info->params);
+ set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
+ &info->params);
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ break;
+
+#define C(x) info->x = op.u.firmware_info.u.disk_info.x
+ C(device);
+ C(version);
+ C(interface_support);
+ C(legacy_max_cylinder);
+ C(legacy_max_head);
+ C(legacy_sectors_per_track);
+#undef C
+ }
+ boot_params.eddbuf_entries = nr;
+
+ op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
+ for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
+ op.u.firmware_info.index = nr;
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ break;
+ mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
+ }
+ boot_params.edd_mbr_sig_buf_entries = nr;
+#endif
+}
+
/*
* Set up the GDT and segment registers for -fstack-protector. Until
* we do this, we have to be careful not to call any stack-protected
* function, which is most of the kernel.
+ *
+ * Note, that it is __ref because the only caller of this after init
+ * is PVH which is not going to use xen_load_gdt_boot or other
+ * __init functions.
*/
-static void __init xen_setup_stackprotector(void)
+static void __ref xen_setup_gdt(int cpu)
{
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef CONFIG_X86_64
+ unsigned long dummy;
+
+ load_percpu_segment(cpu); /* We need to access per-cpu area */
+ switch_to_new_gdt(cpu); /* GDT and GS set */
+
+ /* We are switching of the Xen provided GDT to our HVM mode
+ * GDT. The new GDT has __KERNEL_CS with CS.L = 1
+ * and we are jumping to reload it.
+ */
+ asm volatile ("pushq %0\n"
+ "leaq 1f(%%rip),%0\n"
+ "pushq %0\n"
+ "lretq\n"
+ "1:\n"
+ : "=&r" (dummy) : "0" (__KERNEL_CS));
+
+ /*
+ * While not needed, we also set the %es, %ds, and %fs
+ * to zero. We don't care about %ss as it is NULL.
+ * Strictly speaking this is not needed as Xen zeros those
+ * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
+ *
+ * Linux zeros them in cpu_init() and in secondary_startup_64
+ * (for BSP).
+ */
+ loadsegment(es, 0);
+ loadsegment(ds, 0);
+ loadsegment(fs, 0);
+#else
+ /* PVH: TODO Implement. */
+ BUG();
+#endif
+ return; /* PVH does not need any PV GDT ops. */
+ }
pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
pv_cpu_ops.load_gdt = xen_load_gdt_boot;
@@ -1322,8 +1463,60 @@ static void __init xen_setup_stackprotector(void)
pv_cpu_ops.load_gdt = xen_load_gdt;
}
+/*
+ * A PV guest starts with default flags that are not set for PVH, set them
+ * here asap.
+ */
+static void xen_pvh_set_cr_flags(int cpu)
+{
+
+ /* Some of these are setup in 'secondary_startup_64'. The others:
+ * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
+ * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
+ write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
+
+ if (!cpu)
+ return;
+ /*
+ * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
+ * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
+ */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ if (cpu_has_pge)
+ set_in_cr4(X86_CR4_PGE);
+}
+
+/*
+ * Note, that it is ref - because the only caller of this after init
+ * is PVH which is not going to use xen_load_gdt_boot or other
+ * __init functions.
+ */
+void __ref xen_pvh_secondary_vcpu_init(int cpu)
+{
+ xen_setup_gdt(cpu);
+ xen_pvh_set_cr_flags(cpu);
+}
+
+static void __init xen_pvh_early_guest_init(void)
+{
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (!xen_feature(XENFEAT_hvm_callback_vector))
+ return;
+
+ xen_have_vector_callback = 1;
+ xen_pvh_set_cr_flags(0);
+
+#ifdef CONFIG_X86_32
+ BUG(); /* PVH: Implement proper support. */
+#endif
+}
+
/* First C function to be called on Xen boot */
-asmlinkage void __init xen_start_kernel(void)
+asmlinkage __visible void __init xen_start_kernel(void)
{
struct physdev_set_iopl set_iopl;
int rc;
@@ -1333,15 +1526,21 @@ asmlinkage void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
+ xen_setup_features();
+ xen_pvh_early_guest_init();
xen_setup_machphys_mapping();
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
- pv_cpu_ops = xen_cpu_ops;
pv_apic_ops = xen_apic_ops;
+ if (!xen_pvh_domain())
+ pv_cpu_ops = xen_cpu_ops;
- x86_init.resources.memory_setup = xen_memory_setup;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
+ else
+ x86_init.resources.memory_setup = xen_memory_setup;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
@@ -1371,17 +1570,14 @@ asmlinkage void __init xen_start_kernel(void)
/* Work out if we support NX */
x86_configure_nx();
- xen_setup_features();
-
/* Get mfn list */
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- xen_build_dynamic_phys_to_machine();
+ xen_build_dynamic_phys_to_machine();
/*
* Set up kernel GDT and segment registers, mainly so that
* -fstack-protector code can be executed.
*/
- xen_setup_stackprotector();
+ xen_setup_gdt(0);
xen_init_irq_ops();
xen_init_cpuid_mask();
@@ -1417,7 +1613,14 @@ asmlinkage void __init xen_start_kernel(void)
*/
acpi_numa = -1;
#endif
-
+#ifdef CONFIG_X86_PAT
+ /*
+ * For right now disable the PAT. We should remove this once
+ * git commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1
+ * (xen/pat: Disable PAT support for now) is reverted.
+ */
+ pat_enabled = 0;
+#endif
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1443,19 +1646,23 @@ asmlinkage void __init xen_start_kernel(void)
/* set the limit of our address space */
xen_reserve_top();
- /* We used to do this in xen_arch_setup, but that is too late on AMD
- * were early_cpu_init (run before ->arch_setup()) calls early_amd_init
- * which pokes 0xcf8 port.
- */
- set_iopl.iopl = 1;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
- if (rc != 0)
- xen_raw_printk("physdev_op failed %d\n", rc);
+ /* PVH: runs at default kernel iopl of 0 */
+ if (!xen_pvh_domain()) {
+ /*
+ * We used to do this in xen_arch_setup, but that is too late
+ * on AMD were early_cpu_init (run before ->arch_setup()) calls
+ * early_amd_init which pokes 0xcf8 port.
+ */
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ xen_raw_printk("physdev_op failed %d\n", rc);
+ }
#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
- new_cpu_data.hard_math = 1;
+ set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
new_cpu_data.wp_works_ok = 1;
new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif
@@ -1500,6 +1707,8 @@ asmlinkage void __init xen_start_kernel(void)
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_smp_config = x86_init_noop;
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
+ xen_boot_params_init_edd();
}
#ifdef CONFIG_PCI
/* PCI BIOS service won't work from a PV guest. */
@@ -1517,72 +1726,54 @@ asmlinkage void __init xen_start_kernel(void)
#endif
}
-#ifdef CONFIG_XEN_PVHVM
-#define HVM_SHARED_INFO_ADDR 0xFE700000UL
-static struct shared_info *xen_hvm_shared_info;
-static unsigned long xen_hvm_sip_phys;
-static int xen_major, xen_minor;
-
-static void xen_hvm_connect_shared_info(unsigned long pfn)
+void __ref xen_hvm_init_shared_info(void)
{
+ int cpu;
struct xen_add_to_physmap xatp;
+ static struct shared_info *shared_info_page = 0;
+ if (!shared_info_page)
+ shared_info_page = (struct shared_info *)
+ extend_brk(PAGE_SIZE, PAGE_SIZE);
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
- xatp.gpfn = pfn;
+ xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
-}
-static void __init xen_hvm_set_shared_info(struct shared_info *sip)
-{
- int cpu;
-
- HYPERVISOR_shared_info = sip;
+ HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
* page, we use it in the event channel upcall and in some pvclock
* related functions. We don't need the vcpu_info placement
* optimizations because we don't use any pv_mmu or pv_irq op on
- * HVM. */
- for_each_online_cpu(cpu)
+ * HVM.
+ * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+ * online but xen_hvm_init_shared_info is run at resume time too and
+ * in that case multiple vcpus might be online. */
+ for_each_online_cpu(cpu) {
+ /* Leave it to be NULL. */
+ if (cpu >= MAX_VIRT_CPUS)
+ continue;
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
-}
-
-/* Reconnect the shared_info pfn to a (new) mfn */
-void xen_hvm_resume_shared_info(void)
-{
- xen_hvm_connect_shared_info(xen_hvm_sip_phys >> PAGE_SHIFT);
-}
-
-/* Xen tools prior to Xen 4 do not provide a E820_Reserved area for guest usage.
- * On these old tools the shared info page will be placed in E820_Ram.
- * Xen 4 provides a E820_Reserved area at 0xFC000000, and this code expects
- * that nothing is mapped up to HVM_SHARED_INFO_ADDR.
- * Xen 4.3+ provides an explicit 1MB area at HVM_SHARED_INFO_ADDR which is used
- * here for the shared info page. */
-static void __init xen_hvm_init_shared_info(void)
-{
- if (xen_major < 4) {
- xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
- xen_hvm_sip_phys = __pa(xen_hvm_shared_info);
- } else {
- xen_hvm_sip_phys = HVM_SHARED_INFO_ADDR;
- set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_hvm_sip_phys);
- xen_hvm_shared_info =
- (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
}
- xen_hvm_connect_shared_info(xen_hvm_sip_phys >> PAGE_SHIFT);
- xen_hvm_set_shared_info(xen_hvm_shared_info);
}
+#ifdef CONFIG_XEN_PVHVM
static void __init init_hvm_pv_info(void)
{
- uint32_t ecx, edx, pages, msr, base;
+ int major, minor;
+ uint32_t eax, ebx, ecx, edx, pages, msr, base;
u64 pfn;
base = xen_cpuid_base();
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ major = eax >> 16;
+ minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
cpuid(base + 2, &pages, &msr, &ecx, &edx);
pfn = __pa(hypercall_page);
@@ -1595,15 +1786,17 @@ static void __init init_hvm_pv_info(void)
xen_domain_type = XEN_HVM_DOMAIN;
}
-static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
+ void *hcpu)
{
int cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
xen_vcpu_setup(cpu);
- if (xen_have_vector_callback)
- xen_init_lock_cpu(cpu);
+ if (xen_have_vector_callback) {
+ if (xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_setup_timer(cpu);
+ }
break;
default:
break;
@@ -1611,7 +1804,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
+static struct notifier_block xen_hvm_cpu_notifier = {
.notifier_call = xen_hvm_cpu_notify,
};
@@ -1621,6 +1814,8 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_shared_info();
+ xen_panic_handler_init();
+
if (xen_feature(XENFEAT_hvm_callback_vector))
xen_have_vector_callback = 1;
xen_hvm_smp_init();
@@ -1631,25 +1826,12 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_mmu_ops();
}
-static bool __init xen_hvm_platform(void)
+static uint32_t __init xen_hvm_platform(void)
{
- uint32_t eax, ebx, ecx, edx, base;
-
if (xen_pv_domain())
- return false;
-
- base = xen_cpuid_base();
- if (!base)
- return false;
-
- cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
- xen_major = eax >> 16;
- xen_minor = eax & 0xffff;
-
- printk(KERN_INFO "Xen version %d.%d.\n", xen_major, xen_minor);
+ return 0;
- return true;
+ return xen_cpuid_base();
}
bool xen_hvm_need_lapic(void)
@@ -1668,6 +1850,7 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
.name = "Xen HVM",
.detect = xen_hvm_platform,
.init_platform = xen_hvm_guest_init,
+ .x2apic_available = xen_x2apic_para_available,
};
EXPORT_SYMBOL(x86_hyper_xen_hvm);
#endif
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 3a5f55d5190..ebfa9b2c871 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -36,92 +36,190 @@
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
+#include <xen/xen.h>
#include <asm/pgtable.h>
-static int map_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+static struct gnttab_vm_area {
+ struct vm_struct *area;
+ pte_t **ptes;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ void **__shared)
{
- unsigned long **frames = (unsigned long **)data;
+ void *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_shared_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
return 0;
}
-/*
- * This function is used to map shared frames to store grant status. It is
- * different from map_pte_fn above, the frames type here is uint64_t.
- */
-static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
{
- uint64_t **frames = (uint64_t **)data;
+ grant_status_t *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_status_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
return 0;
}
-static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
+{
+ pte_t **ptes;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == gnttab_status_vm_area.area->addr)
+ ptes = gnttab_status_vm_area.ptes;
+ else
+ ptes = gnttab_shared_vm_area.ptes;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, ptes[i], __pte(0));
+ addr += PAGE_SIZE;
+ }
+}
+
+static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
{
+ area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+ if (area->ptes == NULL)
+ return -ENOMEM;
+
+ area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
+ if (area->area == NULL) {
+ kfree(area->ptes);
+ return -ENOMEM;
+ }
- set_pte_at(&init_mm, addr, pte, __pte(0));
return 0;
}
-int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- void **__shared)
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
{
- int rc;
- void *shared = *__shared;
+ free_vm_area(area->area);
+ kfree(area->ptes);
+}
- if (shared == NULL) {
- struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
- }
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
+{
+ int ret;
+
+ if (!xen_pv_domain())
+ return 0;
+
+ ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Always allocate the space for the status frames in case
+ * we're migrated to a host with V2 support.
+ */
+ ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+ if (ret < 0)
+ goto err;
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn, &frames);
- return rc;
+ return 0;
+ err:
+ arch_gnttab_vfree(&gnttab_shared_vm_area);
+ return -ENOMEM;
}
-int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- grant_status_t **__shared)
+#ifdef CONFIG_XEN_PVH
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <linux/slab.h>
+static int __init xlated_setup_gnttab_pages(void)
{
+ struct page **pages;
+ xen_pfn_t *pfns;
int rc;
- grant_status_t *shared = *__shared;
-
- if (shared == NULL) {
- /* No need to pass in PTE as we are going to do it
- * in apply_to_page_range anyhow. */
- struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
+ unsigned int i;
+ unsigned long nr_grant_frames = gnttab_max_grant_frames();
+
+ BUG_ON(nr_grant_frames == 0);
+ pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL);
+ if (!pfns) {
+ kfree(pages);
+ return -ENOMEM;
}
+ rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
+ if (rc) {
+ pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
+ nr_grant_frames, rc);
+ kfree(pages);
+ kfree(pfns);
+ return rc;
+ }
+ for (i = 0; i < nr_grant_frames; i++)
+ pfns[i] = page_to_pfn(pages[i]);
+
+ rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
+ &xen_auto_xlat_grant_frames.vaddr);
+
+ if (rc) {
+ pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
+ nr_grant_frames, rc);
+ free_xenballooned_pages(nr_grant_frames, pages);
+ kfree(pages);
+ kfree(pfns);
+ return rc;
+ }
+ kfree(pages);
+
+ xen_auto_xlat_grant_frames.pfn = pfns;
+ xen_auto_xlat_grant_frames.count = nr_grant_frames;
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn_status, &frames);
- return rc;
+ return 0;
}
-void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
+static int __init xen_pvh_gnttab_setup(void)
{
- apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
+ if (!xen_pvh_domain())
+ return -ENODEV;
+
+ return xlated_setup_gnttab_pages();
}
+/* Call it _before_ __gnttab_init as we need to initialize the
+ * xen_auto_xlat_grant_frames first. */
+core_initcall(xen_pvh_gnttab_setup);
+#endif
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 01a4dc015ae..a1207cb6472 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -5,6 +5,7 @@
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/vcpu.h>
+#include <xen/features.h>
#include <xen/events.h>
#include <asm/xen/hypercall.h>
@@ -22,7 +23,7 @@ void xen_force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
-static unsigned long xen_save_fl(void)
+asmlinkage __visible unsigned long xen_save_fl(void)
{
struct vcpu_info *vcpu;
unsigned long flags;
@@ -40,34 +41,29 @@ static unsigned long xen_save_fl(void)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
-static void xen_restore_fl(unsigned long flags)
+__visible void xen_restore_fl(unsigned long flags)
{
struct vcpu_info *vcpu;
/* convert from IF type flag */
flags = !(flags & X86_EFLAGS_IF);
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
+ /* See xen_irq_enable() for why preemption must be disabled. */
preempt_disable();
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
- preempt_enable_no_resched();
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
if (flags == 0) {
- preempt_check_resched();
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
- }
+ preempt_enable();
+ } else
+ preempt_enable_no_resched();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
-static void xen_irq_disable(void)
+asmlinkage __visible void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
make sure we're don't switch CPUs between getting the vcpu
@@ -78,14 +74,16 @@ static void xen_irq_disable(void)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
-static void xen_irq_enable(void)
+asmlinkage __visible void xen_irq_enable(void)
{
struct vcpu_info *vcpu;
- /* We don't need to worry about being preempted here, since
- either a) interrupts are disabled, so no preemption, or b)
- the caller is confused and is trying to re-enable interrupts
- on an indeterminate processor. */
+ /*
+ * We may be preempted as soon as vcpu->evtchn_upcall_mask is
+ * cleared, so disable preemption to ensure we check for
+ * events on the VCPU we are still running on.
+ */
+ preempt_disable();
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
@@ -96,6 +94,8 @@ static void xen_irq_enable(void)
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
+
+ preempt_enable();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
@@ -129,6 +129,8 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
void __init xen_init_irq_ops(void)
{
- pv_irq_ops = xen_irq_ops;
+ /* For PVH we use default pv_irq_ops settings. */
+ if (!xen_feature(XENFEAT_hvm_callback_vector))
+ pv_irq_ops = xen_irq_ops;
x86_init.irqs.intr_init = xen_init_IRQ;
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 01de35c7722..e8a1201c329 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -431,7 +431,7 @@ static pteval_t iomap_pte(pteval_t val)
return val;
}
-static pteval_t xen_pte_val(pte_t pte)
+__visible pteval_t xen_pte_val(pte_t pte)
{
pteval_t pteval = pte.pte;
#if 0
@@ -448,7 +448,7 @@ static pteval_t xen_pte_val(pte_t pte)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
-static pgdval_t xen_pgd_val(pgd_t pgd)
+__visible pgdval_t xen_pgd_val(pgd_t pgd)
{
return pte_mfn_to_pfn(pgd.pgd);
}
@@ -468,8 +468,8 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
* 3 PCD PWT UC UC UC
* 4 PAT WB WC WB
* 5 PAT PWT WC WP WT
- * 6 PAT PCD UC- UC UC-
- * 7 PAT PCD PWT UC UC UC
+ * 6 PAT PCD UC- rsv UC-
+ * 7 PAT PCD PWT UC rsv UC
*/
void xen_set_pat(u64 pat)
@@ -479,7 +479,7 @@ void xen_set_pat(u64 pat)
WARN_ON(pat != 0x0007010600070106ull);
}
-static pte_t xen_make_pte(pteval_t pte)
+__visible pte_t xen_make_pte(pteval_t pte)
{
phys_addr_t addr = (pte & PTE_PFN_MASK);
#if 0
@@ -514,14 +514,14 @@ static pte_t xen_make_pte(pteval_t pte)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
-static pgd_t xen_make_pgd(pgdval_t pgd)
+__visible pgd_t xen_make_pgd(pgdval_t pgd)
{
pgd = pte_pfn_to_mfn(pgd);
return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
-static pmdval_t xen_pmd_val(pmd_t pmd)
+__visible pmdval_t xen_pmd_val(pmd_t pmd)
{
return pte_mfn_to_pfn(pmd.pmd);
}
@@ -580,7 +580,7 @@ static void xen_pmd_clear(pmd_t *pmdp)
}
#endif /* CONFIG_X86_PAE */
-static pmd_t xen_make_pmd(pmdval_t pmd)
+__visible pmd_t xen_make_pmd(pmdval_t pmd)
{
pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
@@ -588,13 +588,13 @@ static pmd_t xen_make_pmd(pmdval_t pmd)
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
#if PAGETABLE_LEVELS == 4
-static pudval_t xen_pud_val(pud_t pud)
+__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
-static pud_t xen_make_pud(pudval_t pud)
+__visible pud_t xen_make_pud(pudval_t pud)
{
pud = pte_pfn_to_mfn(pud);
@@ -796,8 +796,8 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
spinlock_t *ptl = NULL;
-#if USE_SPLIT_PTLOCKS
- ptl = __pte_lockptr(page);
+#if USE_SPLIT_PTE_PTLOCKS
+ ptl = ptlock_ptr(page);
spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
@@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void);
-static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
-{
- /* reserve the range used */
- native_pagetable_reserve(start, end);
-
- /* set as RW the rest */
- printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
- PFN_PHYS(pgt_buf_top));
- while (end < PFN_PHYS(pgt_buf_top)) {
- make_lowmem_page_readwrite(__va(end));
- end += PAGE_SIZE;
- }
-}
-
#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1212,44 +1198,40 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
* instead of somewhere later and be confusing. */
xen_mc_flush();
}
-#endif
-static void __init xen_pagetable_init(void)
+static void __init xen_pagetable_p2m_copy(void)
{
-#ifdef CONFIG_X86_64
unsigned long size;
unsigned long addr;
-#endif
- paging_init();
- xen_setup_shared_info();
-#ifdef CONFIG_X86_64
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- unsigned long new_mfn_list;
-
- size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
-
- /* On 32-bit, we get zero so this never gets executed. */
- new_mfn_list = xen_revector_p2m_tree();
- if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) {
- /* using __ka address and sticking INVALID_P2M_ENTRY! */
- memset((void *)xen_start_info->mfn_list, 0xff, size);
-
- /* We should be in __ka space. */
- BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
- addr = xen_start_info->mfn_list;
- /* We roundup to the PMD, which means that if anybody at this stage is
- * using the __ka address of xen_start_info or xen_start_info->shared_info
- * they are in going to crash. Fortunatly we have already revectored
- * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
- size = roundup(size, PMD_SIZE);
- xen_cleanhighmap(addr, addr + size);
-
- size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
- memblock_free(__pa(xen_start_info->mfn_list), size);
- /* And revector! Bye bye old array */
- xen_start_info->mfn_list = new_mfn_list;
- } else
- goto skip;
- }
+ unsigned long new_mfn_list;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+
+ new_mfn_list = xen_revector_p2m_tree();
+ /* No memory or already called. */
+ if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
+ return;
+
+ /* using __ka address and sticking INVALID_P2M_ENTRY! */
+ memset((void *)xen_start_info->mfn_list, 0xff, size);
+
+ /* We should be in __ka space. */
+ BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
+ addr = xen_start_info->mfn_list;
+ /* We roundup to the PMD, which means that if anybody at this stage is
+ * using the __ka address of xen_start_info or xen_start_info->shared_info
+ * they are in going to crash. Fortunatly we have already revectored
+ * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+ size = roundup(size, PMD_SIZE);
+ xen_cleanhighmap(addr, addr + size);
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ memblock_free(__pa(xen_start_info->mfn_list), size);
+ /* And revector! Bye bye old array */
+ xen_start_info->mfn_list = new_mfn_list;
+
/* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of
* the ramdisk). We continue on, erasing PMD entries that point to page
@@ -1269,7 +1251,15 @@ static void __init xen_pagetable_init(void)
* anything at this stage. */
xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
#endif
-skip:
+}
+#endif
+
+static void __init xen_pagetable_init(void)
+{
+ paging_init();
+ xen_setup_shared_info();
+#ifdef CONFIG_X86_64
+ xen_pagetable_p2m_copy();
#endif
xen_post_allocator_init();
}
@@ -1422,7 +1412,6 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
xen_mc_callback(set_current_cr3, (void *)cr3);
}
}
-
static void xen_write_cr3(unsigned long cr3)
{
BUG_ON(preemptible());
@@ -1448,6 +1437,43 @@ static void xen_write_cr3(unsigned long cr3)
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
+#ifdef CONFIG_X86_64
+/*
+ * At the start of the day - when Xen launches a guest, it has already
+ * built pagetables for the guest. We diligently look over them
+ * in xen_setup_kernel_pagetable and graft as appropiate them in the
+ * init_level4_pgt and its friends. Then when we are happy we load
+ * the new init_level4_pgt - and continue on.
+ *
+ * The generic code starts (start_kernel) and 'init_mem_mapping' sets
+ * up the rest of the pagetables. When it has completed it loads the cr3.
+ * N.B. that baremetal would start at 'start_kernel' (and the early
+ * #PF handler would create bootstrap pagetables) - so we are running
+ * with the same assumptions as what to do when write_cr3 is executed
+ * at this point.
+ *
+ * Since there are no user-page tables at all, we have two variants
+ * of xen_write_cr3 - the early bootup (this one), and the late one
+ * (xen_write_cr3). The reason we have to do that is that in 64-bit
+ * the Linux kernel and user-space are both in ring 3 while the
+ * hypervisor is in ring 0.
+ */
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ this_cpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+}
+#endif
+
static int xen_pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd = mm->pgd;
@@ -1468,7 +1494,7 @@ static int xen_pgd_alloc(struct mm_struct *mm)
page->private = (unsigned long)user_pgd;
if (user_pgd != NULL) {
- user_pgd[pgd_index(VSYSCALL_START)] =
+ user_pgd[pgd_index(VSYSCALL_ADDR)] =
__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
ret = 0;
}
@@ -1503,19 +1529,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
#else /* CONFIG_X86_64 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
- unsigned long pfn = pte_pfn(pte);
-
- /*
- * If the new pfn is within the range of the newly allocated
- * kernel pagetable, and it isn't being mapped into an
- * early_ioremap fixmap slot as a freshly allocated page, make sure
- * it is RO.
- */
- if (((!is_early_ioremap_ptep(ptep) &&
- pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
- (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
- pte = pte_wrprotect(pte);
-
return pte;
}
#endif /* CONFIG_X86_64 */
@@ -1628,7 +1641,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
__set_pfn_prot(pfn, PAGE_KERNEL_RO);
- if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
xen_mc_issue(PARAVIRT_LAZY_MMU);
@@ -1662,7 +1675,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
if (!PageHighMem(page)) {
xen_mc_batch();
- if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
__set_pfn_prot(pfn, PAGE_KERNEL);
@@ -1739,14 +1752,22 @@ static void *m2v(phys_addr_t maddr)
}
/* Set the page permissions on an identity-mapped pages */
-static void set_page_prot(void *addr, pgprot_t prot)
+static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
pte_t pte = pfn_pte(pfn, prot);
- if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+ /* For PVH no need to set R/O or R/W to pin them or unpin them. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
BUG();
}
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+ return set_page_prot_flags(addr, prot, UVMF_NONE);
+}
#ifdef CONFIG_X86_32
static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
@@ -1830,12 +1851,12 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
unsigned long addr)
{
if (*pt_base == PFN_DOWN(__pa(addr))) {
- set_page_prot((void *)addr, PAGE_KERNEL);
+ set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
clear_page((void *)addr);
(*pt_base)++;
}
if (*pt_end == PFN_DOWN(__pa(addr))) {
- set_page_prot((void *)addr, PAGE_KERNEL);
+ set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
clear_page((void *)addr);
(*pt_end)--;
}
@@ -1850,6 +1871,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
* but that's enough to get __va working. We need to fill in the rest
* of the physical mapping once some sort of allocator has been set
* up.
+ * NOTE: for PVH, the page tables are native.
*/
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
@@ -1871,17 +1893,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Zap identity mapping */
init_level4_pgt[0] = __pgd(0);
- /* Pre-constructed entries are in pfn, so convert to mfn */
- /* L4[272] -> level3_ident_pgt
- * L4[511] -> level3_kernel_pgt */
- convert_pfn_mfn(init_level4_pgt);
-
- /* L3_i[0] -> level2_ident_pgt */
- convert_pfn_mfn(level3_ident_pgt);
- /* L3_k[510] -> level2_kernel_pgt
- * L3_i[511] -> level2_fixmap_pgt */
- convert_pfn_mfn(level3_kernel_pgt);
-
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ /* L4[272] -> level3_ident_pgt
+ * L4[511] -> level3_kernel_pgt */
+ convert_pfn_mfn(init_level4_pgt);
+
+ /* L3_i[0] -> level2_ident_pgt */
+ convert_pfn_mfn(level3_ident_pgt);
+ /* L3_k[510] -> level2_kernel_pgt
+ * L3_i[511] -> level2_fixmap_pgt */
+ convert_pfn_mfn(level3_kernel_pgt);
+ }
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
@@ -1905,31 +1928,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
copy_page(level2_fixmap_pgt, l2);
/* Note that we don't do anything with level1_fixmap_pgt which
* we don't need. */
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Make pagetable pieces RO */
+ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
- /* Make pagetable pieces RO */
- set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
- set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
-
- /* Pin down new L4 */
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
- PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
- /* Unpin Xen-provided one */
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
- /*
- * At this stage there can be no user pgd, and no page
- * structure to attach it to, so make sure we just set kernel
- * pgd.
- */
- xen_mc_batch();
- __xen_write_cr3(true, __pa(init_level4_pgt));
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ /*
+ * At this stage there can be no user pgd, and no page
+ * structure to attach it to, so make sure we just set kernel
+ * pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(init_level4_pgt));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+ } else
+ native_write_cr3(__pa(init_level4_pgt));
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
* set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
@@ -2030,18 +2055,14 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
switch (idx) {
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-#ifdef CONFIG_X86_F00F_BUG
- case FIX_F00F_IDT:
-#endif
+ case FIX_RO_IDT:
#ifdef CONFIG_X86_32
case FIX_WP_TEST:
- case FIX_VDSO:
# ifdef CONFIG_HIGHMEM
case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
- case VVAR_PAGE:
+ case VSYSCALL_PAGE:
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
@@ -2082,8 +2103,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#ifdef CONFIG_X86_64
/* Replicate changes to map the vsyscall page into the user
pagetable vsyscall mapping. */
- if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) ||
- idx == VVAR_PAGE) {
+ if (idx == VSYSCALL_PAGE) {
unsigned long vaddr = __fix_to_virt(idx);
set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
}
@@ -2092,6 +2112,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
static void __init xen_post_allocator_init(void)
{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
@@ -2111,6 +2134,7 @@ static void __init xen_post_allocator_init(void)
#endif
#ifdef CONFIG_X86_64
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
xen_mark_init_mm_pinned();
@@ -2129,11 +2153,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.write_cr2 = xen_write_cr2,
.read_cr3 = xen_read_cr3,
-#ifdef CONFIG_X86_32
.write_cr3 = xen_write_cr3_init,
-#else
- .write_cr3 = xen_write_cr3,
-#endif
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
@@ -2190,6 +2210,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.lazy_mode = {
.enter = paravirt_enter_lazy_mmu,
.leave = xen_leave_lazy_mmu,
+ .flush = paravirt_flush_lazy_mmu,
},
.set_fixmap = xen_set_fixmap,
@@ -2197,8 +2218,16 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
void __init xen_init_mmu_ops(void)
{
- x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
x86_init.paging.pagetable_init = xen_pagetable_init;
+
+ /* Optimization - we can use the HVM one but it has no idea which
+ * VCPUs are descheduled - which means that it will needlessly IPI
+ * them. Xen knows so let it do the job.
+ */
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+ return;
+ }
pv_mmu_ops = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
@@ -2320,12 +2349,14 @@ static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
return success;
}
-int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
- unsigned int address_bits)
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+ unsigned int address_bits,
+ dma_addr_t *dma_handle)
{
unsigned long *in_frames = discontig_frames, out_frame;
unsigned long flags;
int success;
+ unsigned long vstart = (unsigned long)phys_to_virt(pstart);
/*
* Currently an auto-translated guest will not perform I/O, nor will
@@ -2360,15 +2391,17 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
spin_unlock_irqrestore(&xen_reservation_lock, flags);
+ *dma_handle = virt_to_machine(vstart).maddr;
return success ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
-void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
{
unsigned long *out_frames = discontig_frames, in_frame;
unsigned long flags;
int success;
+ unsigned long vstart;
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
@@ -2376,6 +2409,7 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
if (unlikely(order > MAX_CONTIG_ORDER))
return;
+ vstart = (unsigned long)phys_to_virt(pstart);
memset((void *) vstart, 0, PAGE_SIZE << order);
spin_lock_irqsave(&xen_reservation_lock, flags);
@@ -2474,6 +2508,95 @@ void __init xen_hvm_init_mmu_ops(void)
}
#endif
+#ifdef CONFIG_XEN_PVH
+/*
+ * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
+ * space creating new guest on pvh dom0 and needing to map domU pages.
+ */
+static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
+ unsigned int domid)
+{
+ int rc, err = 0;
+ xen_pfn_t gpfn = lpfn;
+ xen_ulong_t idx = fgfn;
+
+ struct xen_add_to_physmap_range xatp = {
+ .domid = DOMID_SELF,
+ .foreign_domid = domid,
+ .size = 1,
+ .space = XENMAPSPACE_gmfn_foreign,
+ };
+ set_xen_guest_handle(xatp.idxs, &idx);
+ set_xen_guest_handle(xatp.gpfns, &gpfn);
+ set_xen_guest_handle(xatp.errs, &err);
+
+ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+ if (rc < 0)
+ return rc;
+ return err;
+}
+
+static int xlate_remove_from_p2m(unsigned long spfn, int count)
+{
+ struct xen_remove_from_physmap xrp;
+ int i, rc;
+
+ for (i = 0; i < count; i++) {
+ xrp.domid = DOMID_SELF;
+ xrp.gpfn = spfn+i;
+ rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+ if (rc)
+ break;
+ }
+ return rc;
+}
+
+struct xlate_remap_data {
+ unsigned long fgfn; /* foreign domain's gfn */
+ pgprot_t prot;
+ domid_t domid;
+ int index;
+ struct page **pages;
+};
+
+static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+ void *data)
+{
+ int rc;
+ struct xlate_remap_data *remap = data;
+ unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
+ pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
+
+ rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
+ if (rc)
+ return rc;
+ native_set_pte(ptep, pteval);
+
+ return 0;
+}
+
+static int xlate_remap_gfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long mfn,
+ int nr, pgprot_t prot, unsigned domid,
+ struct page **pages)
+{
+ int err;
+ struct xlate_remap_data pvhdata;
+
+ BUG_ON(!pages);
+
+ pvhdata.fgfn = mfn;
+ pvhdata.prot = prot;
+ pvhdata.domid = domid;
+ pvhdata.index = 0;
+ pvhdata.pages = pages;
+ err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
+ xlate_map_pte_fn, &pvhdata);
+ flush_tlb_all();
+ return err;
+}
+#endif
+
#define REMAP_BATCH_SIZE 16
struct remap_data {
@@ -2486,7 +2609,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
unsigned long addr, void *data)
{
struct remap_data *rmd = data;
- pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+ pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
rmd->mmu_update->val = pte_val_ma(pte);
@@ -2508,13 +2631,18 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
unsigned long range;
int err = 0;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return -EINVAL;
-
- prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
-
BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+ if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef CONFIG_XEN_PVH
+ /* We need to update the local page tables and the xen HAP */
+ return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
+ domid, pages);
+#else
+ return -EINVAL;
+#endif
+ }
+
rmd.mfn = mfn;
rmd.prot = prot;
@@ -2552,6 +2680,25 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
return 0;
+#ifdef CONFIG_XEN_PVH
+ while (numpgs--) {
+ /*
+ * The mmu has already cleaned up the process mmu
+ * resources at this point (lookup_address will return
+ * NULL).
+ */
+ unsigned long pfn = page_to_pfn(pages[numpgs]);
+
+ xlate_remove_from_p2m(pfn, 1);
+ }
+ /*
+ * We don't need to flush tlbs because as part of
+ * xlate_remove_from_p2m, the hypervisor will do tlb flushes
+ * after removing the p2m entries from the EPT/NPT
+ */
+ return 0;
+#else
return -EINVAL;
+#endif
}
EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 95fb2aa5927..9bb3d82ffec 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -36,7 +36,7 @@
* pfn_to_mfn(0xc0000)=0xc0000
*
* The benefit of this is, that we can assume for non-RAM regions (think
- * PCI BARs, or ACPI spaces), we can create mappings easily b/c we
+ * PCI BARs, or ACPI spaces), we can create mappings easily because we
* get the PFN value to match the MFN.
*
* For this to work efficiently we have one new page p2m_identity and
@@ -60,7 +60,7 @@
* There is also a digram of the P2M at the end that can help.
* Imagine your E820 looking as so:
*
- * 1GB 2GB
+ * 1GB 2GB 4GB
* /-------------------+---------\/----\ /----------\ /---+-----\
* | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
* \-------------------+---------/\----/ \----------/ \---+-----/
@@ -77,9 +77,8 @@
* of the PFN and the end PFN (263424 and 512256 respectively). The first step
* is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
* covers 512^2 of page estate (1GB) and in case the start or end PFN is not
- * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
- * to end pfn. We reserve_brk top leaf pages if they are missing (means they
- * point to p2m_mid_missing).
+ * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
+ * required to split any existing p2m_mid_missing middle pages.
*
* With the E820 example above, 263424 is not 1GB aligned so we allocate a
* reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
@@ -88,7 +87,7 @@
* Next stage is to determine if we need to do a more granular boundary check
* on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
* We check if the start pfn and end pfn violate that boundary check, and if
- * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
+ * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
* granularity of setting which PFNs are missing and which ones are identity.
* In our example 263424 and 512256 both fail the check so we reserve_brk two
* pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
@@ -102,9 +101,10 @@
*
* The next step is to walk from the start pfn to the end pfn setting
* the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
- * If we find that the middle leaf is pointing to p2m_missing we can swap it
- * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
- * point we do not need to worry about boundary aligment (so no need to
+ * If we find that the middle entry is pointing to p2m_missing we can swap it
+ * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
+ * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
+ * At this point we do not need to worry about boundary aligment (so no need to
* reserve_brk a middle page, figure out which PFNs are "missing" and which
* ones are identity), as that has been done earlier. If we find that the
* middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
@@ -118,6 +118,9 @@
* considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
* contain the INVALID_P2M_ENTRY value and are considered "missing."
*
+ * Finally, the region beyond the end of of the E820 (4 GB in this example)
+ * is set to be identity (in case there are MMIO regions placed here).
+ *
* This is what the p2m ends up looking (for the E820 above) with this
* fabulous drawing:
*
@@ -129,21 +132,27 @@
* |-----| \ | [p2m_identity]+\\ | .... |
* | 2 |--\ \-------------------->| ... | \\ \----------------/
* |-----| \ \---------------/ \\
- * | 3 |\ \ \\ p2m_identity
- * |-----| \ \-------------------->/---------------\ /-----------------\
- * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
- * \-----/ / | [p2m_identity]+-->| ..., ~0 |
- * / /---------------\ | .... | \-----------------/
- * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
- * / | IDENTITY[@256]|<----/ \---------------/
- * / | ~0, ~0, .... |
- * | \---------------/
- * |
- * p2m_mid_missing p2m_missing
- * /-----------------\ /------------\
- * | [p2m_missing] +---->| ~0, ~0, ~0 |
- * | [p2m_missing] +---->| ..., ~0 |
- * \-----------------/ \------------/
+ * | 3 |-\ \ \\ p2m_identity [1]
+ * |-----| \ \-------------------->/---------------\ /-----------------\
+ * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
+ * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
+ * | | | .... | \-----------------/
+ * | | +-[x], ~0, ~0.. +\
+ * | | \---------------/ \
+ * | | \-> /---------------\
+ * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
+ * | /-----------------\ /------------\ | IDENTITY[@256]|
+ * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
+ * | | [p2m_missing] +---->| ..., ~0 | \---------------/
+ * | | ... | \------------/
+ * | \-----------------/
+ * |
+ * | p2m_mid_identity
+ * | /-----------------\
+ * \-->| [p2m_identity] +---->[1]
+ * | [p2m_identity] +---->[1]
+ * | ... |
+ * \-----------------/
*
* where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
*/
@@ -161,6 +170,7 @@
#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <xen/balloon.h>
#include <xen/grant_table.h>
#include "multicalls.h"
@@ -186,13 +196,15 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
/* We might hit two boundary violations at the start and end, at max each
* boundary violation will require three middle nodes. */
-RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
+RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
/* When we populate back during bootup, the amount of pages can vary. The
* max we have is seen is 395979, but that does not mean it can't be more.
@@ -241,20 +253,20 @@ static void p2m_top_mfn_p_init(unsigned long **top)
top[i] = p2m_mid_missing_mfn;
}
-static void p2m_mid_init(unsigned long **mid)
+static void p2m_mid_init(unsigned long **mid, unsigned long *leaf)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = p2m_missing;
+ mid[i] = leaf;
}
-static void p2m_mid_mfn_init(unsigned long *mid)
+static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(p2m_missing);
+ mid[i] = virt_to_mfn(leaf);
}
static void p2m_init(unsigned long *p2m)
@@ -279,10 +291,15 @@ void __ref xen_build_mfn_list_list(void)
{
unsigned long pfn;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_p_init(p2m_top_mfn_p);
@@ -291,7 +308,8 @@ void __ref xen_build_mfn_list_list(void)
p2m_top_mfn_init(p2m_top_mfn);
} else {
/* Reinitialise, mfn's all change after migration */
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
}
for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
@@ -323,7 +341,7 @@ void __ref xen_build_mfn_list_list(void)
* it too late.
*/
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
}
@@ -335,6 +353,9 @@ void __ref xen_build_mfn_list_list(void)
void xen_setup_mfn_list_list(void)
{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
@@ -345,24 +366,30 @@ void xen_setup_mfn_list_list(void)
/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
- unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
- unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+ unsigned long *mfn_list;
+ unsigned long max_pfn;
unsigned long pfn;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ mfn_list = (unsigned long *)xen_start_info->mfn_list;
+ max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
xen_max_p2m_pfn = max_pfn;
p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_init(p2m_missing);
+ p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_identity);
p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing);
+ p2m_mid_init(p2m_mid_missing, p2m_missing);
+ p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_identity, p2m_identity);
p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_init(p2m_top);
- p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_identity);
-
/*
* The domain builder gives us a pre-constructed p2m array in
* mfn_list for all the pages initially given to us, so we just
@@ -374,7 +401,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
if (p2m_top[topidx] == p2m_mid_missing) {
unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
p2m_top[topidx] = mid;
}
@@ -480,7 +507,7 @@ unsigned long get_phys_to_machine(unsigned long pfn)
unsigned topidx, mididx, idx;
if (unlikely(pfn >= MAX_P2M_PFN))
- return INVALID_P2M_ENTRY;
+ return IDENTITY_FRAME(pfn);
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
@@ -533,7 +560,7 @@ static bool alloc_p2m(unsigned long pfn)
if (!mid)
return false;
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
free_p2m_page(mid);
@@ -553,7 +580,7 @@ static bool alloc_p2m(unsigned long pfn)
if (!mid_mfn)
return false;
- p2m_mid_mfn_init(mid_mfn);
+ p2m_mid_mfn_init(mid_mfn, p2m_missing);
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
@@ -584,7 +611,7 @@ static bool alloc_p2m(unsigned long pfn)
return true;
}
-static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary)
+static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
{
unsigned topidx, mididx, idx;
unsigned long *p2m;
@@ -626,7 +653,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary
return true;
}
-static bool __init early_alloc_p2m(unsigned long pfn)
+static bool __init early_alloc_p2m_middle(unsigned long pfn)
{
unsigned topidx = p2m_top_index(pfn);
unsigned long *mid_mfn_p;
@@ -637,7 +664,7 @@ static bool __init early_alloc_p2m(unsigned long pfn)
if (mid == p2m_mid_missing) {
mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
+ p2m_mid_init(mid, p2m_missing);
p2m_top[topidx] = mid;
@@ -646,12 +673,12 @@ static bool __init early_alloc_p2m(unsigned long pfn)
/* And the save/restore P2M tables.. */
if (mid_mfn_p == p2m_mid_missing_mfn) {
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
/* Note: we don't set mid_mfn_p[midix] here,
- * look in early_alloc_p2m_middle */
+ * look in early_alloc_p2m() */
}
return true;
}
@@ -727,7 +754,7 @@ found:
/* This shouldn't happen */
if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
- early_alloc_p2m(set_pfn);
+ early_alloc_p2m_middle(set_pfn);
if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
return false;
@@ -742,13 +769,13 @@ found:
bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!early_alloc_p2m(pfn))
+ if (!early_alloc_p2m_middle(pfn))
return false;
if (early_can_reuse_p2m_middle(pfn, mfn))
return __set_phys_to_machine(pfn, mfn);
- if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
+ if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
return false;
if (!__set_phys_to_machine(pfn, mfn))
@@ -757,12 +784,30 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
+
+static void __init early_split_p2m(unsigned long pfn)
+{
+ unsigned long mididx, idx;
+
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /*
+ * Allocate new middle and leaf pages if this pfn lies in the
+ * middle of one.
+ */
+ if (mididx || idx)
+ early_alloc_p2m_middle(pfn);
+ if (idx)
+ early_alloc_p2m(pfn, false);
+}
+
unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
{
unsigned long pfn;
- if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
+ if (unlikely(pfn_s >= MAX_P2M_PFN))
return 0;
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
@@ -771,19 +816,30 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
if (pfn_s > pfn_e)
return 0;
- for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
- pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
- pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
- {
- WARN_ON(!early_alloc_p2m(pfn));
- }
+ if (pfn_e > MAX_P2M_PFN)
+ pfn_e = MAX_P2M_PFN;
- early_alloc_p2m_middle(pfn_s, true);
- early_alloc_p2m_middle(pfn_e, true);
+ early_split_p2m(pfn_s);
+ early_split_p2m(pfn_e);
+
+ for (pfn = pfn_s; pfn < pfn_e;) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
- for (pfn = pfn_s; pfn < pfn_e; pfn++)
if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
break;
+ pfn++;
+
+ /*
+ * If the PFN was set to a middle or leaf identity
+ * page the remainder must also be identity, so skip
+ * ahead to the next middle or leaf entry.
+ */
+ if (p2m_top[topidx] == p2m_mid_identity)
+ pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
+ else if (p2m_top[topidx][mididx] == p2m_identity)
+ pfn = ALIGN(pfn, P2M_PER_PAGE);
+ }
if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
@@ -798,10 +854,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
unsigned topidx, mididx, idx;
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ /* don't track P2M changes in autotranslate guests */
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
return true;
- }
+
if (unlikely(pfn >= MAX_P2M_PFN)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
@@ -813,8 +869,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
/* For sparse holes were the p2m leaf has real PFN along with
* PCI holes, stick in the PFN as the MFN value.
+ *
+ * set_phys_range_identity() will have allocated new middle
+ * and leaf pages as required so an existing p2m_mid_missing
+ * or p2m_missing mean that whole range will be identity so
+ * these can be switched to p2m_mid_identity or p2m_identity.
*/
if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+ if (p2m_top[topidx] == p2m_mid_identity)
+ return true;
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
+ p2m_mid_identity) != p2m_mid_missing);
+ return true;
+ }
+
if (p2m_top[topidx][mididx] == p2m_identity)
return true;
@@ -869,6 +939,65 @@ static unsigned long mfn_hash(unsigned long mfn)
return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}
+int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret = 0;
+ bool lazy = false;
+ pte_t *pte;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (kmap_ops &&
+ !in_interrupt() &&
+ paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+ arch_enter_lazy_mmu_mode();
+ lazy = true;
+ }
+
+ for (i = 0; i < count; i++) {
+ unsigned long mfn, pfn;
+
+ /* Do not add to override if the map failed. */
+ if (map_ops[i].status)
+ continue;
+
+ if (map_ops[i].flags & GNTMAP_contains_pte) {
+ pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ mfn = pte_mfn(*pte);
+ } else {
+ mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
+ }
+ pfn = page_to_pfn(pages[i]);
+
+ WARN_ON(PagePrivate(pages[i]));
+ SetPagePrivate(pages[i]);
+ set_page_private(pages[i], mfn);
+ pages[i]->index = pfn_to_mfn(pfn);
+
+ if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (kmap_ops) {
+ ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
+ if (ret)
+ goto out;
+ }
+ }
+
+out:
+ if (lazy)
+ arch_leave_lazy_mmu_mode();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
+
/* Add an MFN override for a particular page */
int m2p_add_override(unsigned long mfn, struct page *page,
struct gnttab_map_grant_ref *kmap_op)
@@ -878,7 +1007,6 @@ int m2p_add_override(unsigned long mfn, struct page *page,
unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
- int ret = 0;
pfn = page_to_pfn(page);
if (!PageHighMem(page)) {
@@ -888,13 +1016,6 @@ int m2p_add_override(unsigned long mfn, struct page *page,
"m2p_add_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
- WARN_ON(PagePrivate(page));
- SetPagePrivate(page);
- set_page_private(page, mfn);
- page->index = pfn_to_mfn(pfn);
-
- if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
- return -ENOMEM;
if (kmap_op != NULL) {
if (!PageHighMem(page)) {
@@ -925,28 +1046,69 @@ int m2p_add_override(unsigned long mfn, struct page *page,
* frontend pages while they are being shared with the backend,
* because mfn_to_pfn (that ends up being called by GUPF) will
* return the backend pfn rather than the frontend pfn. */
- ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
- if (ret == 0 && get_phys_to_machine(pfn) == mfn)
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) == mfn)
set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
return 0;
}
EXPORT_SYMBOL_GPL(m2p_add_override);
+
+int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret = 0;
+ bool lazy = false;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (kmap_ops &&
+ !in_interrupt() &&
+ paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+ arch_enter_lazy_mmu_mode();
+ lazy = true;
+ }
+
+ for (i = 0; i < count; i++) {
+ unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i]));
+ unsigned long pfn = page_to_pfn(pages[i]);
+
+ if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ set_page_private(pages[i], INVALID_P2M_ENTRY);
+ WARN_ON(!PagePrivate(pages[i]));
+ ClearPagePrivate(pages[i]);
+ set_phys_to_machine(pfn, pages[i]->index);
+
+ if (kmap_ops)
+ ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
+ if (ret)
+ goto out;
+ }
+
+out:
+ if (lazy)
+ arch_leave_lazy_mmu_mode();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
+
int m2p_remove_override(struct page *page,
- struct gnttab_map_grant_ref *kmap_op)
+ struct gnttab_map_grant_ref *kmap_op,
+ unsigned long mfn)
{
unsigned long flags;
- unsigned long mfn;
unsigned long pfn;
unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
- int ret = 0;
pfn = page_to_pfn(page);
- mfn = get_phys_to_machine(pfn);
- if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
- return -EINVAL;
if (!PageHighMem(page)) {
address = (unsigned long)__va(pfn << PAGE_SHIFT);
@@ -960,14 +1122,14 @@ int m2p_remove_override(struct page *page,
spin_lock_irqsave(&m2p_override_lock, flags);
list_del(&page->lru);
spin_unlock_irqrestore(&m2p_override_lock, flags);
- WARN_ON(!PagePrivate(page));
- ClearPagePrivate(page);
- set_phys_to_machine(pfn, page->index);
if (kmap_op != NULL) {
if (!PageHighMem(page)) {
struct multicall_space mcs;
- struct gnttab_unmap_grant_ref *unmap_op;
+ struct gnttab_unmap_and_replace *unmap_op;
+ struct page *scratch_page = get_balloon_scratch_page();
+ unsigned long scratch_page_address = (unsigned long)
+ __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
/*
* It might be that we queued all the m2p grant table
@@ -986,25 +1148,31 @@ int m2p_remove_override(struct page *page,
printk(KERN_WARNING "m2p_remove_override: "
"pfn %lx mfn %lx, failed to modify kernel mappings",
pfn, mfn);
+ put_balloon_scratch_page();
return -1;
}
- mcs = xen_mc_entry(
- sizeof(struct gnttab_unmap_grant_ref));
+ xen_mc_batch();
+
+ mcs = __xen_mc_entry(
+ sizeof(struct gnttab_unmap_and_replace));
unmap_op = mcs.args;
unmap_op->host_addr = kmap_op->host_addr;
+ unmap_op->new_addr = scratch_page_address;
unmap_op->handle = kmap_op->handle;
- unmap_op->dev_bus_addr = 0;
MULTI_grant_table_op(mcs.mc,
- GNTTABOP_unmap_grant_ref, unmap_op, 1);
+ GNTTABOP_unmap_and_replace, unmap_op, 1);
+
+ mcs = __xen_mc_entry(0);
+ MULTI_update_va_mapping(mcs.mc, scratch_page_address,
+ pfn_pte(page_to_pfn(scratch_page),
+ PAGE_KERNEL_RO), 0);
xen_mc_issue(PARAVIRT_LAZY_MMU);
- set_pte_at(&init_mm, address, ptep,
- pfn_pte(pfn, PAGE_KERNEL));
- __flush_tlb_single(address);
kmap_op->host_addr = 0;
+ put_balloon_scratch_page();
}
}
@@ -1019,8 +1187,8 @@ int m2p_remove_override(struct page *page,
* the original pfn causes mfn_to_pfn(mfn) to return the frontend
* pfn again. */
mfn &= ~FOREIGN_FRAME_BIT;
- ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
- if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
m2p_find_override(mfn) == NULL)
set_phys_to_machine(pfn, mfn);
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index 969570491c3..0e98e5d241d 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -75,8 +75,10 @@ void __init pci_xen_swiotlb_init(void)
xen_swiotlb_init(1, true /* early */);
dma_ops = &xen_swiotlb_dma_ops;
+#ifdef CONFIG_PCI
/* Make sure ACS will be enabled */
pci_request_acs();
+#endif
}
}
@@ -92,8 +94,10 @@ int pci_xen_swiotlb_init_late(void)
return rc;
dma_ops = &xen_swiotlb_dma_ops;
+#ifdef CONFIG_PCI
/* Make sure ACS will be enabled */
pci_request_acs();
+#endif
return 0;
}
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0a7852483ff..a8261716d58 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -30,10 +30,9 @@
#define XEN_PLATFORM_ERR_PROTOCOL -2
#define XEN_PLATFORM_ERR_BLACKLIST -3
-/* store the value of xen_emul_unplug after the unplug is done */
-int xen_platform_pci_unplug;
-EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
#ifdef CONFIG_XEN_PVHVM
+/* store the value of xen_emul_unplug after the unplug is done */
+static int xen_platform_pci_unplug;
static int xen_emul_unplug;
static int check_platform_magic(void)
@@ -69,6 +68,80 @@ static int check_platform_magic(void)
return 0;
}
+bool xen_has_pv_devices()
+{
+ if (!xen_domain())
+ return false;
+
+ /* PV domains always have them. */
+ if (xen_pv_domain())
+ return true;
+
+ /* And user has xen_platform_pci=0 set in guest config as
+ * driver did not modify the value. */
+ if (xen_platform_pci_unplug == 0)
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
+ return true;
+
+ /* This is an odd one - we are going to run legacy
+ * and PV drivers at the same time. */
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+ return true;
+
+ /* And the caller has to follow with xen_pv_{disk,nic}_devices
+ * to be certain which driver can load. */
+ return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_devices);
+
+static bool __xen_has_pv_device(int state)
+{
+ /* HVM domains might or might not */
+ if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
+ return true;
+
+ return xen_has_pv_devices();
+}
+
+bool xen_has_pv_nic_devices(void)
+{
+ return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
+
+bool xen_has_pv_disk_devices(void)
+{
+ return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
+ XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
+
+/*
+ * This one is odd - it determines whether you want to run PV _and_
+ * legacy (IDE) drivers together. This combination is only possible
+ * under HVM.
+ */
+bool xen_has_pv_and_legacy_disk_devices(void)
+{
+ if (!xen_domain())
+ return false;
+
+ /* N.B. This is only ever used in HVM mode */
+ if (xen_pv_domain())
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
+
void xen_unplug_emulated_devices(void)
{
int r;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8971a26d21a..2e555163c2f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -33,6 +33,9 @@
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+#ifdef CONFIG_X86_64
+extern asmlinkage void nmi(void);
+#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
@@ -82,10 +85,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
- if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
+ if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
continue;
- WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
- pfn, mfn);
+ WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
+ pfn, mfn);
__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
@@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk(
unsigned long pfn;
/*
- * If the PFNs are currently mapped, the VA mapping also needs
- * to be updated to be 1:1.
+ * If the PFNs are currently mapped, clear the mappings
+ * (except for the ISA region which must be 1:1 mapped) to
+ * release the refcounts (in Xen) on the original frames.
*/
- for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
+ pte_t pte = __pte_ma(0);
+
+ if (pfn < PFN_UP(ISA_END_ADDRESS))
+ pte = mfn_pte(pfn, PAGE_KERNEL_IO);
+
(void)HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT),
- mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+ (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
+ }
if (start_pfn < nr_pages)
*released += xen_release_chunk(
@@ -313,6 +322,17 @@ static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
e820_add_region(start, end - start, type);
}
+void xen_ignore_unusable(struct e820entry *list, size_t map_size)
+{
+ struct e820entry *entry;
+ unsigned int i;
+
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ if (entry->type == E820_UNUSABLE)
+ entry->type = E820_RAM;
+ }
+}
+
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
@@ -353,6 +373,17 @@ char * __init xen_memory_setup(void)
}
BUG_ON(rc);
+ /*
+ * Xen won't allow a 1:1 mapping to be created to UNUSABLE
+ * regions, so if we're using the machine memory map leave the
+ * region as RAM as it is in the pseudo-physical map.
+ *
+ * UNUSABLE regions in domUs are not handled and will need
+ * a patch in the future.
+ */
+ if (xen_initial_domain())
+ xen_ignore_unusable(map, memmap.nr_entries);
+
/* Make sure the Xen-supplied memory map is well-ordered. */
sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
@@ -420,6 +451,15 @@ char * __init xen_memory_setup(void)
}
/*
+ * Set the rest as identity mapped, in case PCI BARs are
+ * located here.
+ *
+ * PFNs above MAX_P2M_PFN are considered identity mapped as
+ * well.
+ */
+ set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+
+ /*
* In domU, the ISA region is normal, usable memory, but we
* reserve ISA memory anyway because too many things poke
* about in there.
@@ -460,6 +500,35 @@ char * __init xen_memory_setup(void)
}
/*
+ * Machine specific memory setup for auto-translated guests.
+ */
+char * __init xen_auto_xlated_memory_setup(void)
+{
+ static struct e820entry map[E820MAX] __initdata;
+
+ struct xen_memory_map memmap;
+ int i;
+ int rc;
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc < 0)
+ panic("No memory map (%d)\n", rc);
+
+ sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+
+ for (i = 0; i < memmap.nr_entries; i++)
+ e820_add_region(map[i].addr, map[i].size, map[i].type);
+
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
+
+ return "Xen";
+}
+
+/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
@@ -467,15 +536,22 @@ char * __init xen_memory_setup(void)
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
+ /*
+ * This could be called before selected_vdso32 is initialized, so
+ * just fiddle with both possible images. vdso_image_32_syscall
+ * can't be selected, since it only exists on 64-bit systems.
+ */
u32 *mask;
- mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
+ mask = vdso_image_32_int80.data +
+ vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
- mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
+ mask = vdso_image_32_sysenter.data +
+ vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
-static int __cpuinit register_callback(unsigned type, const void *func)
+static int register_callback(unsigned type, const void *func)
{
struct callback_register callback = {
.type = type,
@@ -486,7 +562,7 @@ static int __cpuinit register_callback(unsigned type, const void *func)
return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
-void __cpuinit xen_enable_sysenter(void)
+void xen_enable_sysenter(void)
{
int ret;
unsigned sysenter_feature;
@@ -505,7 +581,7 @@ void __cpuinit xen_enable_sysenter(void)
setup_clear_cpu_cap(sysenter_feature);
}
-void __cpuinit xen_enable_syscall(void)
+void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
int ret;
@@ -526,16 +602,13 @@ void __cpuinit xen_enable_syscall(void)
#endif /* CONFIG_X86_64 */
}
-void __init xen_arch_setup(void)
+void __init xen_pvmmu_arch_setup(void)
{
- xen_panic_handler_init();
-
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_pae_extended_cr3);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_pae_extended_cr3);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
@@ -543,6 +616,14 @@ void __init xen_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
+}
+
+/* This function is not called for HVM domains */
+void __init xen_arch_setup(void)
+{
+ xen_panic_handler_init();
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ xen_pvmmu_arch_setup();
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
@@ -556,12 +637,9 @@ void __init xen_arch_setup(void)
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
/* Set up idle, making sure it calls safe_halt() pvop */
-#ifdef CONFIG_X86_32
- boot_cpu_data.hlt_works_ok = 1;
-#endif
disable_cpuidle();
disable_cpufreq();
- WARN_ON(set_pm_idle_to_default());
+ WARN_ON(xen_set_default_idle());
fiddle_vdso();
#ifdef CONFIG_NUMA
numa_off = 1;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 34bc4cee888..7005974c3ff 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/irq_work.h>
+#include <linux/tick.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
@@ -39,11 +40,15 @@
cpumask_var_t xen_cpu_initialized_map;
-static DEFINE_PER_CPU(int, xen_resched_irq);
-static DEFINE_PER_CPU(int, xen_callfunc_irq);
-static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
-static DEFINE_PER_CPU(int, xen_irq_work);
-static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
+struct xen_common_irq {
+ int irq;
+ char *name;
+};
+static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -60,7 +65,7 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-static void __cpuinit cpu_bringup(void)
+static void cpu_bringup(void)
{
int cpu;
@@ -68,9 +73,11 @@ static void __cpuinit cpu_bringup(void)
touch_softlockup_watchdog();
preempt_disable();
- xen_enable_sysenter();
- xen_enable_syscall();
-
+ /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
+ if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
+ xen_enable_sysenter();
+ xen_enable_syscall();
+ }
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
cpu_data(cpu).x86_max_cores = 1;
@@ -92,84 +99,128 @@ static void __cpuinit cpu_bringup(void)
wmb(); /* make sure everything is out */
}
-static void __cpuinit cpu_bringup_and_idle(void)
+/* Note: cpu parameter is only relevant for PVH */
+static void cpu_bringup_and_idle(int cpu)
{
+#ifdef CONFIG_X86_64
+ if (xen_feature(XENFEAT_auto_translated_physmap) &&
+ xen_feature(XENFEAT_supervisor_mode_kernel))
+ xen_pvh_secondary_vcpu_init(cpu);
+#endif
cpu_bringup();
- cpu_idle();
+ cpu_startup_entry(CPUHP_ONLINE);
}
+static void xen_smp_intr_free(unsigned int cpu)
+{
+ if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
+ per_cpu(xen_resched_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_resched_irq, cpu).name);
+ per_cpu(xen_resched_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL);
+ per_cpu(xen_callfunc_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_callfunc_irq, cpu).name);
+ per_cpu(xen_callfunc_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_debug_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL);
+ per_cpu(xen_debug_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_debug_irq, cpu).name);
+ per_cpu(xen_debug_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq,
+ NULL);
+ per_cpu(xen_callfuncsingle_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
+ }
+ if (xen_hvm_domain())
+ return;
+
+ if (per_cpu(xen_irq_work, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
+ per_cpu(xen_irq_work, cpu).irq = -1;
+ kfree(per_cpu(xen_irq_work, cpu).name);
+ per_cpu(xen_irq_work, cpu).name = NULL;
+ }
+};
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
- const char *resched_name, *callfunc_name, *debug_name;
+ char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(xen_resched_irq, cpu) = rc;
+ per_cpu(xen_resched_irq, cpu).irq = rc;
+ per_cpu(xen_resched_irq, cpu).name = resched_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(xen_callfunc_irq, cpu) = rc;
+ per_cpu(xen_callfunc_irq, cpu).irq = rc;
+ per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
- IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
+ IRQF_PERCPU | IRQF_NOBALANCING,
debug_name, NULL);
if (rc < 0)
goto fail;
- per_cpu(xen_debug_irq, cpu) = rc;
+ per_cpu(xen_debug_irq, cpu).irq = rc;
+ per_cpu(xen_debug_irq, cpu).name = debug_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(xen_callfuncsingle_irq, cpu) = rc;
+ per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
+ per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
+
+ /*
+ * The IRQ worker on PVHVM goes through the native path and uses the
+ * IPI mechanism.
+ */
+ if (xen_hvm_domain())
+ return 0;
callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
cpu,
xen_irq_work_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
- per_cpu(xen_irq_work, cpu) = rc;
+ per_cpu(xen_irq_work, cpu).irq = rc;
+ per_cpu(xen_irq_work, cpu).name = callfunc_name;
return 0;
fail:
- if (per_cpu(xen_resched_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
- if (per_cpu(xen_callfunc_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
- if (per_cpu(xen_debug_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
- if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
- NULL);
- if (per_cpu(xen_irq_work, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
-
+ xen_smp_intr_free(cpu);
return rc;
}
@@ -230,12 +281,31 @@ static void __init xen_smp_prepare_boot_cpu(void)
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
- /* We've switched to the "real" per-cpu gdt, so make sure the
- old memory can be recycled */
- make_lowmem_page_readwrite(xen_initial_gdt);
+ if (xen_pv_domain()) {
+ if (!xen_feature(XENFEAT_writable_page_tables))
+ /* We've switched to the "real" per-cpu gdt, so make
+ * sure the old memory can be recycled. */
+ make_lowmem_page_readwrite(xen_initial_gdt);
- xen_filter_cpu_maps();
- xen_setup_vcpu_info_placement();
+#ifdef CONFIG_X86_32
+ /*
+ * Xen starts us with XEN_FLAT_RING1_DS, but linux code
+ * expects __USER_DS
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
+
+ xen_filter_cpu_maps();
+ xen_setup_vcpu_info_placement();
+ }
+ /*
+ * The alternative logic (which patches the unlock/lock) runs before
+ * the smp bootup up code is activated. Hence we need to set this up
+ * the core kernel is being patched. Otherwise we will have only
+ * modules patched but not core code.
+ */
+ xen_init_spinlocks();
}
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
@@ -283,7 +353,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
set_cpu_present(cpu, true);
}
-static int __cpuinit
+static int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
@@ -299,50 +369,62 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
gdt = get_cpu_gdt_table(cpu);
- ctxt->flags = VGCF_IN_KERNEL;
- ctxt->user_regs.ds = __USER_DS;
- ctxt->user_regs.es = __USER_DS;
- ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
+ /* Note: PVH is not yet supported on x86_32. */
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
-#else
- ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
- ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
- xen_copy_trap_info(ctxt->trap_ctxt);
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.ss = __KERNEL_DS;
- ctxt->ldt_ents = 0;
+ xen_copy_trap_info(ctxt->trap_ctxt);
- BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+ ctxt->ldt_ents = 0;
- gdt_mfn = arbitrary_virt_to_mfn(gdt);
- make_lowmem_page_readonly(gdt);
- make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
+ BUG_ON((unsigned long)gdt & ~PAGE_MASK);
- ctxt->gdt_frames[0] = gdt_mfn;
- ctxt->gdt_ents = GDT_ENTRIES;
+ gdt_mfn = arbitrary_virt_to_mfn(gdt);
+ make_lowmem_page_readonly(gdt);
+ make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
- ctxt->user_regs.cs = __KERNEL_CS;
- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+ ctxt->gdt_frames[0] = gdt_mfn;
+ ctxt->gdt_ents = GDT_ENTRIES;
- ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.sp0;
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
- ctxt->event_callback_cs = __KERNEL_CS;
- ctxt->failsafe_callback_cs = __KERNEL_CS;
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
+#else
+ ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
- ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
- ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
-
- per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+ ctxt->event_callback_eip =
+ (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_eip =
+ (unsigned long)xen_failsafe_callback;
+ ctxt->user_regs.cs = __KERNEL_CS;
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+#ifdef CONFIG_X86_32
+ }
+#else
+ } else
+ /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
+ * %rdi having the cpu number - which means are passing in
+ * as the first parameter the cpu. Subtle!
+ */
+ ctxt->user_regs.rdi = cpu;
+#endif
+ ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
-
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
@@ -350,7 +432,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
}
-static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle)
+static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
{
int rc;
@@ -359,10 +441,11 @@ static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle)
irq_ctx_init(cpu);
#else
clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
-#endif
+
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
@@ -414,24 +497,27 @@ static int xen_cpu_disable(void)
static void xen_cpu_die(unsigned int cpu)
{
- while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
+ while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
- unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
+ xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
}
-static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
+static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
cpu_bringup();
+ /*
+ * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
+ * clears certain data that the cpu_idle loop (which called us
+ * and that we return from) expects. The only way to get that
+ * data back is to call:
+ */
+ tick_nohz_idle_enter();
}
#else /* !CONFIG_HOTPLUG_CPU */
@@ -522,6 +608,12 @@ static inline int xen_map_vector(int vector)
case IRQ_WORK_VECTOR:
xen_vector = XEN_IRQ_WORK_VECTOR;
break;
+#ifdef CONFIG_X86_64
+ case NMI_VECTOR:
+ case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */
+ xen_vector = XEN_NMI_VECTOR;
+ break;
+#endif
default:
xen_vector = -1;
printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
@@ -561,24 +653,22 @@ void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
{
unsigned cpu;
unsigned int this_cpu = smp_processor_id();
+ int xen_vector = xen_map_vector(vector);
- if (!(num_online_cpus() > 1))
+ if (!(num_online_cpus() > 1) || (xen_vector < 0))
return;
for_each_cpu_and(cpu, mask, cpu_online_mask) {
if (this_cpu == cpu)
continue;
- xen_smp_send_call_function_single_ipi(cpu);
+ xen_send_IPI_one(cpu, xen_vector);
}
}
void xen_send_IPI_allbutself(int vector)
{
- int xen_vector = xen_map_vector(vector);
-
- if (xen_vector >= 0)
- xen_send_IPI_mask_allbutself(cpu_online_mask, xen_vector);
+ xen_send_IPI_mask_allbutself(cpu_online_mask, vector);
}
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
@@ -632,7 +722,6 @@ void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
xen_fill_possible_map();
- xen_init_spinlocks();
}
static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
@@ -643,21 +732,33 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
xen_init_lock_cpu(0);
}
-static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
+static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int rc;
- rc = native_cpu_up(cpu, tidle);
- WARN_ON (xen_smp_intr_init(cpu));
+ /*
+ * xen_smp_intr_init() needs to run before native_cpu_up()
+ * so that IPI vectors are set up on the booting CPU before
+ * it is marked online in native_cpu_up().
+ */
+ rc = xen_smp_intr_init(cpu);
+ WARN_ON(rc);
+ if (!rc)
+ rc = native_cpu_up(cpu, tidle);
+
+ /*
+ * We must initialize the slowpath CPU kicker _after_ the native
+ * path has executed. If we initialized it before none of the
+ * unlocker IPI kicks would reach the booting CPU as the booting
+ * CPU had not set itself 'online' in cpu_online_mask. That mask
+ * is checked when IPIs are sent (on HVM at least).
+ */
+ xen_init_lock_cpu(cpu);
return rc;
}
static void xen_hvm_cpu_die(unsigned int cpu)
{
- unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
+ xen_cpu_die(cpu);
native_cpu_die(cpu);
}
@@ -671,4 +772,5 @@ void __init xen_hvm_smp_init(void)
smp_ops.cpu_die = xen_hvm_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+ smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
}
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index 8981a76d081..c7c2d89efd7 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -5,7 +5,6 @@ extern void xen_send_IPI_mask(const struct cpumask *mask,
extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
int vector);
extern void xen_send_IPI_allbutself(int vector);
-extern void physflat_send_IPI_allbutself(int vector);
extern void xen_send_IPI_all(int vector);
extern void xen_send_IPI_self(int vector);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 83e866d714c..0ba5f3b967f 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -7,6 +7,7 @@
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <linux/gfp.h>
+#include <linux/slab.h>
#include <asm/paravirt.h>
@@ -16,45 +17,44 @@
#include "xen-ops.h"
#include "debugfs.h"
-#ifdef CONFIG_XEN_DEBUG_FS
-static struct xen_spinlock_stats
-{
- u64 taken;
- u32 taken_slow;
- u32 taken_slow_nested;
- u32 taken_slow_pickup;
- u32 taken_slow_spurious;
- u32 taken_slow_irqenable;
+enum xen_contention_stat {
+ TAKEN_SLOW,
+ TAKEN_SLOW_PICKUP,
+ TAKEN_SLOW_SPURIOUS,
+ RELEASED_SLOW,
+ RELEASED_SLOW_KICKED,
+ NR_CONTENTION_STATS
+};
- u64 released;
- u32 released_slow;
- u32 released_slow_kicked;
+#ifdef CONFIG_XEN_DEBUG_FS
#define HISTO_BUCKETS 30
- u32 histo_spin_total[HISTO_BUCKETS+1];
- u32 histo_spin_spinning[HISTO_BUCKETS+1];
+static struct xen_spinlock_stats
+{
+ u32 contention_stats[NR_CONTENTION_STATS];
u32 histo_spin_blocked[HISTO_BUCKETS+1];
-
- u64 time_total;
- u64 time_spinning;
u64 time_blocked;
} spinlock_stats;
static u8 zero_stats;
-static unsigned lock_timeout = 1 << 10;
-#define TIMEOUT lock_timeout
-
static inline void check_zero(void)
{
- if (unlikely(zero_stats)) {
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- zero_stats = 0;
+ u8 ret;
+ u8 old = ACCESS_ONCE(zero_stats);
+ if (unlikely(old)) {
+ ret = cmpxchg(&zero_stats, old, 0);
+ /* This ensures only one fellow resets the stat */
+ if (ret == old)
+ memset(&spinlock_stats, 0, sizeof(spinlock_stats));
}
}
-#define ADD_STATS(elem, val) \
- do { check_zero(); spinlock_stats.elem += (val); } while(0)
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+ check_zero();
+ spinlock_stats.contention_stats[var] += val;
+}
static inline u64 spin_time_start(void)
{
@@ -73,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array)
array[HISTO_BUCKETS]++;
}
-static inline void spin_time_accum_spinning(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
- spinlock_stats.time_spinning += delta;
-}
-
-static inline void spin_time_accum_total(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_total);
- spinlock_stats.time_total += delta;
-}
-
static inline void spin_time_accum_blocked(u64 start)
{
u32 delta = xen_clocksource_read() - start;
@@ -97,285 +81,167 @@ static inline void spin_time_accum_blocked(u64 start)
spinlock_stats.time_blocked += delta;
}
#else /* !CONFIG_XEN_DEBUG_FS */
-#define TIMEOUT (1 << 10)
-#define ADD_STATS(elem, val) do { (void)(val); } while(0)
+static inline void add_stats(enum xen_contention_stat var, u32 val)
+{
+}
static inline u64 spin_time_start(void)
{
return 0;
}
-static inline void spin_time_accum_total(u64 start)
-{
-}
-static inline void spin_time_accum_spinning(u64 start)
-{
-}
static inline void spin_time_accum_blocked(u64 start)
{
}
#endif /* CONFIG_XEN_DEBUG_FS */
-/*
- * Size struct xen_spinlock so it's the same as arch_spinlock_t.
- */
-#if NR_CPUS < 256
-typedef u8 xen_spinners_t;
-# define inc_spinners(xl) \
- asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
-# define dec_spinners(xl) \
- asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
-#else
-typedef u16 xen_spinners_t;
-# define inc_spinners(xl) \
- asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
-# define dec_spinners(xl) \
- asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
-#endif
-
-struct xen_spinlock {
- unsigned char lock; /* 0 -> free; 1 -> locked */
- xen_spinners_t spinners; /* count of waiting cpus */
+struct xen_lock_waiting {
+ struct arch_spinlock *lock;
+ __ticket_t want;
};
-static int xen_spin_is_locked(struct arch_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- return xl->lock != 0;
-}
-
-static int xen_spin_is_contended(struct arch_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- /* Not strictly true; this is only the count of contended
- lock-takers entering the slow path. */
- return xl->spinners != 0;
-}
-
-static int xen_spin_trylock(struct arch_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- u8 old = 1;
-
- asm("xchgb %b0,%1"
- : "+q" (old), "+m" (xl->lock) : : "memory");
-
- return old == 0;
-}
-
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+static DEFINE_PER_CPU(char *, irq_name);
+static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
+static cpumask_t waiting_cpus;
-/*
- * Mark a cpu as interested in a lock. Returns the CPU's previous
- * lock of interest, in case we got preempted by an interrupt.
- */
-static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
+static bool xen_pvspin = true;
+__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
- struct xen_spinlock *prev;
-
- prev = __this_cpu_read(lock_spinners);
- __this_cpu_write(lock_spinners, xl);
-
- wmb(); /* set lock of interest before count */
-
- inc_spinners(xl);
-
- return prev;
-}
-
-/*
- * Mark a cpu as no longer interested in a lock. Restores previous
- * lock of interest (NULL for none).
- */
-static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
-{
- dec_spinners(xl);
- wmb(); /* decrement count before restoring lock */
- __this_cpu_write(lock_spinners, prev);
-}
-
-static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- struct xen_spinlock *prev;
int irq = __this_cpu_read(lock_kicker_irq);
- int ret;
+ struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
+ int cpu = smp_processor_id();
u64 start;
+ unsigned long flags;
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1)
- return 0;
+ return;
start = spin_time_start();
- /* announce we're spinning */
- prev = spinning_lock(xl);
-
- ADD_STATS(taken_slow, 1);
- ADD_STATS(taken_slow_nested, prev != NULL);
+ /*
+ * Make sure an interrupt handler can't upset things in a
+ * partially setup state.
+ */
+ local_irq_save(flags);
+ /*
+ * We don't really care if we're overwriting some other
+ * (lock,want) pair, as that would mean that we're currently
+ * in an interrupt context, and the outer context had
+ * interrupts enabled. That has already kicked the VCPU out
+ * of xen_poll_irq(), so it will just return spuriously and
+ * retry with newly setup (lock,want).
+ *
+ * The ordering protocol on this is that the "lock" pointer
+ * may only be set non-NULL if the "want" ticket is correct.
+ * If we're updating "want", we must first clear "lock".
+ */
+ w->lock = NULL;
+ smp_wmb();
+ w->want = want;
+ smp_wmb();
+ w->lock = lock;
- do {
- unsigned long flags;
+ /* This uses set_bit, which atomic and therefore a barrier */
+ cpumask_set_cpu(cpu, &waiting_cpus);
+ add_stats(TAKEN_SLOW, 1);
- /* clear pending */
- xen_clear_irq_pending(irq);
+ /* clear pending */
+ xen_clear_irq_pending(irq);
- /* check again make sure it didn't become free while
- we weren't looking */
- ret = xen_spin_trylock(lock);
- if (ret) {
- ADD_STATS(taken_slow_pickup, 1);
+ /* Only check lock once pending cleared */
+ barrier();
- /*
- * If we interrupted another spinlock while it
- * was blocking, make sure it doesn't block
- * without rechecking the lock.
- */
- if (prev != NULL)
- xen_set_irq_pending(irq);
- goto out;
- }
+ /*
+ * Mark entry to slowpath before doing the pickup test to make
+ * sure we don't deadlock with an unlocker.
+ */
+ __ticket_enter_slowpath(lock);
- flags = arch_local_save_flags();
- if (irq_enable) {
- ADD_STATS(taken_slow_irqenable, 1);
- raw_local_irq_enable();
- }
+ /*
+ * check again make sure it didn't become free while
+ * we weren't looking
+ */
+ if (ACCESS_ONCE(lock->tickets.head) == want) {
+ add_stats(TAKEN_SLOW_PICKUP, 1);
+ goto out;
+ }
- /*
- * Block until irq becomes pending. If we're
- * interrupted at this point (after the trylock but
- * before entering the block), then the nested lock
- * handler guarantees that the irq will be left
- * pending if there's any chance the lock became free;
- * xen_poll_irq() returns immediately if the irq is
- * pending.
- */
- xen_poll_irq(irq);
+ /* Allow interrupts while blocked */
+ local_irq_restore(flags);
- raw_local_irq_restore(flags);
+ /*
+ * If an interrupt happens here, it will leave the wakeup irq
+ * pending, which will cause xen_poll_irq() to return
+ * immediately.
+ */
- ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
- } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
+ /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+ xen_poll_irq(irq);
+ add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
- kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
+ local_irq_save(flags);
+ kstat_incr_irq_this_cpu(irq);
out:
- unspinning_lock(xl, prev);
- spin_time_accum_blocked(start);
-
- return ret;
-}
-
-static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
- unsigned timeout;
- u8 oldval;
- u64 start_spin;
-
- ADD_STATS(taken, 1);
-
- start_spin = spin_time_start();
+ cpumask_clear_cpu(cpu, &waiting_cpus);
+ w->lock = NULL;
- do {
- u64 start_spin_fast = spin_time_start();
+ local_irq_restore(flags);
- timeout = TIMEOUT;
-
- asm("1: xchgb %1,%0\n"
- " testb %1,%1\n"
- " jz 3f\n"
- "2: rep;nop\n"
- " cmpb $0,%0\n"
- " je 1b\n"
- " dec %2\n"
- " jnz 2b\n"
- "3:\n"
- : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
- : "1" (1)
- : "memory");
-
- spin_time_accum_spinning(start_spin_fast);
-
- } while (unlikely(oldval != 0 &&
- (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
-
- spin_time_accum_total(start_spin);
-}
-
-static void xen_spin_lock(struct arch_spinlock *lock)
-{
- __xen_spin_lock(lock, false);
-}
-
-static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
-{
- __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
+ spin_time_accum_blocked(start);
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
-static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
{
int cpu;
- ADD_STATS(released_slow, 1);
+ add_stats(RELEASED_SLOW, 1);
+
+ for_each_cpu(cpu, &waiting_cpus) {
+ const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
- for_each_online_cpu(cpu) {
- /* XXX should mix up next cpu selection */
- if (per_cpu(lock_spinners, cpu) == xl) {
- ADD_STATS(released_slow_kicked, 1);
+ /* Make sure we read lock before want */
+ if (ACCESS_ONCE(w->lock) == lock &&
+ ACCESS_ONCE(w->want) == next) {
+ add_stats(RELEASED_SLOW_KICKED, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
break;
}
}
}
-static void xen_spin_unlock(struct arch_spinlock *lock)
-{
- struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
- ADD_STATS(released, 1);
-
- smp_wmb(); /* make sure no writes get moved after unlock */
- xl->lock = 0; /* release lock */
-
- /*
- * Make sure unlock happens before checking for waiting
- * spinners. We need a strong barrier to enforce the
- * write-read ordering to different memory locations, as the
- * CPU makes no implied guarantees about their ordering.
- */
- mb();
-
- if (unlikely(xl->spinners))
- xen_spin_unlock_slow(xl);
-}
-
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
return IRQ_HANDLED;
}
-void __cpuinit xen_init_lock_cpu(int cpu)
+void xen_init_lock_cpu(int cpu)
{
int irq;
- const char *name;
+ char *name;
+
+ if (!xen_pvspin)
+ return;
+
+ WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
+ cpu, per_cpu(lock_kicker_irq, cpu));
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_PERCPU|IRQF_NOBALANCING,
name,
NULL);
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
+ per_cpu(irq_name, cpu) = name;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
@@ -383,21 +249,62 @@ void __cpuinit xen_init_lock_cpu(int cpu)
void xen_uninit_lock_cpu(int cpu)
{
+ if (!xen_pvspin)
+ return;
+
unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+ per_cpu(lock_kicker_irq, cpu) = -1;
+ kfree(per_cpu(irq_name, cpu));
+ per_cpu(irq_name, cpu) = NULL;
}
+
+/*
+ * Our init of PV spinlocks is split in two init functions due to us
+ * using paravirt patching and jump labels patching and having to do
+ * all of this before SMP code is invoked.
+ *
+ * The paravirt patching needs to be done _before_ the alternative asm code
+ * is started, otherwise we would not patch the core kernel code.
+ */
void __init xen_init_spinlocks(void)
{
- BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
-
- pv_lock_ops.spin_is_locked = xen_spin_is_locked;
- pv_lock_ops.spin_is_contended = xen_spin_is_contended;
- pv_lock_ops.spin_lock = xen_spin_lock;
- pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
- pv_lock_ops.spin_trylock = xen_spin_trylock;
- pv_lock_ops.spin_unlock = xen_spin_unlock;
+
+ if (!xen_pvspin) {
+ printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
+ return;
+ }
+ printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
+ pv_lock_ops.unlock_kick = xen_unlock_kick;
}
+/*
+ * While the jump_label init code needs to happend _after_ the jump labels are
+ * enabled and before SMP is started. Hence we use pre-SMP initcall level
+ * init. We cannot do it in xen_init_spinlocks as that is done before
+ * jump labels are activated.
+ */
+static __init int xen_init_spinlocks_jump(void)
+{
+ if (!xen_pvspin)
+ return 0;
+
+ if (!xen_domain())
+ return 0;
+
+ static_key_slow_inc(&paravirt_ticketlocks_enabled);
+ return 0;
+}
+early_initcall(xen_init_spinlocks_jump);
+
+static __init int xen_parse_nopvspin(char *arg)
+{
+ xen_pvspin = false;
+ return 0;
+}
+early_param("xen_nopvspin", xen_parse_nopvspin);
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_spin_debug;
@@ -409,41 +316,28 @@ static int __init xen_spinlock_debugfs(void)
if (d_xen == NULL)
return -ENOMEM;
+ if (!xen_pvspin)
+ return 0;
+
d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
- debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
-
- debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.taken_slow);
- debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_nested);
+ &spinlock_stats.contention_stats[TAKEN_SLOW]);
debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_pickup);
+ &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_spurious);
- debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
- &spinlock_stats.taken_slow_irqenable);
+ &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
- debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.released_slow);
+ &spinlock_stats.contention_stats[RELEASED_SLOW]);
debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.released_slow_kicked);
+ &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
- debugfs_create_u64("time_spinning", 0444, d_spin_debug,
- &spinlock_stats.time_spinning);
debugfs_create_u64("time_blocked", 0444, d_spin_debug,
&spinlock_stats.time_blocked);
- debugfs_create_u64("time_total", 0444, d_spin_debug,
- &spinlock_stats.time_total);
- debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
- spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
- debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
- spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index ae8a00c39de..c4df9dbd63b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,8 +12,10 @@
#include "xen-ops.h"
#include "mmu.h"
-void xen_arch_pre_suspend(void)
+static void xen_pv_pre_suspend(void)
{
+ xen_mm_pin_all();
+
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
mfn_to_pfn(xen_start_info->console.domU.mfn);
@@ -26,11 +28,11 @@ void xen_arch_pre_suspend(void)
BUG();
}
-void xen_arch_hvm_post_suspend(int suspend_cancelled)
+static void xen_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
- xen_hvm_resume_shared_info();
+ xen_hvm_init_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
@@ -41,7 +43,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
#endif
}
-void xen_arch_post_suspend(int suspend_cancelled)
+static void xen_pv_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
@@ -60,6 +62,21 @@ void xen_arch_post_suspend(int suspend_cancelled)
xen_vcpu_restore();
}
+ xen_mm_unpin_all();
+}
+
+void xen_arch_pre_suspend(void)
+{
+ if (xen_pv_domain())
+ xen_pv_pre_suspend();
+}
+
+void xen_arch_post_suspend(int cancelled)
+{
+ if (xen_pv_domain())
+ xen_pv_post_suspend(cancelled);
+ else
+ xen_hvm_post_suspend(cancelled);
}
static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 0296a952250..7b78f88c170 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -14,6 +14,8 @@
#include <linux/kernel_stat.h>
#include <linux/math64.h>
#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/pvclock_gtod.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
@@ -36,9 +38,8 @@ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
-/* unused ns of stolen and blocked time */
+/* unused ns of stolen time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
-static DEFINE_PER_CPU(u64, xen_residual_blocked);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
@@ -115,7 +116,7 @@ static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
- s64 blocked, runnable, offline, stolen;
+ s64 runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
@@ -125,7 +126,6 @@ static void do_stolen_accounting(void)
snap = &__get_cpu_var(xen_runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
- blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
@@ -141,17 +141,6 @@ static void do_stolen_accounting(void)
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
__this_cpu_write(xen_residual_stolen, stolen);
account_steal_ticks(ticks);
-
- /* Add the appropriate number of ticks of blocked time,
- including any left-overs from last time. */
- blocked += __this_cpu_read(xen_residual_blocked);
-
- if (blocked < 0)
- blocked = 0;
-
- ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
- __this_cpu_write(xen_residual_blocked, blocked);
- account_idle_ticks(ticks);
}
/* Get the TSC speed from Xen */
@@ -191,34 +180,56 @@ static void xen_read_wallclock(struct timespec *ts)
put_cpu_var(xen_vcpu);
}
-static unsigned long xen_get_wallclock(void)
+static void xen_get_wallclock(struct timespec *now)
{
- struct timespec ts;
+ xen_read_wallclock(now);
+}
- xen_read_wallclock(&ts);
- return ts.tv_sec;
+static int xen_set_wallclock(const struct timespec *now)
+{
+ return -1;
}
-static int xen_set_wallclock(unsigned long now)
+static int xen_pvclock_gtod_notify(struct notifier_block *nb,
+ unsigned long was_set, void *priv)
{
+ /* Protected by the calling core code serialization */
+ static struct timespec next_sync;
+
struct xen_platform_op op;
- int rc;
+ struct timespec now;
- /* do nothing for domU */
- if (!xen_initial_domain())
- return -1;
+ now = __current_kernel_time();
+
+ /*
+ * We only take the expensive HV call when the clock was set
+ * or when the 11 minutes RTC synchronization time elapsed.
+ */
+ if (!was_set && timespec_compare(&now, &next_sync) < 0)
+ return NOTIFY_OK;
op.cmd = XENPF_settime;
- op.u.settime.secs = now;
- op.u.settime.nsecs = 0;
+ op.u.settime.secs = now.tv_sec;
+ op.u.settime.nsecs = now.tv_nsec;
op.u.settime.system_time = xen_clocksource_read();
- rc = HYPERVISOR_dom0_op(&op);
- WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
+ (void)HYPERVISOR_dom0_op(&op);
- return rc;
+ /*
+ * Move the next drift compensation time 11 minutes
+ * ahead. That's emulating the sync_cmos_clock() update for
+ * the hardware RTC.
+ */
+ next_sync = now;
+ next_sync.tv_sec += 11 * 60;
+
+ return NOTIFY_OK;
}
+static struct notifier_block xen_pvclock_gtod_notifier = {
+ .notifier_call = xen_pvclock_gtod_notify,
+};
+
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
@@ -377,11 +388,16 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
static const struct clock_event_device *xen_clockevent =
&xen_timerop_clockevent;
-static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
+
+struct xen_clock_event_device {
+ struct clock_event_device evt;
+ char *name;
+};
+static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
- struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
+ struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt;
irqreturn_t ret;
ret = IRQ_NONE;
@@ -395,12 +411,31 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
return ret;
}
+void xen_teardown_timer(int cpu)
+{
+ struct clock_event_device *evt;
+ BUG_ON(cpu == 0);
+ evt = &per_cpu(xen_clock_events, cpu).evt;
+
+ if (evt->irq >= 0) {
+ unbind_from_irqhandler(evt->irq, NULL);
+ evt->irq = -1;
+ kfree(per_cpu(xen_clock_events, cpu).name);
+ per_cpu(xen_clock_events, cpu).name = NULL;
+ }
+}
+
void xen_setup_timer(int cpu)
{
- const char *name;
+ char *name;
struct clock_event_device *evt;
int irq;
+ evt = &per_cpu(xen_clock_events, cpu).evt;
+ WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
+ if (evt->irq >= 0)
+ xen_teardown_timer(cpu);
+
printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
name = kasprintf(GFP_KERNEL, "timer%d", cpu);
@@ -408,31 +443,24 @@ void xen_setup_timer(int cpu)
name = "<timer kasprintf failed>";
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|
- IRQF_NOBALANCING|IRQF_TIMER|
+ IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
IRQF_FORCE_RESUME,
name, NULL);
+ (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
- evt = &per_cpu(xen_clock_events, cpu);
memcpy(evt, xen_clockevent, sizeof(*evt));
evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
+ per_cpu(xen_clock_events, cpu).name = name;
}
-void xen_teardown_timer(int cpu)
-{
- struct clock_event_device *evt;
- BUG_ON(cpu == 0);
- evt = &per_cpu(xen_clock_events, cpu);
- unbind_from_irqhandler(evt->irq, NULL);
-}
void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
- clockevents_register_device(&__get_cpu_var(xen_clock_events));
+ clockevents_register_device(&__get_cpu_var(xen_clock_events).evt);
}
void xen_timer_resume(void)
@@ -477,6 +505,9 @@ static void __init xen_time_init(void)
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
+
+ if (xen_initial_domain())
+ pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
void __init xen_init_time_ops(void)
@@ -489,7 +520,9 @@ void __init xen_init_time_ops(void)
x86_platform.calibrate_tsc = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
- x86_platform.set_wallclock = xen_set_wallclock;
+ /* Dom0 uses the native method to set the hardware RTC. */
+ if (!xen_initial_domain())
+ x86_platform.set_wallclock = xen_set_wallclock;
}
#ifdef CONFIG_XEN_PVHVM
@@ -497,7 +530,11 @@ static void xen_hvm_setup_cpu_clockevents(void)
{
int cpu = smp_processor_id();
xen_setup_runstate_info(cpu);
- xen_setup_timer(cpu);
+ /*
+ * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
+ * doing it xen_hvm_cpu_notify (which gets called by smp_init during
+ * early bootup and also during CPU hotplug events).
+ */
xen_setup_cpu_clockevents();
}
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index f9643fc50de..fd92a64d748 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -75,6 +75,17 @@ ENDPROC(xen_sysexit)
* stack state in whatever form its in, we keep things simple by only
* using a single register which is pushed/popped on the stack.
*/
+
+.macro POP_FS
+1:
+ popw %fs
+.pushsection .fixup, "ax"
+2: movw $0, (%esp)
+ jmp 1b
+.popsection
+ _ASM_EXTABLE(1b,2b)
+.endm
+
ENTRY(xen_iret)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
@@ -83,17 +94,15 @@ ENTRY(xen_iret)
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
- /*
- * Store vcpu_info pointer for easy access. Do it this way to
- * avoid having to reload %fs
- */
+ /* Store vcpu_info pointer for easy access */
#ifdef CONFIG_SMP
- GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax), %eax
- movl __per_cpu_offset(,%eax,4), %eax
- mov xen_vcpu(%eax), %eax
+ pushw %fs
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+ movl %fs:xen_vcpu, %eax
+ POP_FS
#else
- movl xen_vcpu, %eax
+ movl %ss:xen_vcpu, %eax
#endif
/* check IF state we're restoring */
@@ -106,11 +115,11 @@ ENTRY(xen_iret)
* resuming the code, so we don't have to be worried about
* being preempted to another CPU.
*/
- setz XEN_vcpu_info_mask(%eax)
+ setz %ss:XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
- cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+ cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)
/*
* If there's something pending, mask events again so we can
@@ -118,7 +127,7 @@ xen_iret_start_crit:
* touch XEN_vcpu_info_mask.
*/
jne 1f
- movb $1, XEN_vcpu_info_mask(%eax)
+ movb $1, %ss:XEN_vcpu_info_mask(%eax)
1: popl %eax
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7faed5869e5..485b6958554 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -11,8 +11,28 @@
#include <asm/page_types.h>
#include <xen/interface/elfnote.h>
+#include <xen/interface/features.h>
#include <asm/xen/interface.h>
+#ifdef CONFIG_XEN_PVH
+#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
+/* Note the lack of 'hvm_callback_vector'. Older hypervisor will
+ * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
+ * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
+ */
+#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
+ (1 << XENFEAT_auto_translated_physmap) | \
+ (1 << XENFEAT_supervisor_mode_kernel) | \
+ (1 << XENFEAT_hvm_callback_vector))
+/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that
+ * up regardless whether this CONFIG option is enabled or not, but it
+ * clarifies what the right flags need to be.
+ */
+#else
+#define PVH_FEATURES_STR ""
+#define PVH_FEATURES (0)
+#endif
+
__INIT
ENTRY(startup_xen)
cld
@@ -95,7 +115,10 @@ NEXT_HYPERCALL(arch_6)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
- ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
+ ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
+ (1 << XENFEAT_writable_page_tables) |
+ (1 << XENFEAT_dom0))
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index d2e73d19d36..97d87659f77 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -31,16 +31,19 @@ void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
extern unsigned long xen_max_p2m_pfn;
+void xen_mm_pin_all(void);
+void xen_mm_unpin_all(void);
void xen_set_pat(u64);
char * __init xen_memory_setup(void);
+char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void xen_callback_vector(void);
-void xen_hvm_resume_shared_info(void);
+void xen_hvm_init_shared_info(void);
void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);
@@ -73,7 +76,7 @@ static inline void xen_hvm_smp_init(void) {}
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init xen_init_spinlocks(void);
-void __cpuinit xen_init_lock_cpu(int cpu);
+void xen_init_lock_cpu(int cpu);
void xen_uninit_lock_cpu(int cpu);
#else
static inline void xen_init_spinlocks(void)
@@ -105,9 +108,9 @@ static inline void __init xen_init_apic(void)
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \
- ret name(__VA_ARGS__); \
- extern char name##_end[]; \
- extern char name##_reloc[] \
+ __visible ret name(__VA_ARGS__); \
+ extern char name##_end[] __visible; \
+ extern char name##_reloc[] __visible
DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
@@ -115,12 +118,13 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
-void xen_iret(void);
-void xen_sysexit(void);
-void xen_sysret32(void);
-void xen_sysret64(void);
-void xen_adjust_exception_frame(void);
+__visible void xen_iret(void);
+__visible void xen_sysexit(void);
+__visible void xen_sysret32(void);
+__visible void xen_sysret64(void);
+__visible void xen_adjust_exception_frame(void);
extern int xen_panic_handler_init(void);
+void xen_pvh_secondary_vcpu_init(int cpu);
#endif /* XEN_OPS_H */