diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig | 4 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 138 | ||||
-rw-r--r-- | arch/x86/xen/grant-table.c | 64 | ||||
-rw-r--r-- | arch/x86/xen/irq.c | 13 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 186 | ||||
-rw-r--r-- | arch/x86/xen/p2m.c | 15 | ||||
-rw-r--r-- | arch/x86/xen/platform-pci-unplug.c | 79 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 44 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 49 | ||||
-rw-r--r-- | arch/x86/xen/spinlock.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 1 | ||||
-rw-r--r-- | arch/x86/xen/xen-head.S | 25 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 1 |
13 files changed, 482 insertions, 139 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 1a3c7650564..01b90261fa3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -51,3 +51,7 @@ config XEN_DEBUG_FS Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. +config XEN_PVH + bool "Support for running as a PVH guest" + depends on X86_64 && XEN && XEN_PVHVM + def_bool n diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fa6ade76ef3..201d09a7c46 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -262,8 +262,9 @@ static void __init xen_banner(void) struct xen_extraversion extra; HYPERVISOR_xen_version(XENVER_extraversion, &extra); - printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - pv_info.name); + pr_info("Booting paravirtualized kernel %son %s\n", + xen_feature(XENFEAT_auto_translated_physmap) ? + "with PVH extensions " : "", pv_info.name); printk(KERN_INFO "Xen version: %d.%d%s%s\n", version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); @@ -433,7 +434,7 @@ static void __init xen_init_cpuid_mask(void) ax = 1; cx = 0; - xen_cpuid(&ax, &bx, &cx, &dx); + cpuid(1, &ax, &bx, &cx, &dx); xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) | @@ -1142,8 +1143,9 @@ void xen_setup_vcpu_info_placement(void) xen_vcpu_setup(cpu); /* xen_vcpu_setup managed to place the vcpu_info within the - percpu area for all cpus, so make use of it */ - if (have_vcpu_info_placement) { + * percpu area for all cpus, so make use of it. Note that for + * PVH we want to use native IRQ mechanism. */ + if (have_vcpu_info_placement && !xen_pvh_domain()) { pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); @@ -1407,9 +1409,49 @@ static void __init xen_boot_params_init_edd(void) * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. + * + * Note, that it is __ref because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions. */ -static void __init xen_setup_stackprotector(void) +static void __ref xen_setup_gdt(int cpu) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_X86_64 + unsigned long dummy; + + load_percpu_segment(cpu); /* We need to access per-cpu area */ + switch_to_new_gdt(cpu); /* GDT and GS set */ + + /* We are switching of the Xen provided GDT to our HVM mode + * GDT. The new GDT has __KERNEL_CS with CS.L = 1 + * and we are jumping to reload it. + */ + asm volatile ("pushq %0\n" + "leaq 1f(%%rip),%0\n" + "pushq %0\n" + "lretq\n" + "1:\n" + : "=&r" (dummy) : "0" (__KERNEL_CS)); + + /* + * While not needed, we also set the %es, %ds, and %fs + * to zero. We don't care about %ss as it is NULL. + * Strictly speaking this is not needed as Xen zeros those + * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) + * + * Linux zeros them in cpu_init() and in secondary_startup_64 + * (for BSP). + */ + loadsegment(es, 0); + loadsegment(ds, 0); + loadsegment(fs, 0); +#else + /* PVH: TODO Implement. */ + BUG(); +#endif + return; /* PVH does not need any PV GDT ops. */ + } pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; @@ -1420,6 +1462,58 @@ static void __init xen_setup_stackprotector(void) pv_cpu_ops.load_gdt = xen_load_gdt; } +/* + * A PV guest starts with default flags that are not set for PVH, set them + * here asap. + */ +static void xen_pvh_set_cr_flags(int cpu) +{ + + /* Some of these are setup in 'secondary_startup_64'. The others: + * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests + * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ + write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); + + if (!cpu) + return; + /* + * For BSP, PSE PGE are set in probe_page_size_mask(), for APs + * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. + */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + if (cpu_has_pge) + set_in_cr4(X86_CR4_PGE); +} + +/* + * Note, that it is ref - because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions. + */ +void __ref xen_pvh_secondary_vcpu_init(int cpu) +{ + xen_setup_gdt(cpu); + xen_pvh_set_cr_flags(cpu); +} + +static void __init xen_pvh_early_guest_init(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (!xen_feature(XENFEAT_hvm_callback_vector)) + return; + + xen_have_vector_callback = 1; + xen_pvh_set_cr_flags(0); + +#ifdef CONFIG_X86_32 + BUG(); /* PVH: Implement proper support. */ +#endif +} + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1431,13 +1525,16 @@ asmlinkage void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; + xen_setup_features(); + xen_pvh_early_guest_init(); xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; + if (!xen_pvh_domain()) + pv_cpu_ops = xen_cpu_ops; x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; @@ -1469,17 +1566,14 @@ asmlinkage void __init xen_start_kernel(void) /* Work out if we support NX */ x86_configure_nx(); - xen_setup_features(); - /* Get mfn list */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_build_dynamic_phys_to_machine(); + xen_build_dynamic_phys_to_machine(); /* * Set up kernel GDT and segment registers, mainly so that * -fstack-protector code can be executed. */ - xen_setup_stackprotector(); + xen_setup_gdt(0); xen_init_irq_ops(); xen_init_cpuid_mask(); @@ -1548,14 +1642,18 @@ asmlinkage void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); - /* We used to do this in xen_arch_setup, but that is too late on AMD - * were early_cpu_init (run before ->arch_setup()) calls early_amd_init - * which pokes 0xcf8 port. - */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); + /* PVH: runs at default kernel iopl of 0 */ + if (!xen_pvh_domain()) { + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); + } #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 3a5f55d5190..c9858358858 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -125,3 +125,67 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) apply_to_page_range(&init_mm, (unsigned long)shared, PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); } +#ifdef CONFIG_XEN_PVH +#include <xen/balloon.h> +#include <xen/events.h> +#include <xen/xen.h> +#include <linux/slab.h> +static int __init xlated_setup_gnttab_pages(void) +{ + struct page **pages; + xen_pfn_t *pfns; + int rc; + unsigned int i; + unsigned long nr_grant_frames = gnttab_max_grant_frames(); + + BUG_ON(nr_grant_frames == 0); + pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL); + if (!pfns) { + kfree(pages); + return -ENOMEM; + } + rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + if (rc) { + pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, + nr_grant_frames, rc); + kfree(pages); + kfree(pfns); + return rc; + } + for (i = 0; i < nr_grant_frames; i++) + pfns[i] = page_to_pfn(pages[i]); + + rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames, + &xen_auto_xlat_grant_frames.vaddr); + + if (rc) { + pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__, + nr_grant_frames, rc); + free_xenballooned_pages(nr_grant_frames, pages); + kfree(pages); + kfree(pfns); + return rc; + } + kfree(pages); + + xen_auto_xlat_grant_frames.pfn = pfns; + xen_auto_xlat_grant_frames.count = nr_grant_frames; + + return 0; +} + +static int __init xen_pvh_gnttab_setup(void) +{ + if (!xen_pvh_domain()) + return -ENODEV; + + return xlated_setup_gnttab_pages(); +} +/* Call it _before_ __gnttab_init as we need to initialize the + * xen_auto_xlat_grant_frames first. */ +core_initcall(xen_pvh_gnttab_setup); +#endif diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 0da7f863056..08f763de26f 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -5,6 +5,7 @@ #include <xen/interface/xen.h> #include <xen/interface/sched.h> #include <xen/interface/vcpu.h> +#include <xen/features.h> #include <xen/events.h> #include <asm/xen/hypercall.h> @@ -22,7 +23,7 @@ void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -static unsigned long xen_save_fl(void) +asmlinkage unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -40,7 +41,7 @@ static unsigned long xen_save_fl(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); -static void xen_restore_fl(unsigned long flags) +__visible void xen_restore_fl(unsigned long flags) { struct vcpu_info *vcpu; @@ -62,7 +63,7 @@ static void xen_restore_fl(unsigned long flags) } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); -static void xen_irq_disable(void) +asmlinkage void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -73,7 +74,7 @@ static void xen_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); -static void xen_irq_enable(void) +asmlinkage void xen_irq_enable(void) { struct vcpu_info *vcpu; @@ -128,6 +129,8 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { void __init xen_init_irq_ops(void) { - pv_irq_ops = xen_irq_ops; + /* For PVH we use default pv_irq_ops settings. */ + if (!xen_feature(XENFEAT_hvm_callback_vector)) + pv_irq_ops = xen_irq_ops; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ce563be09cc..256282e7888 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, /* Assume pteval_t is equivalent to all the other *val_t types. */ static pteval_t pte_mfn_to_pfn(pteval_t val) { - if (val & _PAGE_PRESENT) { + if (pteval_present(val)) { unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; unsigned long pfn = mfn_to_pfn(mfn); @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) static pteval_t pte_pfn_to_mfn(pteval_t val) { - if (val & _PAGE_PRESENT) { + if (pteval_present(val)) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; unsigned long mfn; @@ -431,7 +431,7 @@ static pteval_t iomap_pte(pteval_t val) return val; } -static pteval_t xen_pte_val(pte_t pte) +__visible pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; #if 0 @@ -448,7 +448,7 @@ static pteval_t xen_pte_val(pte_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); -static pgdval_t xen_pgd_val(pgd_t pgd) +__visible pgdval_t xen_pgd_val(pgd_t pgd) { return pte_mfn_to_pfn(pgd.pgd); } @@ -479,7 +479,7 @@ void xen_set_pat(u64 pat) WARN_ON(pat != 0x0007010600070106ull); } -static pte_t xen_make_pte(pteval_t pte) +__visible pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); #if 0 @@ -514,14 +514,14 @@ static pte_t xen_make_pte(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); -static pgd_t xen_make_pgd(pgdval_t pgd) +__visible pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); return native_make_pgd(pgd); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); -static pmdval_t xen_pmd_val(pmd_t pmd) +__visible pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } @@ -580,7 +580,7 @@ static void xen_pmd_clear(pmd_t *pmdp) } #endif /* CONFIG_X86_PAE */ -static pmd_t xen_make_pmd(pmdval_t pmd) +__visible pmd_t xen_make_pmd(pmdval_t pmd) { pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); @@ -588,13 +588,13 @@ static pmd_t xen_make_pmd(pmdval_t pmd) PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); #if PAGETABLE_LEVELS == 4 -static pudval_t xen_pud_val(pud_t pud) +__visible pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); } PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); -static pud_t xen_make_pud(pudval_t pud) +__visible pud_t xen_make_pud(pudval_t pud) { pud = pte_pfn_to_mfn(pud); @@ -1198,44 +1198,40 @@ static void __init xen_cleanhighmap(unsigned long vaddr, * instead of somewhere later and be confusing. */ xen_mc_flush(); } -#endif -static void __init xen_pagetable_init(void) +static void __init xen_pagetable_p2m_copy(void) { -#ifdef CONFIG_X86_64 unsigned long size; unsigned long addr; -#endif - paging_init(); - xen_setup_shared_info(); -#ifdef CONFIG_X86_64 - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long new_mfn_list; - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - - /* On 32-bit, we get zero so this never gets executed. */ - new_mfn_list = xen_revector_p2m_tree(); - if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { - /* using __ka address and sticking INVALID_P2M_ENTRY! */ - memset((void *)xen_start_info->mfn_list, 0xff, size); - - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); - addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ - size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = new_mfn_list; - } else - goto skip; - } + unsigned long new_mfn_list; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + new_mfn_list = xen_revector_p2m_tree(); + /* No memory or already called. */ + if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) + return; + + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page @@ -1255,7 +1251,15 @@ static void __init xen_pagetable_init(void) * anything at this stage. */ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); #endif -skip: +} +#endif + +static void __init xen_pagetable_init(void) +{ + paging_init(); + xen_setup_shared_info(); +#ifdef CONFIG_X86_64 + xen_pagetable_p2m_copy(); #endif xen_post_allocator_init(); } @@ -1753,6 +1757,10 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); + /* For PVH no need to set R/O or R/W to pin them or unpin them. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } @@ -1863,6 +1871,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * but that's enough to get __va working. We need to fill in the rest * of the physical mapping once some sort of allocator has been set * up. + * NOTE: for PVH, the page tables are native. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -1884,17 +1893,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); - /* Pre-constructed entries are in pfn, so convert to mfn */ - /* L4[272] -> level3_ident_pgt - * L4[511] -> level3_kernel_pgt */ - convert_pfn_mfn(init_level4_pgt); - - /* L3_i[0] -> level2_ident_pgt */ - convert_pfn_mfn(level3_ident_pgt); - /* L3_k[510] -> level2_kernel_pgt - * L3_i[511] -> level2_fixmap_pgt */ - convert_pfn_mfn(level3_kernel_pgt); - + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ + convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ + convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_i[511] -> level2_fixmap_pgt */ + convert_pfn_mfn(level3_kernel_pgt); + } /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); @@ -1918,31 +1928,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) copy_page(level2_fixmap_pgt, l2); /* Note that we don't do anything with level1_fixmap_pgt which * we don't need. */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); - - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. - */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(init_level4_pgt)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + } else + native_write_cr3(__pa(init_level4_pgt)); /* We can't that easily rip out L3 and L2, as the Xen pagetables are * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for @@ -2103,6 +2115,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void __init xen_post_allocator_init(void) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; @@ -2207,6 +2222,15 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; + + /* Optimization - we can use the HVM one but it has no idea which + * VCPUs are descheduled - which means that it will needlessly IPI + * them. Xen knows so let it do the job. + */ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + return; + } pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2ae8699e876..696c694986d 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -280,6 +280,9 @@ void __ref xen_build_mfn_list_list(void) { unsigned long pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); @@ -336,6 +339,9 @@ void __ref xen_build_mfn_list_list(void) void xen_setup_mfn_list_list(void) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = @@ -346,10 +352,15 @@ void xen_setup_mfn_list_list(void) /* Set up p2m_top to point to the domain-builder provided p2m pages */ void __init xen_build_dynamic_phys_to_machine(void) { - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; - unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned long *mfn_list; + unsigned long max_pfn; unsigned long pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + mfn_list = (unsigned long *)xen_start_info->mfn_list; + max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); xen_max_p2m_pfn = max_pfn; p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 0a7852483ff..a8261716d58 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -30,10 +30,9 @@ #define XEN_PLATFORM_ERR_PROTOCOL -2 #define XEN_PLATFORM_ERR_BLACKLIST -3 -/* store the value of xen_emul_unplug after the unplug is done */ -int xen_platform_pci_unplug; -EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); #ifdef CONFIG_XEN_PVHVM +/* store the value of xen_emul_unplug after the unplug is done */ +static int xen_platform_pci_unplug; static int xen_emul_unplug; static int check_platform_magic(void) @@ -69,6 +68,80 @@ static int check_platform_magic(void) return 0; } +bool xen_has_pv_devices() +{ + if (!xen_domain()) + return false; + + /* PV domains always have them. */ + if (xen_pv_domain()) + return true; + + /* And user has xen_platform_pci=0 set in guest config as + * driver did not modify the value. */ + if (xen_platform_pci_unplug == 0) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_ALL) + return true; + + /* This is an odd one - we are going to run legacy + * and PV drivers at the same time. */ + if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) + return true; + + /* And the caller has to follow with xen_pv_{disk,nic}_devices + * to be certain which driver can load. */ + return false; +} +EXPORT_SYMBOL_GPL(xen_has_pv_devices); + +static bool __xen_has_pv_device(int state) +{ + /* HVM domains might or might not */ + if (xen_hvm_domain() && (xen_platform_pci_unplug & state)) + return true; + + return xen_has_pv_devices(); +} + +bool xen_has_pv_nic_devices(void) +{ + return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL); +} +EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices); + +bool xen_has_pv_disk_devices(void) +{ + return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS | + XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL); +} +EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices); + +/* + * This one is odd - it determines whether you want to run PV _and_ + * legacy (IDE) drivers together. This combination is only possible + * under HVM. + */ +bool xen_has_pv_and_legacy_disk_devices(void) +{ + if (!xen_domain()) + return false; + + /* N.B. This is only ever used in HVM mode */ + if (xen_pv_domain()) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices); + void xen_unplug_emulated_devices(void) { int r; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 68c054f59de..0982233b9b8 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,6 +27,7 @@ #include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/features.h> +#include "mmu.h" #include "xen-ops.h" #include "vdso.h" @@ -34,7 +35,7 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; #ifdef CONFIG_X86_64 -extern const char nmi[]; +extern asmlinkage void nmi(void); #endif extern void xen_sysenter_target(void); extern void xen_syscall_target(void); @@ -81,6 +82,9 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + xen_max_p2m_pfn = PFN_DOWN(start + size); for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -103,6 +107,7 @@ static unsigned long __init xen_do_chunk(unsigned long start, .domid = DOMID_SELF }; unsigned long len = 0; + int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap); unsigned long pfn; int ret; @@ -116,7 +121,7 @@ static unsigned long __init xen_do_chunk(unsigned long start, continue; frame = mfn; } else { - if (mfn != INVALID_P2M_ENTRY) + if (!xlated_phys && mfn != INVALID_P2M_ENTRY) continue; frame = pfn; } @@ -154,6 +159,13 @@ static unsigned long __init xen_do_chunk(unsigned long start, static unsigned long __init xen_release_chunk(unsigned long start, unsigned long end) { + /* + * Xen already ballooned out the E820 non RAM regions for us + * and set them up properly in EPT. + */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return end - start; + return xen_do_chunk(start, end, true); } @@ -222,7 +234,13 @@ static void __init xen_set_identity_and_release_chunk( * (except for the ISA region which must be 1:1 mapped) to * release the refcounts (in Xen) on the original frames. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + + /* + * PVH E820 matches the hypervisor's P2M which means we need to + * account for the proper values of *release and *identity. + */ + for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) && + pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { pte_t pte = __pte_ma(0); if (pfn < PFN_UP(ISA_END_ADDRESS)) @@ -559,20 +577,17 @@ void xen_enable_syscall(void) void xen_enable_nmi(void) { #ifdef CONFIG_X86_64 - if (register_callback(CALLBACKTYPE_nmi, nmi)) + if (register_callback(CALLBACKTYPE_nmi, (char *)nmi)) BUG(); #endif } -void __init xen_arch_setup(void) +void __init xen_pvmmu_arch_setup(void) { - xen_panic_handler_init(); - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - HYPERVISOR_vm_assist(VMASST_CMD_enable, - VMASST_TYPE_pae_extended_cr3); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_pae_extended_cr3); if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) @@ -581,6 +596,15 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); xen_enable_nmi(); +} + +/* This function is not called for HVM domains */ +void __init xen_arch_setup(void) +{ + xen_panic_handler_init(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_pvmmu_arch_setup(); + #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c36b325abd8..a18eadd8bb4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -73,9 +73,11 @@ static void cpu_bringup(void) touch_softlockup_watchdog(); preempt_disable(); - xen_enable_sysenter(); - xen_enable_syscall(); - + /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ + if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { + xen_enable_sysenter(); + xen_enable_syscall(); + } cpu = smp_processor_id(); smp_store_cpu_info(cpu); cpu_data(cpu).x86_max_cores = 1; @@ -97,8 +99,14 @@ static void cpu_bringup(void) wmb(); /* make sure everything is out */ } -static void cpu_bringup_and_idle(void) +/* Note: cpu parameter is only relevant for PVH */ +static void cpu_bringup_and_idle(int cpu) { +#ifdef CONFIG_X86_64 + if (xen_feature(XENFEAT_auto_translated_physmap) && + xen_feature(XENFEAT_supervisor_mode_kernel)) + xen_pvh_secondary_vcpu_init(cpu); +#endif cpu_bringup(); cpu_startup_entry(CPUHP_ONLINE); } @@ -274,9 +282,10 @@ static void __init xen_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); if (xen_pv_domain()) { - /* We've switched to the "real" per-cpu gdt, so make sure the - old memory can be recycled */ - make_lowmem_page_readwrite(xen_initial_gdt); + if (!xen_feature(XENFEAT_writable_page_tables)) + /* We've switched to the "real" per-cpu gdt, so make + * sure the old memory can be recycled. */ + make_lowmem_page_readwrite(xen_initial_gdt); #ifdef CONFIG_X86_32 /* @@ -360,22 +369,21 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_table(cpu); - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.ss = __KERNEL_DS; #ifdef CONFIG_X86_32 + /* Note: PVH is not yet supported on x86_32. */ ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; -#else - ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - { + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; xen_copy_trap_info(ctxt->trap_ctxt); @@ -396,18 +404,27 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; ctxt->failsafe_callback_cs = __KERNEL_CS; +#else + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); +#ifdef CONFIG_X86_32 } - ctxt->user_regs.cs = __KERNEL_CS; +#else + } else + /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with + * %rdi having the cpu number - which means are passing in + * as the first parameter the cpu. Subtle! + */ + ctxt->user_regs.rdi = cpu; +#endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) BUG(); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 0e36cde12f7..581521c843a 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -106,7 +106,7 @@ static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; static bool xen_pvspin = true; -static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 12a1ca707b9..7b78f88c170 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -446,6 +446,7 @@ void xen_setup_timer(int cpu) IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| IRQF_FORCE_RESUME, name, NULL); + (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); memcpy(evt, xen_clockevent, sizeof(*evt)); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7faed5869e5..485b6958554 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -11,8 +11,28 @@ #include <asm/page_types.h> #include <xen/interface/elfnote.h> +#include <xen/interface/features.h> #include <asm/xen/interface.h> +#ifdef CONFIG_XEN_PVH +#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" +/* Note the lack of 'hvm_callback_vector'. Older hypervisor will + * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in + * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. + */ +#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ + (1 << XENFEAT_auto_translated_physmap) | \ + (1 << XENFEAT_supervisor_mode_kernel) | \ + (1 << XENFEAT_hvm_callback_vector)) +/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that + * up regardless whether this CONFIG option is enabled or not, but it + * clarifies what the right flags need to be. + */ +#else +#define PVH_FEATURES_STR "" +#define PVH_FEATURES (0) +#endif + __INIT ENTRY(startup_xen) cld @@ -95,7 +115,10 @@ NEXT_HYPERCALL(arch_6) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | + (1 << XENFEAT_writable_page_tables) | + (1 << XENFEAT_dom0)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 95f8c614232..1cb6f4c3730 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -123,4 +123,5 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); +void xen_pvh_secondary_vcpu_init(int cpu); #endif /* XEN_OPS_H */ |