diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig | 5 | ||||
-rw-r--r-- | arch/x86/xen/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 201 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 328 | ||||
-rw-r--r-- | arch/x86/xen/mmu.h | 1 | ||||
-rw-r--r-- | arch/x86/xen/pci-swiotlb-xen.c | 58 | ||||
-rw-r--r-- | arch/x86/xen/platform-pci-unplug.c | 143 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 72 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/suspend.c | 12 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 96 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 13 |
12 files changed, 863 insertions, 71 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index b83e119fbeb..68128a1b401 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -13,6 +13,11 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. +config XEN_PVHVM + def_bool y + depends on XEN + depends on X86_LOCAL_APIC + config XEN_MAX_DOMAIN_MEMORY int "Maximum allowed size of a domain in gigabytes" default 8 if X86_32 diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3bb4fc21f4f..77938515891 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,9 +12,10 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o + grant-table.o suspend.o platform-pci-unplug.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o +obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a..7d46c844141 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -11,6 +11,7 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ +#include <linux/cpu.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/smp.h> @@ -35,8 +36,10 @@ #include <xen/interface/version.h> #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> +#include <xen/interface/memory.h> #include <xen/features.h> #include <xen/page.h> +#include <xen/hvm.h> #include <xen/hvc-console.h> #include <asm/paravirt.h> @@ -55,7 +58,9 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> +#include <asm/setup.h> #include <asm/stackprotector.h> +#include <asm/hypervisor.h> #include "xen-ops.h" #include "mmu.h" @@ -76,6 +81,10 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; +RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); + /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. @@ -97,6 +106,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +static void clamp_max_cpus(void) +{ +#ifdef CONFIG_SMP + if (setup_max_cpus > MAX_VIRT_CPUS) + setup_max_cpus = MAX_VIRT_CPUS; +#endif +} + static void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; @@ -104,13 +121,17 @@ static void xen_vcpu_setup(int cpu) struct vcpu_info *vcpup; BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - if (!have_vcpu_info_placement) - return; /* already tested, not available */ + if (cpu < MAX_VIRT_CPUS) + per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - vcpup = &per_cpu(xen_vcpu_info, cpu); + if (!have_vcpu_info_placement) { + if (cpu >= MAX_VIRT_CPUS) + clamp_max_cpus(); + return; + } + vcpup = &per_cpu(xen_vcpu_info, cpu); info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); @@ -125,6 +146,7 @@ static void xen_vcpu_setup(int cpu) if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); have_vcpu_info_placement = 0; + clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if later ones fail to. */ @@ -731,7 +753,6 @@ static void set_xen_basic_apic_ops(void) #endif - static void xen_clts(void) { struct multicall_space mcs; @@ -926,10 +947,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, }; -static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, -}; - static const struct pv_cpu_ops xen_cpu_ops __initdata = { .cpuid = xen_cpuid, @@ -1028,6 +1045,23 @@ static void xen_crash_shutdown(struct pt_regs *regs) xen_reboot(SHUTDOWN_crash); } +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + xen_reboot(SHUTDOWN_crash); + return NOTIFY_DONE; +} + +static struct notifier_block xen_panic_block = { + .notifier_call= xen_panic_event, +}; + +int xen_panic_handler_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + return 0; +} + static const struct machine_ops __initdata xen_machine_ops = { .restart = xen_restart, .halt = xen_machine_halt, @@ -1067,7 +1101,6 @@ asmlinkage void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; @@ -1075,13 +1108,7 @@ asmlinkage void __init xen_start_kernel(void) x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; - x86_init.timers.timer_init = xen_time_init; - x86_init.timers.setup_percpu_clockev = x86_init_noop; - x86_cpuinit.setup_percpu_clockev = x86_init_noop; - - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; - x86_platform.set_wallclock = xen_set_wallclock; + xen_init_time_ops(); /* * Set up some pagetable state before starting to set any ptes. @@ -1145,6 +1172,10 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; + if (!xen_initial_domain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + + __supported_pte_mask |= _PAGE_IOMAP; /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; @@ -1206,3 +1237,139 @@ asmlinkage void __init xen_start_kernel(void) x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } + +static uint32_t xen_cpuid_base(void) +{ + uint32_t base, eax, ebx, ecx, edx; + char signature[13]; + + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &ebx, &ecx, &edx); + *(uint32_t *)(signature + 0) = ebx; + *(uint32_t *)(signature + 4) = ecx; + *(uint32_t *)(signature + 8) = edx; + signature[12] = 0; + + if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) + return base; + } + + return 0; +} + +static int init_hvm_pv_info(int *major, int *minor) +{ + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + *major = eax >> 16; + *minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info = xen_info; + pv_info.kernel_rpl = 0; + + xen_domain_type = XEN_HVM_DOMAIN; + + return 0; +} + +void xen_hvm_init_shared_info(void) +{ + int cpu; + struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page = 0; + + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; + + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. We don't need the vcpu_info placement + * optimizations because we don't use any pv_mmu or pv_irq op on + * HVM. + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and + * in that case multiple vcpus might be online. */ + for_each_online_cpu(cpu) { + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + } +} + +#ifdef CONFIG_XEN_PVHVM +static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { + .notifier_call = xen_hvm_cpu_notify, +}; + +static void __init xen_hvm_guest_init(void) +{ + int r; + int major, minor; + + r = init_hvm_pv_info(&major, &minor); + if (r < 0) + return; + + xen_hvm_init_shared_info(); + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + register_cpu_notifier(&xen_hvm_cpu_notifier); + xen_unplug_emulated_devices(); + have_vcpu_info_placement = 0; + x86_init.irqs.intr_init = xen_init_IRQ; + xen_hvm_init_time_ops(); + xen_hvm_init_mmu_ops(); +} + +static bool __init xen_hvm_platform(void) +{ + if (xen_pv_domain()) + return false; + + if (!xen_cpuid_base()) + return false; + + return true; +} + +const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_hvm_platform, + .init_platform = xen_hvm_guest_init, +}; +EXPORT_SYMBOL(x86_hyper_xen_hvm); +#endif diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce..42086ac406a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -42,6 +42,7 @@ #include <linux/highmem.h> #include <linux/debugfs.h> #include <linux/bug.h> +#include <linux/vmalloc.h> #include <linux/module.h> #include <linux/gfp.h> @@ -51,14 +52,19 @@ #include <asm/mmu_context.h> #include <asm/setup.h> #include <asm/paravirt.h> +#include <asm/e820.h> #include <asm/linkage.h> +#include <asm/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/interface/xen.h> +#include <xen/interface/hvm/hvm_op.h> #include <xen/interface/version.h> +#include <xen/interface/memory.h> #include <xen/hvc-console.h> #include "multicalls.h" @@ -67,6 +73,13 @@ #define MMU_UPDATE_HISTO 30 +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and driver_pages, and + * balloon lists. + */ +DEFINE_SPINLOCK(xen_reservation_lock); + #ifdef CONFIG_XEN_DEBUG_FS static struct { @@ -377,6 +390,28 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } +static bool xen_iomap_pte(pte_t pte) +{ + return pte_flags(pte) & _PAGE_IOMAP; +} + +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + + /* ptep might be kmapped when using 32-bit HIGHPTE */ + u->ptr = arbitrary_virt_to_machine(ptep).maddr; + u->val = pte_val_ma(pteval); + + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; @@ -453,6 +488,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + if (xen_iomap_pte(pteval)) { + xen_set_iomap_pte(ptep, pteval); + goto out; + } + ADD_STATS(set_pte_at, 1); // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); ADD_STATS(set_pte_at_current, mm == current->mm); @@ -523,8 +563,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) return val; } +static pteval_t iomap_pte(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; + + /* We assume the pte frame number is a MFN, so + just use it as-is. */ + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; +} + pteval_t xen_pte_val(pte_t pte) { + if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) + return pte.pte; + return pte_mfn_to_pfn(pte.pte); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -537,7 +594,22 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { - pte = pte_pfn_to_mfn(pte); + phys_addr_t addr = (pte & PTE_PFN_MASK); + + /* + * Unprivileged domains are allowed to do IOMAPpings for + * PCI passthrough, but not map ISA space. The ISA + * mappings are just dummy local mappings to keep other + * parts of the kernel happy. + */ + if (unlikely(pte & _PAGE_IOMAP) && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { + pte = iomap_pte(pte); + } else { + pte &= ~_PAGE_IOMAP; + pte = pte_pfn_to_mfn(pte); + } + return native_make_pte(pte); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); @@ -593,6 +665,11 @@ void xen_set_pud(pud_t *ptr, pud_t val) void xen_set_pte(pte_t *ptep, pte_t pte) { + if (xen_iomap_pte(pte)) { + xen_set_iomap_pte(ptep, pte); + return; + } + ADD_STATS(pte_update, 1); // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); @@ -609,6 +686,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte) #ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { + if (xen_iomap_pte(pte)) { + xen_set_iomap_pte(ptep, pte); + return; + } + set_64bit((u64 *)ptep, native_pte_val(pte)); } @@ -935,8 +1017,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page, read-only, and can be pinned. */ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { - vm_unmap_aliases(); - xen_mc_batch(); if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { @@ -1500,7 +1580,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l if (PagePinned(virt_to_page(mm->pgd))) { SetPagePinned(page); - vm_unmap_aliases(); if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE && USE_SPLIT_PTLOCKS) @@ -1811,9 +1890,16 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) pte = pfn_pte(phys, prot); break; - default: + case FIX_PARAVIRT_BOOTMAP: + /* This is an MFN, but it isn't an IO mapping from the + IO domain */ pte = mfn_pte(phys, prot); break; + + default: + /* By default, set_fixmap is used for hardware mappings */ + pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); + break; } __native_set_fixmap(idx, pte); @@ -1939,8 +2025,240 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; + + vmap_lazy_unmap = false; +} + +/* Protected by xen_reservation_lock. */ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; + +#define VOID_PTE (mfn_pte(0, __pgprot(0))) +static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, + unsigned long *in_frames, + unsigned long *out_frames) +{ + int i; + struct multicall_space mcs; + + xen_mc_batch(); + for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { + mcs = __xen_mc_entry(0); + + if (in_frames) + in_frames[i] = virt_to_mfn(vaddr); + + MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); + set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + + if (out_frames) + out_frames[i] = virt_to_pfn(vaddr); + } + xen_mc_issue(0); +} + +/* + * Update the pfn-to-mfn mappings for a virtual address range, either to + * point to an array of mfns, or contiguously from a single starting + * mfn. + */ +static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, + unsigned long *mfns, + unsigned long first_mfn) +{ + unsigned i, limit; + unsigned long mfn; + + xen_mc_batch(); + + limit = 1u << order; + for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { + struct multicall_space mcs; + unsigned flags; + + mcs = __xen_mc_entry(0); + if (mfns) + mfn = mfns[i]; + else + mfn = first_mfn + i; + + if (i < (limit - 1)) + flags = 0; + else { + if (order == 0) + flags = UVMF_INVLPG | UVMF_ALL; + else + flags = UVMF_TLB_FLUSH | UVMF_ALL; + } + + MULTI_update_va_mapping(mcs.mc, vaddr, + mfn_pte(mfn, PAGE_KERNEL), flags); + + set_phys_to_machine(virt_to_pfn(vaddr), mfn); + } + + xen_mc_issue(0); +} + +/* + * Perform the hypercall to exchange a region of our pfns to point to + * memory with the required contiguous alignment. Takes the pfns as + * input, and populates mfns as output. + * + * Returns a success code indicating whether the hypervisor was able to + * satisfy the request or not. + */ +static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, + unsigned long *pfns_in, + unsigned long extents_out, + unsigned int order_out, + unsigned long *mfns_out, + unsigned int address_bits) +{ + long rc; + int success; + + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = extents_in, + .extent_order = order_in, + .extent_start = pfns_in, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = extents_out, + .extent_order = order_out, + .extent_start = mfns_out, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + BUG_ON(extents_in << order_in != extents_out << order_out); + + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == extents_in); + + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); + + return success; } +int xen_create_contiguous_region(unsigned long vstart, unsigned int order, + unsigned int address_bits) +{ + unsigned long *in_frames = discontig_frames, out_frame; + unsigned long flags; + int success; + + /* + * Currently an auto-translated guest will not perform I/O, nor will + * it require PAE page directories below 4GB. Therefore any calls to + * this function are redundant and can be ignored. + */ + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Zap current PTEs, remembering MFNs. */ + xen_zap_pfn_range(vstart, order, in_frames, NULL); + + /* 2. Get a new contiguous memory extent. */ + out_frame = virt_to_pfn(vstart); + success = xen_exchange_memory(1UL << order, 0, in_frames, + 1, order, &out_frame, + address_bits); + + /* 3. Map the new extent in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); + else + xen_remap_exchanged_ptes(vstart, order, in_frames, 0); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) +{ + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long flags; + int success; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = virt_to_mfn(vstart); + + /* 2. Zap current PTEs. */ + xen_zap_pfn_range(vstart, order, NULL, out_frames); + + /* 3. Do the exchange for non-contiguous MFNs. */ + success = xen_exchange_memory(1, order, &in_frame, 1UL << order, + 0, out_frames, 0); + + /* 4. Map new pages in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, out_frames, 0); + else + xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + +#ifdef CONFIG_XEN_PVHVM +static void xen_hvm_exit_mmap(struct mm_struct *mm) +{ + struct xen_hvm_pagetable_dying a; + int rc; + + a.domid = DOMID_SELF; + a.gpa = __pa(mm->pgd); + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + WARN_ON_ONCE(rc < 0); +} + +static int is_pagetable_dying_supported(void) +{ + struct xen_hvm_pagetable_dying a; + int rc = 0; + + a.domid = DOMID_SELF; + a.gpa = 0x00; + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + if (rc < 0) { + printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); + return 0; + } + return 1; +} + +void __init xen_hvm_init_mmu_ops(void) +{ + if (is_pagetable_dying_supported()) + pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; +} +#endif + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 5fe6bc7f5ec..fa938c4aa2f 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, unsigned long xen_read_cr2_direct(void); extern void xen_init_mmu_ops(void); +extern void xen_hvm_init_mmu_ops(void); #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c new file mode 100644 index 00000000000..a013ec9d0c5 --- /dev/null +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -0,0 +1,58 @@ +/* Glue code to lib/swiotlb-xen.c */ + +#include <linux/dma-mapping.h> +#include <xen/swiotlb-xen.h> + +#include <asm/xen/hypervisor.h> +#include <xen/xen.h> + +int xen_swiotlb __read_mostly; + +static struct dma_map_ops xen_swiotlb_dma_ops = { + .mapping_error = xen_swiotlb_dma_mapping_error, + .alloc_coherent = xen_swiotlb_alloc_coherent, + .free_coherent = xen_swiotlb_free_coherent, + .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, + .sync_single_for_device = xen_swiotlb_sync_single_for_device, + .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, + .map_sg = xen_swiotlb_map_sg_attrs, + .unmap_sg = xen_swiotlb_unmap_sg_attrs, + .map_page = xen_swiotlb_map_page, + .unmap_page = xen_swiotlb_unmap_page, + .dma_supported = xen_swiotlb_dma_supported, +}; + +/* + * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary + * + * This returns non-zero if we are forced to use xen_swiotlb (by the boot + * option). + */ +int __init pci_xen_swiotlb_detect(void) +{ + + /* If running as PV guest, either iommu=soft, or swiotlb=force will + * activate this IOMMU. If running as PV privileged, activate it + * irregardlesss. + */ + if ((xen_initial_domain() || swiotlb || swiotlb_force) && + (xen_pv_domain())) + xen_swiotlb = 1; + + /* If we are running under Xen, we MUST disable the native SWIOTLB. + * Don't worry about swiotlb_force flag activating the native, as + * the 'swiotlb' flag is the only one turning it on. */ + if (xen_pv_domain()) + swiotlb = 0; + + return xen_swiotlb; +} + +void __init pci_xen_swiotlb_init(void) +{ + if (xen_swiotlb) { + xen_swiotlb_init(1); + dma_ops = &xen_swiotlb_dma_ops; + } +} diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c new file mode 100644 index 00000000000..0f456386cce --- /dev/null +++ b/arch/x86/xen/platform-pci-unplug.c @@ -0,0 +1,143 @@ +/****************************************************************************** + * platform-pci-unplug.c + * + * Xen platform PCI device driver + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> + +#include <xen/platform_pci.h> + +#define XEN_PLATFORM_ERR_MAGIC -1 +#define XEN_PLATFORM_ERR_PROTOCOL -2 +#define XEN_PLATFORM_ERR_BLACKLIST -3 + +/* store the value of xen_emul_unplug after the unplug is done */ +int xen_platform_pci_unplug; +EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); +#ifdef CONFIG_XEN_PVHVM +static int xen_emul_unplug; + +static int __init check_platform_magic(void) +{ + short magic; + char protocol; + + magic = inw(XEN_IOPORT_MAGIC); + if (magic != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n"); + return XEN_PLATFORM_ERR_MAGIC; + } + + protocol = inb(XEN_IOPORT_PROTOVER); + + printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n", + protocol); + + switch (protocol) { + case 1: + outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); + outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); + if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform: blacklisted by host\n"); + return XEN_PLATFORM_ERR_BLACKLIST; + } + break; + default: + printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version"); + return XEN_PLATFORM_ERR_PROTOCOL; + } + + return 0; +} + +void __init xen_unplug_emulated_devices(void) +{ + int r; + + /* user explicitly requested no unplug */ + if (xen_emul_unplug & XEN_UNPLUG_NEVER) + return; + /* check the version of the xen platform PCI device */ + r = check_platform_magic(); + /* If the version matches enable the Xen platform PCI driver. + * Also enable the Xen platform PCI driver if the host does + * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC) + * but the user told us that unplugging is unnecessary. */ + if (r && !(r == XEN_PLATFORM_ERR_MAGIC && + (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))) + return; + /* Set the default value of xen_emul_unplug depending on whether or + * not the Xen PV frontends and the Xen platform PCI driver have + * been compiled for this kernel (modules or built-in are both OK). */ + if (!xen_emul_unplug) { + if (xen_must_unplug_nics()) { + printk(KERN_INFO "Netfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated NICs.\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + } + if (xen_must_unplug_disks()) { + printk(KERN_INFO "Blkfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated disks.\n" + "You might have to change the root device\n" + "from /dev/hd[a-d] to /dev/xvd[a-d]\n" + "in your root= kernel command line option\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + } + } + /* Now unplug the emulated devices */ + if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)) + outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); + xen_platform_pci_unplug = xen_emul_unplug; +} + +static int __init parse_xen_emul_unplug(char *arg) +{ + char *p, *q; + int l; + + for (p = arg; p; p = q) { + q = strchr(p, ','); + if (q) { + l = q - p; + q++; + } else { + l = strlen(p); + } + if (!strncmp(p, "all", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL; + else if (!strncmp(p, "ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + else if (!strncmp(p, "aux-ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; + else if (!strncmp(p, "nics", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + else if (!strncmp(p, "unnecessary", l)) + xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY; + else if (!strncmp(p, "never", l)) + xen_emul_unplug |= XEN_UNPLUG_NEVER; + else + printk(KERN_WARNING "unrecognised option '%s' " + "in parameter 'xen_emul_unplug'\n", p); + } + return 0; +} +early_param("xen_emul_unplug", parse_xen_emul_unplug); +#endif diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ad0047f47cd..328b0030542 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -20,6 +20,7 @@ #include <xen/page.h> #include <xen/interface/callback.h> #include <xen/interface/physdev.h> +#include <xen/interface/memory.h> #include <xen/features.h> #include "xen-ops.h" @@ -32,6 +33,73 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +static unsigned long __init xen_release_chunk(phys_addr_t start_addr, + phys_addr_t end_addr) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long start, end; + unsigned long len = 0; + unsigned long pfn; + int ret; + + start = PFN_UP(start_addr); + end = PFN_DOWN(end_addr); + + if (end <= start) + return 0; + + printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", + start, end); + for(pfn = start; pfn < end; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + + set_xen_guest_handle(reservation.extent_start, &mfn); + reservation.nr_extents = 1; + + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", + start, end, ret); + if (ret == 1) { + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + len++; + } + } + printk(KERN_CONT "%ld pages freed\n", len); + + return len; +} + +static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, + const struct e820map *e820) +{ + phys_addr_t max_addr = PFN_PHYS(max_pfn); + phys_addr_t last_end = 0; + unsigned long released = 0; + int i; + + for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { + phys_addr_t end = e820->map[i].addr; + end = min(max_addr, end); + + released += xen_release_chunk(last_end, end); + last_end = e820->map[i].addr + e820->map[i].size; + } + + if (last_end < max_addr) + released += xen_release_chunk(last_end, max_addr); + + printk(KERN_INFO "released %ld pages of unused memory\n", released); + return released; +} /** * machine_specific_memory_setup - Hook for machine specific memory setup. @@ -67,6 +135,8 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + xen_return_unused_memory(xen_start_info->nr_pages, &e820); + return "Xen"; } @@ -156,6 +226,8 @@ void __init xen_arch_setup(void) struct physdev_set_iopl set_iopl; int rc; + xen_panic_handler_init(); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a29693fd313..25f232b18a8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -394,6 +394,8 @@ static void stop_self(void *v) load_cr3(swapper_pg_dir); /* should set up a minimal gdt */ + set_cpu_online(cpu, false); + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); BUG(); } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index a9c66110803..1d789d56877 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -26,6 +26,18 @@ void xen_pre_suspend(void) BUG(); } +void xen_hvm_post_suspend(int suspend_cancelled) +{ + int cpu; + xen_hvm_init_shared_info(); + xen_callback_vector(); + if (xen_feature(XENFEAT_hvm_safe_pvclock)) { + for_each_online_cpu(cpu) { + xen_setup_runstate_info(cpu); + } + } +} + void xen_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3c6c59ed30..1a5353a753f 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -20,6 +20,7 @@ #include <asm/xen/hypercall.h> #include <xen/events.h> +#include <xen/features.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> @@ -155,47 +156,8 @@ static void do_stolen_accounting(void) account_idle_ticks(ticks); } -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. - */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - /* Get the TSC speed from Xen */ -unsigned long xen_tsc_khz(void) +static unsigned long xen_tsc_khz(void) { struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; @@ -230,7 +192,7 @@ static void xen_read_wallclock(struct timespec *ts) put_cpu_var(xen_vcpu); } -unsigned long xen_get_wallclock(void) +static unsigned long xen_get_wallclock(void) { struct timespec ts; @@ -238,7 +200,7 @@ unsigned long xen_get_wallclock(void) return ts.tv_sec; } -int xen_set_wallclock(unsigned long now) +static int xen_set_wallclock(unsigned long now) { /* do nothing for domU */ return -1; @@ -473,7 +435,11 @@ void xen_timer_resume(void) } } -__init void xen_time_init(void) +static const struct pv_time_ops xen_time_ops __initdata = { + .sched_clock = xen_clocksource_read, +}; + +static __init void xen_time_init(void) { int cpu = smp_processor_id(); struct timespec tp; @@ -497,3 +463,47 @@ __init void xen_time_init(void) xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } + +__init void xen_init_time_ops(void) +{ + pv_time_ops = xen_time_ops; + + x86_init.timers.timer_init = xen_time_init; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} + +#ifdef CONFIG_XEN_PVHVM +static void xen_hvm_setup_cpu_clockevents(void) +{ + int cpu = smp_processor_id(); + xen_setup_runstate_info(cpu); + xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); +} + +__init void xen_hvm_init_time_ops(void) +{ + /* vector callback is needed otherwise we cannot receive interrupts + * on cpu > 0 */ + if (!xen_have_vector_callback && num_present_cpus() > 1) + return; + if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { + printk(KERN_INFO "Xen doesn't support pvclock on HVM," + "disable pv timer\n"); + return; + } + + pv_time_ops = xen_time_ops; + x86_init.timers.setup_percpu_clockev = xen_time_init; + x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} +#endif diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f9153a300bc..7c8ab86163e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -38,6 +38,10 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); +void xen_hvm_init_shared_info(void); +void __init xen_unplug_emulated_devices(void); + void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); @@ -46,11 +50,8 @@ void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); -unsigned long xen_tsc_khz(void); -void __init xen_time_init(void); -unsigned long xen_get_wallclock(void); -int xen_set_wallclock(unsigned long time); -unsigned long long xen_sched_clock(void); +void __init xen_init_time_ops(void); +void __init xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); @@ -101,4 +102,6 @@ void xen_sysret32(void); void xen_sysret64(void); void xen_adjust_exception_frame(void); +extern int xen_panic_handler_init(void); + #endif /* XEN_OPS_H */ |