diff options
author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-18 10:18:39 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-18 10:18:39 -0700 |
commit | 5cc97bf2d8eaa6cab60727c3eba3e85e29062669 (patch) | |
tree | 975976f2cb37b932d65440e6fb9960be93fc0bd7 /arch | |
parent | 826ea8f22cf612d534f33c492c98f7895043bfd1 (diff) | |
parent | dfdcdd42fdf63452ddd1bed6f49ae2a35dfb5d6c (diff) |
Merge branch 'xen-upstream' of ssh://master.kernel.org/pub/scm/linux/kernel/git/jeremy/xen
* 'xen-upstream' of ssh://master.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: (44 commits)
xen: disable all non-virtual drivers
xen: use iret directly when possible
xen: suppress abs symbol warnings for unused reloc pointers
xen: Attempt to patch inline versions of common operations
xen: Place vcpu_info structure into per-cpu memory
xen: handle external requests for shutdown, reboot and sysrq
xen: machine operations
xen: add virtual network device driver
xen: add virtual block device driver.
xen: add the Xenbus sysfs and virtual device hotplug driver
xen: Add grant table support
xen: use the hvc console infrastructure for Xen console
xen: hack to prevent bad segment register reload
xen: lazy-mmu operations
xen: Add support for preemption
xen: SMP guest support
xen: Implement sched_clock
xen: Account for stolen time
xen: ignore RW mapping of RO pages in pagetable_init
xen: Complete pagetable pinning
...
Diffstat (limited to 'arch')
36 files changed, 4381 insertions, 42 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index c7c9c2a15fa..7a11b905ef4 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -222,6 +222,8 @@ config PARAVIRT However, when run without a hypervisor the kernel is theoretically slower. If in doubt, say N. +source "arch/i386/xen/Kconfig" + config VMI bool "VMI Paravirt-ops support" depends on PARAVIRT diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 181cc29a7c4..01f0ff0daaf 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000 mcore-$(CONFIG_X86_ES7000) := mach-default core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/i386/xen/ + # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c index ce4fda261aa..b0e21c3cee5 100644 --- a/arch/i386/boot/compressed/relocs.c +++ b/arch/i386/boot/compressed/relocs.c @@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = { "__kernel_rt_sigreturn", "__kernel_sigreturn", "SYSENTER_RETURN", + "xen_irq_disable_direct_reloc", + "xen_save_fl_direct_reloc", }; static int is_safe_abs_reloc(const char* sym_name) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 27a776c9044..25f7eb51392 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -17,6 +17,8 @@ #include <asm/thread_info.h> #include <asm/elf.h> +#include <xen/interface/xen.h> + #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -59,6 +61,7 @@ void foo(void) OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); + OFFSET(TI_cpu, thread_info, cpu); BLANK(); OFFSET(GDS_size, Xgt_desc_struct, size); @@ -115,4 +118,10 @@ void foo(void) OFFSET(PARAVIRT_iret, paravirt_ops, iret); OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); #endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3c3c220488c..32980b83493 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. */ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + call xen_iret_crit_fixup + +1: mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + lea 16(%esp),%esp + CFI_ADJUST_CFA_OFFSET -16 + jz 5f + addl $16,%esp + jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) +5: pushl $0 # EAX == 0 => Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous +ENDPROC(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ + .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 82714668d43..7c52b222207 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -510,7 +510,8 @@ ENTRY(_stext) /* * BSS section */ -.section ".bss.page_aligned","w" +.section ".bss.page_aligned","wa" + .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) .fill 1024,4,0 ENTRY(swapper_pg_pmd) @@ -538,6 +539,8 @@ fault_msg: .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" .asciz "Stack: %p %p %p %p %p %p %p %p\n" +#include "../xen/xen-head.S" + /* * The IDT and GDT 'descriptors' are a strange 48-bit object * only used by the lidt and lgdt instructions. They are not diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index faab09abca5..53f07a8275e 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -228,6 +228,41 @@ static int __init print_banner(void) } core_initcall(print_banner); +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +static struct resource reserve_iomem = { + .start = 0, + .end = -1, + .name = "paravirt-iomem", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. + */ +int paravirt_disable_iospace(void) +{ + int ret; + + ret = request_resource(&ioport_resource, &reserve_ioports); + if (ret == 0) { + ret = request_resource(&iomem_resource, &reserve_iomem); + if (ret) + release_resource(&reserve_ioports); + } + + return ret; +} + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, @@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = { .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .get_scheduled_cycles = native_read_tsc, + .sched_clock = native_sched_clock, .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 2d61e65eeb5..74871d066c2 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ + paravirt_post_allocator_init(); + dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 6299c080f6e..2d35d850202 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -22,6 +22,7 @@ #include <asm/mtrr.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include <mach_apic.h> /* @@ -249,13 +250,13 @@ static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. * * We need to reload %cr3 since the page tables may be going * away from under us.. */ -static inline void leave_mm (unsigned long cpu) +void leave_mm(unsigned long cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 0b2954534b8..5910d3fac56 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -148,7 +148,7 @@ void __init smp_alloc_memory(void) * a given CPU */ -static void __cpuinit smp_store_cpu_info(int id) +void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; @@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu) /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; -static inline void -set_cpu_sibling_map(int cpu) +void set_cpu_sibling_map(int cpu) { int i; struct cpuinfo_x86 *c = cpu_data; @@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void) } #ifdef CONFIG_HOTPLUG_CPU -static void -remove_siblinginfo(int cpu) +void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = cpu_data; diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index ea63a30ca3e..252f9010f28 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale __read_mostly; +unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ @@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz) cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; } -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - /* * Scheduler clock - returns current time in nanosec units. */ -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long long this_offset; @@ -118,12 +113,24 @@ unsigned long long sched_clock(void) return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); /* read the Time Stamp Counter: */ - get_scheduled_cycles(this_offset); + rdtscll(this_offset); /* return the value in ns */ return cycles_2_ns(this_offset); } +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long sched_clock(void) + __attribute__((alias("native_sched_clock"))); +#endif + unsigned long native_calculate_cpu_khz(void) { unsigned long long start, end; diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index c12720d7cbc..72042bb7ec9 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(u32 pfn) +static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -891,7 +891,7 @@ static inline int __init activate_vmi(void) paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; + paravirt_ops.sched_clock = vmi_sched_clock; paravirt_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c index 26a37f8a876..f9b845f4e69 100644 --- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now) return 0; } -/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ -unsigned long long vmi_get_sched_cycles(void) +/* paravirt_ops.sched_clock = vmi_sched_clock */ +unsigned long long vmi_sched_clock(void) { - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); + return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index aa87b06c7c8..00f1bc47d3a 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -88,6 +88,7 @@ SECTIONS . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) *(.data.idt) } diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S index d4b5be4f3d5..271f16a8ca0 100644 --- a/arch/i386/kernel/vsyscall-note.S +++ b/arch/i386/kernel/vsyscall-note.S @@ -3,23 +3,40 @@ * Here we can supply some information useful to userland. */ -#include <linux/uts.h> #include <linux/version.h> +#include <linux/elfnote.h> -#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ - .section name, flags; \ - .balign 4; \ - .long 1f - 0f; /* name length */ \ - .long 3f - 2f; /* data length */ \ - .long type; /* note type */ \ -0: .asciz vendor; /* vendor name */ \ -1: .balign 4; \ -2: +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END -#define ASM_ELF_NOTE_END \ -3: .balign 4; /* pad out section */ \ - .previous +#ifdef CONFIG_XEN - ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) - .long LINUX_VERSION_CODE - ASM_ELF_NOTE_END +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + */ + +/* Bit used for the pseudo-hwcap for non-negative segments. We use + bit 1 to avoid bugs in some versions of glibc when bit 0 is + used; the choice is otherwise arbitrary. */ +#define VDSO_NOTE_NONEGSEG_BIT 1 + +ELFNOTE_START(GNU, 2, "a") + .long 1, 1<<VDSO_NOTE_NONEGSEG_BIT /* ncaps, mask */ + .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ +ELFNOTE_END +#endif diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c index b4b24e0e45e..f9d59533815 100644 --- a/arch/i386/mach-voyager/voyager_thread.c +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -52,7 +52,7 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, ret); } diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 7135946d366..6a68b1ae061 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -473,6 +473,7 @@ void zap_low_mappings (void) static int disable_nx __initdata = 0; u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); /* * noexec = on|off diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 2eb14a73be9..37992ffb163 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(page_to_pfn(base)); + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, addr == address ? prot : ref_prot)); diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig new file mode 100644 index 00000000000..9df99e1885a --- /dev/null +++ b/arch/i386/xen/Kconfig @@ -0,0 +1,11 @@ +# +# This Kconfig describes xen options +# + +config XEN + bool "Enable support for Xen hypervisor" + depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES + help + This is the Linux Xen port. Enabling this will allow the + kernel to boot in a paravirtualized environment under the + Xen hypervisor. diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile new file mode 100644 index 00000000000..343df246bd3 --- /dev/null +++ b/arch/i386/xen/Makefile @@ -0,0 +1,4 @@ +obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ + events.o time.o manage.o xen-asm.o + +obj-$(CONFIG_SMP) += smp.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c new file mode 100644 index 00000000000..9a8c1181c00 --- /dev/null +++ b/arch/i386/xen/enlighten.c @@ -0,0 +1,1144 @@ +/* + * Core of Xen paravirt_ops implementation. + * + * This file contains the xen_paravirt_ops structure itself, and the + * implementations for: + * - privileged instructions + * - interrupt flags + * - segment operations + * - booting and setup + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/preempt.h> +#include <linux/hardirq.h> +#include <linux/percpu.h> +#include <linux/delay.h> +#include <linux/start_kernel.h> +#include <linux/sched.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/highmem.h> +#include <linux/smp.h> + +#include <xen/interface/xen.h> +#include <xen/interface/physdev.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/sched.h> +#include <xen/features.h> +#include <xen/page.h> + +#include <asm/paravirt.h> +#include <asm/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/fixmap.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/reboot.h> + +#include "xen-ops.h" +#include "mmu.h" +#include "multicalls.h" + +EXPORT_SYMBOL_GPL(hypercall_page); + +DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); +DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU(unsigned long, xen_cr3); + +struct start_info *xen_start_info; +EXPORT_SYMBOL_GPL(xen_start_info); + +static /* __initdata */ struct shared_info dummy_shared_info; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; + +/* + * Flag to determine whether vcpu info placement is available on all + * VCPUs. We assume it is to start with, and then set it to zero on + * the first failure. This is because it can succeed on some VCPUs + * and not others, since it can involve hypervisor memory allocation, + * or because the guest failed to guarantee all the appropriate + * constraints on all VCPUs (ie buffer can't cross a page boundary). + * + * Note that any particular CPU may be using a placed vcpu structure, + * but we can only optimise if the all are. + * + * 0: not available, 1: available + */ +static int have_vcpu_info_placement = 1; + +static void __init xen_vcpu_setup(int cpu) +{ + struct vcpu_register_vcpu_info info; + int err; + struct vcpu_info *vcpup; + + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + + if (!have_vcpu_info_placement) + return; /* already tested, not available */ + + vcpup = &per_cpu(xen_vcpu_info, cpu); + + info.mfn = virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", + cpu, vcpup, info.mfn, info.offset); + + /* Check to see if the hypervisor will put the vcpu_info + structure where we want it, which allows direct access via + a percpu-variable. */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); + + if (err) { + printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); + have_vcpu_info_placement = 0; + } else { + /* This cpu is using the registered vcpu info, even if + later ones fail to. */ + per_cpu(xen_vcpu, cpu) = vcpup; + + printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", + cpu, vcpup); + } +} + +static void __init xen_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); + printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); +} + +static void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + unsigned maskedx = ~0; + + /* + * Mask out inconvenient features, to try and disable as many + * unsupported kernel subsystems as possible. + */ + if (*eax == 1) + maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ + (1 << X86_FEATURE_ACPI) | /* disable ACPI */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); + *edx &= maskedx; +} + +static void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; |