diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-24 09:07:03 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-24 09:07:03 -0700 |
commit | 5fabc487c96819dd12ddb9414835d170fd9cd6d5 (patch) | |
tree | 01532d492e5074b0d3add29bf92ebf9a9d161e9e /arch/x86 | |
parent | c61264f98c1a974ee6f545f61a4ab33b141d6bda (diff) | |
parent | 3f68b0318bbbd61bf08478ab99a149f0d9e5156e (diff) |
Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
KVM: IOMMU: Disable device assignment without interrupt remapping
KVM: MMU: trace mmio page fault
KVM: MMU: mmio page fault support
KVM: MMU: reorganize struct kvm_shadow_walk_iterator
KVM: MMU: lockless walking shadow page table
KVM: MMU: do not need atomicly to set/clear spte
KVM: MMU: introduce the rules to modify shadow page table
KVM: MMU: abstract some functions to handle fault pfn
KVM: MMU: filter out the mmio pfn from the fault pfn
KVM: MMU: remove bypass_guest_pf
KVM: MMU: split kvm_mmu_free_page
KVM: MMU: count used shadow pages on prepareing path
KVM: MMU: rename 'pt_write' to 'emulate'
KVM: MMU: cleanup for FNAME(fetch)
KVM: MMU: optimize to handle dirty bit
KVM: MMU: cache mmio info on page fault path
KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code
KVM: MMU: do not update slot bitmap if spte is nonpresent
KVM: MMU: fix walking shadow page table
KVM guest: KVM Steal time registration
...
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/Kconfig | 12 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_emulate.h | 52 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 46 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_para.h | 20 | ||||
-rw-r--r-- | arch/x86/include/asm/msr-index.h | 12 | ||||
-rw-r--r-- | arch/x86/include/asm/paravirt.h | 9 | ||||
-rw-r--r-- | arch/x86/include/asm/paravirt_types.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/processor-flags.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/vmx.h | 43 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 72 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/paravirt.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 1749 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 1226 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 25 | ||||
-rw-r--r-- | arch/x86/kvm/mmu_audit.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 48 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 258 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 31 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 2784 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 374 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 44 |
24 files changed, 4982 insertions, 1855 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b2127544fbe..a67e014e4e4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -529,6 +529,18 @@ menuconfig PARAVIRT_GUEST if PARAVIRT_GUEST +config PARAVIRT_TIME_ACCOUNTING + bool "Paravirtual steal time accounting" + select PARAVIRT + default n + ---help--- + Select this option to enable fine granularity task steal time + accounting. Time spent executing other tasks in parallel with + the current vCPU is discounted from the vCPU power. To account for + that, there can be a small performance impact. + + If in doubt, say N here. + source "arch/x86/xen/Kconfig" config KVM_CLOCK diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0049211959c..6040d115ef5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -229,7 +229,26 @@ struct read_cache { unsigned long end; }; -struct decode_cache { +struct x86_emulate_ctxt { + struct x86_emulate_ops *ops; + + /* Register state before/after emulation. */ + unsigned long eflags; + unsigned long eip; /* eip before instruction emulation */ + /* Emulated execution mode, represented by an X86EMUL_MODE value. */ + int mode; + + /* interruptibility state, as a result of execution of STI or MOV SS */ + int interruptibility; + + bool guest_mode; /* guest running a nested guest */ + bool perm_ok; /* do not check permissions if true */ + bool only_vendor_specific_insn; + + bool have_exception; + struct x86_exception exception; + + /* decode cache */ u8 twobyte; u8 b; u8 intercept; @@ -246,8 +265,6 @@ struct decode_cache { unsigned int d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; /* modrm */ u8 modrm; u8 modrm_mod; @@ -255,34 +272,14 @@ struct decode_cache { u8 modrm_rm; u8 modrm_seg; bool rip_relative; + unsigned long _eip; + /* Fields above regs are cleared together. */ + unsigned long regs[NR_VCPU_REGS]; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; }; -struct x86_emulate_ctxt { - struct x86_emulate_ops *ops; - - /* Register state before/after emulation. */ - unsigned long eflags; - unsigned long eip; /* eip before instruction emulation */ - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; - - /* interruptibility state, as a result of execution of STI or MOV SS */ - int interruptibility; - - bool guest_mode; /* guest running a nested guest */ - bool perm_ok; /* do not check permissions if true */ - bool only_vendor_specific_insn; - - bool have_exception; - struct x86_exception exception; - - /* decode cache */ - struct decode_cache decode; -}; - /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 @@ -373,6 +370,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, bool has_error_code, u32 error_code); -int emulate_int_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int irq); +int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d2ac8e2ee89..dd51c83aa5d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,7 +48,7 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXSAVE \ + | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -205,6 +205,7 @@ union kvm_mmu_page_role { unsigned invalid:1; unsigned nxe:1; unsigned cr0_wp:1; + unsigned smep_andnot_wp:1; }; }; @@ -227,15 +228,17 @@ struct kvm_mmu_page { * in this shadow page. */ DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); - bool multimapped; /* More than one parent_pte? */ bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; - union { - u64 *parent_pte; /* !multimapped */ - struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ - }; + unsigned long parent_ptes; /* Reverse mapping for parent_pte */ DECLARE_BITMAP(unsync_child_bitmap, 512); + +#ifdef CONFIG_X86_32 + int clear_spte_count; +#endif + + struct rcu_head rcu; }; struct kvm_pv_mmu_op_buffer { @@ -269,8 +272,6 @@ struct kvm_mmu { gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, struct x86_exception *exception); gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); @@ -346,8 +347,7 @@ struct kvm_vcpu_arch { * put it here to avoid allocation */ struct kvm_pv_mmu_op_buffer mmu_op_buffer; - struct kvm_mmu_memory_cache mmu_pte_chain_cache; - struct kvm_mmu_memory_cache mmu_rmap_desc_cache; + struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; @@ -393,6 +393,15 @@ struct kvm_vcpu_arch { unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + + struct { + u64 msr_val; + u64 last_steal; + u64 accum_steal; + struct gfn_to_hva_cache stime; + struct kvm_steal_time steal; + } st; + u64 last_guest_tsc; u64 last_kernel_ns; u64 last_tsc_nsec; @@ -419,6 +428,11 @@ struct kvm_vcpu_arch { u64 mcg_ctl; u64 *mce_banks; + /* Cache MMIO info */ + u64 mmio_gva; + unsigned access; + gfn_t mmio_gfn; + /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; @@ -441,6 +455,7 @@ struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; + unsigned int indirect_shadow_pages; atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* @@ -477,6 +492,8 @@ struct kvm_arch { u64 hv_guest_os_id; u64 hv_hypercall; + atomic_t reader_counter; + #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif @@ -559,7 +576,7 @@ struct kvm_x86_ops { void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -636,7 +653,6 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); @@ -830,11 +846,12 @@ enum { asmlinkage void kvm_spurious_fault(void); extern bool kvm_rebooting; -#define __kvm_handle_fault_on_reboot(insn) \ +#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ "666: " insn "\n\t" \ "668: \n\t" \ ".pushsection .fixup, \"ax\" \n" \ "667: \n\t" \ + cleanup_insn "\n\t" \ "cmpb $0, kvm_rebooting \n\t" \ "jne 668b \n\t" \ __ASM_SIZE(push) " $666b \n\t" \ @@ -844,6 +861,9 @@ extern bool kvm_rebooting; _ASM_PTR " 666b, 667b \n\t" \ ".popsection" +#define __kvm_handle_fault_on_reboot(insn) \ + ____kvm_handle_fault_on_reboot(insn, "") + #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index a427bf77a93..734c3767cfa 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -21,6 +21,7 @@ */ #define KVM_FEATURE_CLOCKSOURCE2 3 #define KVM_FEATURE_ASYNC_PF 4 +#define KVM_FEATURE_STEAL_TIME 5 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -30,10 +31,23 @@ #define MSR_KVM_WALL_CLOCK 0x11 #define MSR_KVM_SYSTEM_TIME 0x12 +#define KVM_MSR_ENABLED 1 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 +#define MSR_KVM_STEAL_TIME 0x4b564d03 + +struct kvm_steal_time { + __u64 steal; + __u32 version; + __u32 flags; + __u32 pad[12]; +}; + +#define KVM_STEAL_ALIGNMENT_BITS 5 +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) #define KVM_MAX_MMU_OP_BATCH 32 @@ -178,6 +192,7 @@ void __init kvm_guest_init(void); void kvm_async_pf_task_wait(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); +extern void kvm_disable_steal_time(void); #else #define kvm_guest_init() do { } while (0) #define kvm_async_pf_task_wait(T) do {} while(0) @@ -186,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void) { return 0; } + +static inline void kvm_disable_steal_time(void) +{ + return; +} #endif #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d96bdb25ca3..d52609aeeab 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -441,6 +441,18 @@ #define MSR_IA32_VMX_VMCS_ENUM 0x0000048a #define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b #define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c +#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d +#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e +#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f +#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 + +/* VMX_BASIC bits and bitmasks */ +#define VMX_BASIC_VMCS_SIZE_SHIFT 32 +#define VMX_BASIC_64 0x0001000000000000LLU +#define VMX_BASIC_MEM_TYPE_SHIFT 50 +#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU +#define VMX_BASIC_MEM_TYPE_WB 6LLU +#define VMX_BASIC_INOUT 0x0040000000000000LLU /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ebbc4d8ab17..a7d2db9a74f 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void) return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } +struct jump_label_key; +extern struct jump_label_key paravirt_steal_enabled; +extern struct jump_label_key paravirt_steal_rq_enabled; + +static inline u64 paravirt_steal_clock(int cpu) +{ + return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu); +} + static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 82885099c86..2c765216311 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -89,6 +89,7 @@ struct pv_lazy_ops { struct pv_time_ops { unsigned long long (*sched_clock)(void); + unsigned long long (*steal_clock)(int cpu); unsigned long (*get_tsc_khz)(void); }; diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 59ab4dffa37..2dddb317bb3 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -59,6 +59,7 @@ #define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ +#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 84471b81046..2caf290e989 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -132,6 +132,8 @@ enum vmcs_field { GUEST_IA32_PAT_HIGH = 0x00002805, GUEST_IA32_EFER = 0x00002806, GUEST_IA32_EFER_HIGH = 0x00002807, + GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, @@ -144,6 +146,8 @@ enum vmcs_field { HOST_IA32_PAT_HIGH = 0x00002c01, HOST_IA32_EFER = 0x00002c02, HOST_IA32_EFER_HIGH = 0x00002c03, + HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -426,4 +430,43 @@ struct vmx_msr_entry { u64 value; } __aligned(16); +/* + * Exit Qualifications for entry failure during or after loading guest state + */ +#define ENTRY_FAIL_DEFAULT 0 +#define ENTRY_FAIL_PDPTE 2 +#define ENTRY_FAIL_NMI 3 +#define ENTRY_FAIL_VMCS_LINK_PTR 4 + +/* + * VM-instruction error numbers + */ +enum vm_instruction_error_number { + VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1, + VMXERR_VMCLEAR_INVALID_ADDRESS = 2, + VMXERR_VMCLEAR_VMXON_POINTER = 3, + VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4, + VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5, + VMXERR_VMRESUME_AFTER_VMXOFF = 6, + VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7, + VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8, + VMXERR_VMPTRLD_INVALID_ADDRESS = 9, + VMXERR_VMPTRLD_VMXON_POINTER = 10, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11, + VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12, + VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13, + VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15, + VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16, + VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17, + VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18, + VMXERR_VMCALL_NONCLEAR_VMCS = 19, + VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20, + VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22, + VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23, + VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24, + VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, +}; + #endif diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33c07b0b122..a9c2116001d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg) early_param("no-kvmapf", parse_no_kvmapf); +static int steal_acc = 1; +static int parse_no_stealacc(char *arg) +{ + steal_acc = 0; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; @@ -58,6 +67,8 @@ struct kvm_para_state { static DEFINE_PER_CPU(struct kvm_para_state, para_state); static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static int has_steal_clock = 0; static struct kvm_para_state *kvm_para_state(void) { @@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void) #endif } +static void kvm_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct kvm_steal_time *st = &per_cpu(steal_time, cpu); + + if (!has_steal_clock) + return; + + memset(st, 0, sizeof(*st)); + + wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); + printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", + cpu, __pa(st)); +} + void __cpuinit kvm_guest_cpu_init(void) { if (!kvm_para_available()) @@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void) printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); } + + if (has_steal_clock) + kvm_register_steal_time(); } static void kvm_pv_disable_apf(void *unused) @@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = { .notifier_call = kvm_pv_reboot_notify, }; +static u64 kvm_steal_clock(int cpu) +{ + u64 steal; + struct kvm_steal_time *src; + int version; + + src = &per_cpu(steal_time, cpu); + do { + version = src->version; + rmb(); + steal = src->steal; + rmb(); + } while ((version & 1) || (version != src->version)); + + return steal; +} + +void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { @@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { + kvm_disable_steal_time(); kvm_pv_disable_apf(NULL); apf_task_wake_all(); } @@ -548,6 +603,11 @@ void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init; + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + has_steal_clock = 1; + pv_time_ops.steal_clock = kvm_steal_clock; + } + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); @@ -555,3 +615,15 @@ void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif } + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + jump_label_inc(¶virt_steal_enabled); + if (steal_acc) + jump_label_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 6389a6bca11..c1a0188e29a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void) static void kvm_crash_shutdown(struct pt_regs *regs) { native_write_msr(msr_kvm_system_time, 0, 0); + kvm_disable_steal_time(); native_machine_crash_shutdown(regs); } #endif @@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs) static void kvm_shutdown(void) { native_write_msr(msr_kvm_system_time, 0, 0); + kvm_disable_steal_time(); native_machine_shutdown(); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1aeeb71..613a7931ecc 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr) __native_flush_tlb_single(addr); } +struct jump_label_key paravirt_steal_enabled; +struct jump_label_key paravirt_steal_rq_enabled; + +static u64 native_steal_clock(int cpu) +{ + return 0; +} + /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); @@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = { struct pv_time_ops pv_time_ops = { .sched_clock = native_sched_clock, + .steal_clock = native_steal_clock, }; struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 65cf8233d25..988724b236b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -31,6 +31,7 @@ config KVM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO + select TASK_DELAY_ACCT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index adc98675cda..6f08bc940fa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -407,76 +407,59 @@ struct gprefix { } \ } while (0) -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ -({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - (_eip) += (_size); \ - (_type)_x; \ -}) - -#define insn_fetch_arr(_arr, _size, _eip) \ -({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - (_eip) += (_size); \ -}) - static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) { struct x86_instruction_info info = { .intercept = intercept, - .rep_prefix = ctxt->decode.rep_prefix, - .modrm_mod = ctxt->decode.modrm_mod, - .modrm_reg = ctxt->decode.modrm_reg, - .modrm_rm = ctxt->decode.modrm_rm, - .src_val = ctxt->decode.src.val64, - .src_bytes = ctxt->decode.src.bytes, - .dst_bytes = ctxt->decode.dst.bytes, - .ad_bytes = ctxt->decode.ad_bytes, + .rep_prefix = ctxt->rep_prefix, + .modrm_mod = ctxt->modrm_mod, + .modrm_reg = ctxt->modrm_reg, + .modrm_rm = ctxt->modrm_rm, + .src_val = ctxt->src.val64, + .src_bytes = ctxt->src.bytes, + .dst_bytes = ctxt->dst.bytes, + .ad_bytes = ctxt->ad_bytes, .next_rip = ctxt->eip, }; return ctxt->ops->intercept(ctxt, &info, stage); } -static inline unsigned long ad_mask(struct decode_cache *c) +static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { - return (1UL << (c->ad_bytes << 3)) - 1; + return (1UL << (ctxt->ad_bytes << 3)) - 1; } /* Access/update address held in a register, based on addressing mode. */ static inline unsigned long -address_mask(struct decode_cache *c, unsigned long reg) +address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) { - if (c->ad_bytes == sizeof(unsigned long)) + if (ctxt->ad_bytes == sizeof(unsigned long)) return reg; else - return reg & ad_mask(c); + return reg & ad_mask(ctxt); } static inline unsigned long -register_address(struct decode_cache *c, unsigned long reg) +register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) { - return address_mask(c, reg); + return address_mask(ctxt, reg); } static inline void -register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) +register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) { - if (c->ad_bytes == sizeof(unsigned long)) + if (ctxt->ad_bytes == sizeof(unsigned long)) *reg += inc; else - *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); + *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); } -static inline void jmp_rel(struct decode_cache *c, int rel) +static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) { - register_address_increment(c, &c->eip, rel); + register_address_increment(ctxt, &ctxt->_eip, rel); } static u32 desc_limit_scaled(struct desc_struct *desc) @@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc) return desc->g ? (limit << 12) | 0xfff : limit; } -static void set_seg_override(struct decode_cache *c, int seg) +static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg) { - c->has_seg_override = true; - c->seg_override = seg; + ctxt->has_seg_override = true; + ctxt->seg_override = seg; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return ops->get_cached_segment_base(ctxt, seg); + return ctxt->ops->get_cached_segment_base(ctxt, seg); } -static unsigned seg_override(struct x86_emulate_ctxt *ctxt, - struct decode_cache *c) +static unsigned seg_override(struct x86_emulate_ctxt *ctxt) { - if (!c->has_seg_override) + if (!ctxt->has_seg_override) return 0; - return c->seg_override; + return ctxt->seg_override; } static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, @@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, unsigned size, bool write, bool fetch, ulong *linear) { - struct decode_cache *c = &ctxt->decode; struct desc_struct desc; bool usable; ulong la; @@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, u16 sel; unsigned cpl, rpl; - la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; + la = seg_base(ctxt, addr.seg) + addr.ea; switch (ctxt->mode) { case X86EMUL_MODE_REAL: break; @@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, } break; } - if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) + if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) la &= (u32)-1; *linear = la; return X86EMUL_CONTINUE; @@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } -static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, +static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, unsigned long eip, u8 *dest) { - struct fetch_cache *fc = &ctxt->decode.fetch; + struct fetch_cache *fc = &ctxt->fetch; int rc; int size, cur_size; @@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, rc = __linearize(ctxt, addr, size, false, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->fetch(ctxt, linear, fc->data + cur_size, - size, &ctxt->exception); + rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, + size, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; fc->end += size; @@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, unsigned long eip, void *dest, unsigned size) { int rc; @@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, if (eip + size - ctxt->eip > 15) return X86EMUL_UNHANDLEABLE; while (size--) { - rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); + rc = do_insn_fetch_byte(ctxt, eip++, dest++); if (rc != X86EMUL_CONTINUE) return rc; } return X86EMUL_CONTINUE; |