aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-07-24 09:07:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-24 09:07:03 -0700
commit5fabc487c96819dd12ddb9414835d170fd9cd6d5 (patch)
tree01532d492e5074b0d3add29bf92ebf9a9d161e9e /arch/x86
parentc61264f98c1a974ee6f545f61a4ab33b141d6bda (diff)
parent3f68b0318bbbd61bf08478ab99a149f0d9e5156e (diff)
Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits) KVM: IOMMU: Disable device assignment without interrupt remapping KVM: MMU: trace mmio page fault KVM: MMU: mmio page fault support KVM: MMU: reorganize struct kvm_shadow_walk_iterator KVM: MMU: lockless walking shadow page table KVM: MMU: do not need atomicly to set/clear spte KVM: MMU: introduce the rules to modify shadow page table KVM: MMU: abstract some functions to handle fault pfn KVM: MMU: filter out the mmio pfn from the fault pfn KVM: MMU: remove bypass_guest_pf KVM: MMU: split kvm_mmu_free_page KVM: MMU: count used shadow pages on prepareing path KVM: MMU: rename 'pt_write' to 'emulate' KVM: MMU: cleanup for FNAME(fetch) KVM: MMU: optimize to handle dirty bit KVM: MMU: cache mmio info on page fault path KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code KVM: MMU: do not update slot bitmap if spte is nonpresent KVM: MMU: fix walking shadow page table KVM guest: KVM Steal time registration ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig12
-rw-r--r--arch/x86/include/asm/kvm_emulate.h52
-rw-r--r--arch/x86/include/asm/kvm_host.h46
-rw-r--r--arch/x86/include/asm/kvm_para.h20
-rw-r--r--arch/x86/include/asm/msr-index.h12
-rw-r--r--arch/x86/include/asm/paravirt.h9
-rw-r--r--arch/x86/include/asm/paravirt_types.h1
-rw-r--r--arch/x86/include/asm/processor-flags.h1
-rw-r--r--arch/x86/include/asm/vmx.h43
-rw-r--r--arch/x86/kernel/kvm.c72
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/paravirt.c9
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/emulate.c1749
-rw-r--r--arch/x86/kvm/mmu.c1226
-rw-r--r--arch/x86/kvm/mmu.h25
-rw-r--r--arch/x86/kvm/mmu_audit.c12
-rw-r--r--arch/x86/kvm/mmutrace.h48
-rw-r--r--arch/x86/kvm/paging_tmpl.h258
-rw-r--r--arch/x86/kvm/svm.c6
-rw-r--r--arch/x86/kvm/trace.h31
-rw-r--r--arch/x86/kvm/vmx.c2784
-rw-r--r--arch/x86/kvm/x86.c374
-rw-r--r--arch/x86/kvm/x86.h44
24 files changed, 4982 insertions, 1855 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b2127544fbe..a67e014e4e4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -529,6 +529,18 @@ menuconfig PARAVIRT_GUEST
if PARAVIRT_GUEST
+config PARAVIRT_TIME_ACCOUNTING
+ bool "Paravirtual steal time accounting"
+ select PARAVIRT
+ default n
+ ---help---
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
source "arch/x86/xen/Kconfig"
config KVM_CLOCK
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0049211959c..6040d115ef5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -229,7 +229,26 @@ struct read_cache {
unsigned long end;
};
-struct decode_cache {
+struct x86_emulate_ctxt {
+ struct x86_emulate_ops *ops;
+
+ /* Register state before/after emulation. */
+ unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ int mode;
+
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
+ bool guest_mode; /* guest running a nested guest */
+ bool perm_ok; /* do not check permissions if true */
+ bool only_vendor_specific_insn;
+
+ bool have_exception;
+ struct x86_exception exception;
+
+ /* decode cache */
u8 twobyte;
u8 b;
u8 intercept;
@@ -246,8 +265,6 @@ struct decode_cache {
unsigned int d;
int (*execute)(struct x86_emulate_ctxt *ctxt);
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
- unsigned long regs[NR_VCPU_REGS];
- unsigned long eip;
/* modrm */
u8 modrm;
u8 modrm_mod;
@@ -255,34 +272,14 @@ struct decode_cache {
u8 modrm_rm;
u8 modrm_seg;
bool rip_relative;
+ unsigned long _eip;
+ /* Fields above regs are cleared together. */
+ unsigned long regs[NR_VCPU_REGS];
struct fetch_cache fetch;
struct read_cache io_read;
struct read_cache mem_read;
};
-struct x86_emulate_ctxt {
- struct x86_emulate_ops *ops;
-
- /* Register state before/after emulation. */
- unsigned long eflags;
- unsigned long eip; /* eip before instruction emulation */
- /* Emulated execution mode, represented by an X86EMUL_MODE value. */
- int mode;
-
- /* interruptibility state, as a result of execution of STI or MOV SS */
- int interruptibility;
-
- bool guest_mode; /* guest running a nested guest */
- bool perm_ok; /* do not check permissions if true */
- bool only_vendor_specific_insn;
-
- bool have_exception;
- struct x86_exception exception;
-
- /* decode cache */
- struct decode_cache decode;
-};
-
/* Repeat String Operation Prefix */
#define REPE_PREFIX 0xf3
#define REPNE_PREFIX 0xf2
@@ -373,6 +370,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, int reason,
bool has_error_code, u32 error_code);
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int irq);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2ac8e2ee89..dd51c83aa5d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,7 +48,7 @@
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXSAVE \
+ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -205,6 +205,7 @@ union kvm_mmu_page_role {
unsigned invalid:1;
unsigned nxe:1;
unsigned cr0_wp:1;
+ unsigned smep_andnot_wp:1;
};
};
@@ -227,15 +228,17 @@ struct kvm_mmu_page {
* in this shadow page.
*/
DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- bool multimapped; /* More than one parent_pte? */
bool unsync;
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
- union {
- u64 *parent_pte; /* !multimapped */
- struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
- };
+ unsigned long parent_ptes; /* Reverse mapping for parent_pte */
DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+ int clear_spte_count;
+#endif
+
+ struct rcu_head rcu;
};
struct kvm_pv_mmu_op_buffer {
@@ -269,8 +272,6 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
- void (*prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -346,8 +347,7 @@ struct kvm_vcpu_arch {
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
- struct kvm_mmu_memory_cache mmu_pte_chain_cache;
- struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+ struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
struct kvm_mmu_memory_cache mmu_page_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
@@ -393,6 +393,15 @@ struct kvm_vcpu_arch {
unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
+
+ struct {
+ u64 msr_val;
+ u64 last_steal;
+ u64 accum_steal;
+ struct gfn_to_hva_cache stime;
+ struct kvm_steal_time steal;
+ } st;
+
u64 last_guest_tsc;
u64 last_kernel_ns;
u64 last_tsc_nsec;
@@ -419,6 +428,11 @@ struct kvm_vcpu_arch {
u64 mcg_ctl;
u64 *mce_banks;
+ /* Cache MMIO info */
+ u64 mmio_gva;
+ unsigned access;
+ gfn_t mmio_gfn;
+
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
@@ -441,6 +455,7 @@ struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
+ unsigned int indirect_shadow_pages;
atomic_t invlpg_counter;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
@@ -477,6 +492,8 @@ struct kvm_arch {
u64 hv_guest_os_id;
u64 hv_hypercall;
+ atomic_t reader_counter;
+
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
@@ -559,7 +576,7 @@ struct kvm_x86_ops {
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -636,7 +653,6 @@ void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@@ -830,11 +846,12 @@ enum {
asmlinkage void kvm_spurious_fault(void);
extern bool kvm_rebooting;
-#define __kvm_handle_fault_on_reboot(insn) \
+#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
"666: " insn "\n\t" \
"668: \n\t" \
".pushsection .fixup, \"ax\" \n" \
"667: \n\t" \
+ cleanup_insn "\n\t" \
"cmpb $0, kvm_rebooting \n\t" \
"jne 668b \n\t" \
__ASM_SIZE(push) " $666b \n\t" \
@@ -844,6 +861,9 @@ extern bool kvm_rebooting;
_ASM_PTR " 666b, 667b \n\t" \
".popsection"
+#define __kvm_handle_fault_on_reboot(insn) \
+ ____kvm_handle_fault_on_reboot(insn, "")
+
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index a427bf77a93..734c3767cfa 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,7 @@
*/
#define KVM_FEATURE_CLOCKSOURCE2 3
#define KVM_FEATURE_ASYNC_PF 4
+#define KVM_FEATURE_STEAL_TIME 5
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -30,10 +31,23 @@
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
+#define KVM_MSR_ENABLED 1
/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+#define MSR_KVM_STEAL_TIME 0x4b564d03
+
+struct kvm_steal_time {
+ __u64 steal;
+ __u32 version;
+ __u32 flags;
+ __u32 pad[12];
+};
+
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
#define KVM_MAX_MMU_OP_BATCH 32
@@ -178,6 +192,7 @@ void __init kvm_guest_init(void);
void kvm_async_pf_task_wait(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
+extern void kvm_disable_steal_time(void);
#else
#define kvm_guest_init() do { } while (0)
#define kvm_async_pf_task_wait(T) do {} while(0)
@@ -186,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void)
{
return 0;
}
+
+static inline void kvm_disable_steal_time(void)
+{
+ return;
+}
#endif
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d96bdb25ca3..d52609aeeab 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -441,6 +441,18 @@
#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT 32
+#define VMX_BASIC_64 0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT 50
+#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB 6LLU
+#define VMX_BASIC_INOUT 0x0040000000000000LLU
/* AMD-V MSRs */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ebbc4d8ab17..a7d2db9a74f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void)
return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+ return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
+}
+
static inline unsigned long long paravirt_read_pmc(int counter)
{
return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 82885099c86..2c765216311 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -89,6 +89,7 @@ struct pv_lazy_ops {
struct pv_time_ops {
unsigned long long (*sched_clock)(void);
+ unsigned long long (*steal_clock)(int cpu);
unsigned long (*get_tsc_khz)(void);
};
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 59ab4dffa37..2dddb317bb3 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -59,6 +59,7 @@
#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
+#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 84471b81046..2caf290e989 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -132,6 +132,8 @@ enum vmcs_field {
GUEST_IA32_PAT_HIGH = 0x00002805,
GUEST_IA32_EFER = 0x00002806,
GUEST_IA32_EFER_HIGH = 0x00002807,
+ GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
GUEST_PDPTR0 = 0x0000280a,
GUEST_PDPTR0_HIGH = 0x0000280b,
GUEST_PDPTR1 = 0x0000280c,
@@ -144,6 +146,8 @@ enum vmcs_field {
HOST_IA32_PAT_HIGH = 0x00002c01,
HOST_IA32_EFER = 0x00002c02,
HOST_IA32_EFER_HIGH = 0x00002c03,
+ HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
@@ -426,4 +430,43 @@ struct vmx_msr_entry {
u64 value;
} __aligned(16);
+/*
+ * Exit Qualifications for entry failure during or after loading guest state
+ */
+#define ENTRY_FAIL_DEFAULT 0
+#define ENTRY_FAIL_PDPTE 2
+#define ENTRY_FAIL_NMI 3
+#define ENTRY_FAIL_VMCS_LINK_PTR 4
+
+/*
+ * VM-instruction error numbers
+ */
+enum vm_instruction_error_number {
+ VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
+ VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
+ VMXERR_VMCLEAR_VMXON_POINTER = 3,
+ VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
+ VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
+ VMXERR_VMRESUME_AFTER_VMXOFF = 6,
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
+ VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
+ VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
+ VMXERR_VMPTRLD_VMXON_POINTER = 10,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
+ VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
+ VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
+ VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
+ VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
+ VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
+ VMXERR_VMCALL_NONCLEAR_VMCS = 19,
+ VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
+ VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
+ VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
+ VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
+ VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
+ VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
+};
+
#endif
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0b122..a9c2116001d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
early_param("no-kvmapf", parse_no_kvmapf);
+static int steal_acc = 1;
+static int parse_no_stealacc(char *arg)
+{
+ steal_acc = 0;
+ return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
static struct kvm_para_state *kvm_para_state(void)
{
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
#endif
}
+static void kvm_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ memset(st, 0, sizeof(*st));
+
+ wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
+ printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
+ cpu, __pa(st));
+}
+
void __cpuinit kvm_guest_cpu_init(void)
{
if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
printk(KERN_INFO"KVM setup async PF for cpu %d\n",
smp_processor_id());
}
+
+ if (has_steal_clock)
+ kvm_register_steal_time();
}
static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};
+static u64 kvm_steal_clock(int cpu)
+{
+ u64 steal;
+ struct kvm_steal_time *src;
+ int version;
+
+ src = &per_cpu(steal_time, cpu);
+ do {
+ version = src->version;
+ rmb();
+ steal = src->steal;
+ rmb();
+ } while ((version & 1) || (version != src->version));
+
+ return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
static void kvm_guest_cpu_offline(void *dummy)
{
+ kvm_disable_steal_time();
kvm_pv_disable_apf(NULL);
apf_task_wake_all();
}
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ has_steal_clock = 1;
+ pv_time_ops.steal_clock = kvm_steal_clock;
+ }
+
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
kvm_guest_cpu_init();
#endif
}
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ jump_label_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ jump_label_inc(&paravirt_steal_rq_enabled);
+ }
+
+ return 0;
+}
+arch_initcall(activate_jump_labels);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 6389a6bca11..c1a0188e29a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
static void kvm_crash_shutdown(struct pt_regs *regs)
{
native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
native_machine_crash_shutdown(regs);
}
#endif
@@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
static void kvm_shutdown(void)
{
native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
native_machine_shutdown();
}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71..613a7931ecc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
__native_flush_tlb_single(addr);
}
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+ return 0;
+}
+
/* These are in entry.S */
extern void native_iret(void);
extern void native_irq_enable_sysexit(void);
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = {
struct pv_time_ops pv_time_ops = {
.sched_clock = native_sched_clock,
+ .steal_clock = native_steal_clock,
};
struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 65cf8233d25..988724b236b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
+ select TASK_DELAY_ACCT
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index adc98675cda..6f08bc940fa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -407,76 +407,59 @@ struct gprefix {
} \
} while (0)
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
-({ unsigned long _x; \
- rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
- if (rc != X86EMUL_CONTINUE) \
- goto done; \
- (_eip) += (_size); \
- (_type)_x; \
-})
-
-#define insn_fetch_arr(_arr, _size, _eip) \
-({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
- if (rc != X86EMUL_CONTINUE) \
- goto done; \
- (_eip) += (_size); \
-})
-
static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
enum x86_intercept intercept,
enum x86_intercept_stage stage)
{
struct x86_instruction_info info = {
.intercept = intercept,
- .rep_prefix = ctxt->decode.rep_prefix,
- .modrm_mod = ctxt->decode.modrm_mod,
- .modrm_reg = ctxt->decode.modrm_reg,
- .modrm_rm = ctxt->decode.modrm_rm,
- .src_val = ctxt->decode.src.val64,
- .src_bytes = ctxt->decode.src.bytes,
- .dst_bytes = ctxt->decode.dst.bytes,
- .ad_bytes = ctxt->decode.ad_bytes,
+ .rep_prefix = ctxt->rep_prefix,
+ .modrm_mod = ctxt->modrm_mod,
+ .modrm_reg = ctxt->modrm_reg,
+ .modrm_rm = ctxt->modrm_rm,
+ .src_val = ctxt->src.val64,
+ .src_bytes = ctxt->src.bytes,
+ .dst_bytes = ctxt->dst.bytes,
+ .ad_bytes = ctxt->ad_bytes,
.next_rip = ctxt->eip,
};
return ctxt->ops->intercept(ctxt, &info, stage);
}
-static inline unsigned long ad_mask(struct decode_cache *c)
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
{
- return (1UL << (c->ad_bytes << 3)) - 1;
+ return (1UL << (ctxt->ad_bytes << 3)) - 1;
}
/* Access/update address held in a register, based on addressing mode. */
static inline unsigned long
-address_mask(struct decode_cache *c, unsigned long reg)
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
{
- if (c->ad_bytes == sizeof(unsigned long))
+ if (ctxt->ad_bytes == sizeof(unsigned long))
return reg;
else
- return reg & ad_mask(c);
+ return reg & ad_mask(ctxt);
}
static inline unsigned long
-register_address(struct decode_cache *c, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
{
- return address_mask(c, reg);
+ return address_mask(ctxt, reg);
}
static inline void
-register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
{
- if (c->ad_bytes == sizeof(unsigned long))
+ if (ctxt->ad_bytes == sizeof(unsigned long))
*reg += inc;
else
- *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+ *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
}
-static inline void jmp_rel(struct decode_cache *c, int rel)
+static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
{
- register_address_increment(c, &c->eip, rel);
+ register_address_increment(ctxt, &ctxt->_eip, rel);
}
static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
return desc->g ? (limit << 12) | 0xfff : limit;
}
-static void set_seg_override(struct decode_cache *c, int seg)
+static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
{
- c->has_seg_override = true;
- c->seg_override = seg;
+ ctxt->has_seg_override = true;
+ ctxt->seg_override = seg;
}
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
{
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
return 0;
- return ops->get_cached_segment_base(ctxt, seg);
+ return ctxt->ops->get_cached_segment_base(ctxt, seg);
}
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
- struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
{
- if (!c->has_seg_override)
+ if (!ctxt->has_seg_override)
return 0;
- return c->seg_override;
+ return ctxt->seg_override;
}
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
unsigned size, bool write, bool fetch,
ulong *linear)
{
- struct decode_cache *c = &ctxt->decode;
struct desc_struct desc;
bool usable;
ulong la;
@@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
u16 sel;
unsigned cpl, rpl;
- la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+ la = seg_base(ctxt, addr.seg) + addr.ea;
switch (ctxt->mode) {
case X86EMUL_MODE_REAL:
break;
@@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
}
break;
}
- if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
+ if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
la &= (u32)-1;
*linear = la;
return X86EMUL_CONTINUE;
@@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
}
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
+static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
unsigned long eip, u8 *dest)
{
- struct fetch_cache *fc = &ctxt->decode.fetch;
+ struct fetch_cache *fc = &ctxt->fetch;
int rc;
int size, cur_size;
@@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
rc = __linearize(ctxt, addr, size, false, true, &linear);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->fetch(ctxt, linear, fc->data + cur_size,
- size, &ctxt->exception);
+ rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
+ size, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
fc->end += size;
@@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
}
static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
unsigned long eip, void *dest, unsigned size)
{
int rc;
@@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
if (eip + size - ctxt->eip > 15)
return X86EMUL_UNHANDLEABLE;
while (size--) {
- rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+ rc = do_insn_fetch_byte(ctxt, eip++, dest++);
if (rc != X86EMUL_CONTINUE)
return rc;
}
return X86EMUL_CONTINUE;