diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-07-03 13:21:40 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-07-03 13:21:40 -0700 |
commit | fe489bf4505ae26d3c6d6a1f1d3064c2a9c5cd85 (patch) | |
tree | 46596fd7edf7c4da1dafdb2c62011841e71cf32d | |
parent | 3e34131a65127e73fbae68c82748f32c8af7e4a4 (diff) | |
parent | a3ff5fbc94a829680d4aa005cd17add1c1a1fb5b (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini:
"On the x86 side, there are some optimizations and documentation
updates. The big ARM/KVM change for 3.11, support for AArch64, will
come through Catalin Marinas's tree. s390 and PPC have misc cleanups
and bugfixes"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (87 commits)
KVM: PPC: Ignore PIR writes
KVM: PPC: Book3S PR: Invalidate SLB entries properly
KVM: PPC: Book3S PR: Allow guest to use 1TB segments
KVM: PPC: Book3S PR: Don't keep scanning HPTEG after we find a match
KVM: PPC: Book3S PR: Fix invalidation of SLB entry 0 on guest entry
KVM: PPC: Book3S PR: Fix proto-VSID calculations
KVM: PPC: Guard doorbell exception with CONFIG_PPC_DOORBELL
KVM: Fix RTC interrupt coalescing tracking
kvm: Add a tracepoint write_tsc_offset
KVM: MMU: Inform users of mmio generation wraparound
KVM: MMU: document fast invalidate all mmio sptes
KVM: MMU: document fast invalidate all pages
KVM: MMU: document fast page fault
KVM: MMU: document mmio page fault
KVM: MMU: document write_flooding_count
KVM: MMU: document clear_spte_count
KVM: MMU: drop kvm_mmu_zap_mmio_sptes
KVM: MMU: init kvm generation close to mmio wrap-around value
KVM: MMU: add tracepoint for check_mmio_spte
KVM: MMU: fast invalidate all mmio sptes
...
62 files changed, 1382 insertions, 807 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 9bfadeb8be3..66dd2aa53ba 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2278,7 +2278,7 @@ return indicates the attribute is implemented. It does not necessarily indicate that the attribute can be read or written in the device's current state. "addr" is ignored. -4.77 KVM_ARM_VCPU_INIT +4.82 KVM_ARM_VCPU_INIT Capability: basic Architectures: arm, arm64 @@ -2304,7 +2304,7 @@ Possible features: Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). -4.78 KVM_GET_REG_LIST +4.83 KVM_GET_REG_LIST Capability: basic Architectures: arm, arm64 @@ -2324,7 +2324,7 @@ This ioctl returns the guest registers that are supported for the KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. -4.80 KVM_ARM_SET_DEVICE_ADDR +4.84 KVM_ARM_SET_DEVICE_ADDR Capability: KVM_CAP_ARM_SET_DEVICE_ADDR Architectures: arm, arm64 @@ -2362,7 +2362,7 @@ must be called after calling KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the base addresses will return -EEXIST. -4.82 KVM_PPC_RTAS_DEFINE_TOKEN +4.85 KVM_PPC_RTAS_DEFINE_TOKEN Capability: KVM_CAP_PPC_RTAS Architectures: ppc diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 43fcb761ed1..29089417614 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -191,12 +191,12 @@ Shadow pages contain the following information: A counter keeping track of how many hardware registers (guest cr3 or pdptrs) are now pointing at the page. While this counter is nonzero, the page cannot be destroyed. See role.invalid. - multimapped: - Whether there exist multiple sptes pointing at this page. - parent_pte/parent_ptes: - If multimapped is zero, parent_pte points at the single spte that points at - this page's spt. Otherwise, parent_ptes points at a data structure - with a list of parent_ptes. + parent_ptes: + The reverse mapping for the pte/ptes pointing at this page's spt. If + parent_ptes bit 0 is zero, only one spte points at this pages and + parent_ptes points at this single spte, otherwise, there exists multiple + sptes pointing at this page and (parent_ptes & ~0x1) points at a data + structure with a list of parent_ptes. unsync: If true, then the translations in this page may not match the guest's translation. This is equivalent to the state of the tlb when a pte is @@ -210,6 +210,24 @@ Shadow pages contain the following information: A bitmap indicating which sptes in spt point (directly or indirectly) at pages that may be unsynchronized. Used to quickly locate all unsychronized pages reachable from a given page. + mmu_valid_gen: + Generation number of the page. It is compared with kvm->arch.mmu_valid_gen + during hash table lookup, and used to skip invalidated shadow pages (see + "Zapping all pages" below.) + clear_spte_count: + Only present on 32-bit hosts, where a 64-bit spte cannot be written + atomically. The reader uses this while running out of the MMU lock + to detect in-progress updates and retry them until the writer has + finished the write. + write_flooding_count: + A guest may write to a page table many times, causing a lot of + emulations if the page needs to be write-protected (see "Synchronized + and unsynchronized pages" below). Leaf pages can be unsynchronized + so that they do not trigger frequent emulation, but this is not + possible for non-leafs. This field counts the number of emulations + since the last time the page table was actually used; if emulation + is triggered too frequently on this page, KVM will unmap the page + to avoid emulation in the future. Reverse map =========== @@ -258,14 +276,26 @@ This is the most complicated event. The cause of a page fault can be: Handling a page fault is performed as follows: + - if the RSV bit of the error code is set, the page fault is caused by guest + accessing MMIO and cached MMIO information is available. + - walk shadow page table + - check for valid generation number in the spte (see "Fast invalidation of + MMIO sptes" below) + - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and + vcpu->arch.mmio_gfn, and call the emulator + - If both P bit and R/W bit of error code are set, this could possibly + be handled as a "fast page fault" (fixed without taking the MMU lock). See + the description in Documentation/virtual/kvm/locking.txt. - if needed, walk the guest page tables to determine the guest translation (gva->gpa or ngpa->gpa) - if permissions are insufficient, reflect the fault back to the guest - determine the host page - - if this is an mmio request, there is no host page; call the emulator - to emulate the instruction instead + - if this is an mmio request, there is no host page; cache the info to + vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn - walk the shadow page table to find the spte for the translation, instantiating missing intermediate page tables as necessary + - If this is an mmio request, cache the mmio info to the spte and set some + reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask) - try to unsynchronize the page - if successful, we can let the guest continue and modify the gpte - emulate the instruction @@ -351,6 +381,51 @@ causes its write_count to be incremented, thus preventing instantiation of a large spte. The frames at the end of an unaligned memory slot have artificially inflated ->write_counts so they can never be instantiated. +Zapping all pages (page generation count) +========================================= + +For the large memory guests, walking and zapping all pages is really slow +(because there are a lot of pages), and also blocks memory accesses of +all VCPUs because it needs to hold the MMU lock. + +To make it be more scalable, kvm maintains a global generation number +which is stored in kvm->arch.mmu_valid_gen. Every shadow page stores +the current global generation-number into sp->mmu_valid_gen when it +is created. Pages with a mismatching generation number are "obsolete". + +When KVM need zap all shadow pages sptes, it just simply increases the global +generation-number then reload root shadow pages on all vcpus. As the VCPUs +create new shadow page tables, the old pages are not used because of the +mismatching generation number. + +KVM then walks through all pages and zaps obsolete pages. While the zap +operation needs to take the MMU lock, the lock can be released periodically +so that the VCPUs can make progress. + +Fast invalidation of MMIO sptes +=============================== + +As mentioned in "Reaction to events" above, kvm will cache MMIO +information in leaf sptes. When a new memslot is added or an existing +memslot is changed, this information may become stale and needs to be +invalidated. This also needs to hold the MMU lock while walking all +shadow pages, and is made more scalable with a similar technique. + +MMIO sptes have a few spare bits, which are used to store a +generation number. The global generation number is stored in +kvm_memslots(kvm)->generation, and increased whenever guest memory info +changes. This generation number is distinct from the one described in +the previous section. + +When KVM finds an MMIO spte, it checks the generation number of the spte. +If the generation number of the spte does not equal the global generation +number, it will ignore the cached MMIO information and handle the page +fault through the slow path. + +Since only 19 bits are used to store generation-number on mmio spte, all +pages are zapped when there is an overflow. + + Further reading =============== diff --git a/MAINTAINERS b/MAINTAINERS index c85bf69bb32..60c68fbee64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4733,10 +4733,10 @@ F: arch/s390/kvm/ F: drivers/s390/kvm/ KERNEL VIRTUAL MACHINE (KVM) FOR ARM -M: Christoffer Dall <cdall@cs.columbia.edu> +M: Christoffer Dall <christoffer.dall@linaro.org> L: kvmarm@lists.cs.columbia.edu W: http://systems.cs.columbia.edu/projects/kvm-arm -S: Maintained +S: Supported F: arch/arm/include/uapi/asm/kvm* F: arch/arm/include/asm/kvm* F: arch/arm/kvm/ diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 124623e5ef1..64e96960de2 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -135,7 +135,6 @@ #define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1ULL) #define PTRS_PER_S2_PGD (1ULL << (KVM_PHYS_SHIFT - 30)) #define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t)) -#define S2_PGD_SIZE (1 << S2_PGD_ORDER) /* Virtualization Translation Control Register (VTCR) bits */ #define VTCR_SH0 (3 << 12) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 18d50322a9e..a2f43ddcc30 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -37,16 +37,18 @@ #define c5_AIFSR 15 /* Auxilary Instrunction Fault Status R */ #define c6_DFAR 16 /* Data Fault Address Register */ #define c6_IFAR 17 /* Instruction Fault Address Register */ -#define c9_L2CTLR 18 /* Cortex A15 L2 Control Register */ -#define c10_PRRR 19 /* Primary Region Remap Register */ -#define c10_NMRR 20 /* Normal Memory Remap Register */ -#define c12_VBAR 21 /* Vector Base Address Register */ -#define c13_CID 22 /* Context ID Register */ -#define c13_TID_URW 23 /* Thread ID, User R/W */ -#define c13_TID_URO 24 /* Thread ID, User R/O */ -#define c13_TID_PRIV 25 /* Thread ID, Privileged */ -#define c14_CNTKCTL 26 /* Timer Control Register (PL1) */ -#define NR_CP15_REGS 27 /* Number of regs (incl. invalid) */ +#define c7_PAR 18 /* Physical Address Register */ +#define c7_PAR_high 19 /* PAR top 32 bits */ +#define c9_L2CTLR 20 /* Cortex A15 L2 Control Register */ +#define c10_PRRR 21 /* Primary Region Remap Register */ +#define c10_NMRR 22 /* Normal Memory Remap Register */ +#define c12_VBAR 23 /* Vector Base Address Register */ +#define c13_CID 24 /* Context ID Register */ +#define c13_TID_URW 25 /* Thread ID, User R/W */ +#define c13_TID_URO 26 /* Thread ID, User R/O */ +#define c13_TID_PRIV 27 /* Thread ID, Privileged */ +#define c14_CNTKCTL 28 /* Timer Control Register (PL1) */ +#define NR_CP15_REGS 29 /* Number of regs (incl. invalid) */ #define ARM_EXCEPTION_RESET 0 #define ARM_EXCEPTION_UNDEFINED 1 @@ -72,8 +74,6 @@ extern char __kvm_hyp_vector[]; extern char __kvm_hyp_code_start[]; extern char __kvm_hyp_code_end[]; -extern void __kvm_tlb_flush_vmid(struct kvm *kvm); - extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 82b4babead2..a464e8d7b6c 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -65,11 +65,6 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu) return cpsr_mode > USR_MODE;; } -static inline bool kvm_vcpu_reg_is_pc(struct kvm_vcpu *vcpu, int reg) -{ - return reg == 15; -} - static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu) { return vcpu->arch.fault.hsr; diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 57cb786a620..7d22517d807 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -23,9 +23,14 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> #include <asm/fpstate.h> -#include <asm/kvm_arch_timer.h> +#include <kvm/arm_arch_timer.h> +#if defined(CONFIG_KVM_ARM_MAX_VCPUS) #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS +#else +#define KVM_MAX_VCPUS 0 +#endif + #define KVM_USER_MEM_SLOTS 32 #define KVM_PRIVATE_MEM_SLOTS 4 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -38,7 +43,7 @@ #define KVM_NR_PAGE_SIZES 1 #define KVM_PAGES_PER_HPAGE(x) (1UL<<31) -#include <asm/kvm_vgic.h> +#include <kvm/arm_vgic.h> struct kvm_vcpu; u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); @@ -190,8 +195,8 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); -static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr, - unsigned long long pgd_ptr, +static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, + phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) { diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 370e1a8af6a..ebf5015508b 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -41,9 +41,9 @@ config KVM_ARM_HOST Provides host support for ARM processors. config KVM_ARM_MAX_VCPUS - int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST - default 4 if KVM_ARM_HOST - default 0 + int "Number maximum supported virtual CPUs per VM" + depends on KVM_ARM_HOST + default 4 help Static number of max supported virtual CPUs per VM. @@ -67,6 +67,4 @@ config KVM_ARM_TIMER ---help--- Adds support for the Architected Timers in virtual machines -source drivers/virtio/Kconfig - endif # VIRTUALIZATION diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 53c5ed83d16..d99bee4950e 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -14,10 +14,11 @@ CFLAGS_mmu.o := -I. AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) -kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) +KVM := ../../../virt/kvm +kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o -obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o -obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o +obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o +obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index ef1703b9587..741f66a2edb 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -800,8 +800,8 @@ long kvm_arch_vm_ioctl(struct file *filp, static void cpu_init_hyp_mode(void *dummy) { - unsigned long long boot_pgd_ptr; - unsigned long long pgd_ptr; + phys_addr_t boot_pgd_ptr; + phys_addr_t pgd_ptr; unsigned long hyp_stack_ptr; unsigned long stack_page; unsigned long vector_ptr; @@ -809,8 +809,8 @@ static void cpu_init_hyp_mode(void *dummy) /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); - boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr(); - pgd_ptr = (unsigned long long)kvm_mmu_get_httbr(); + boot_pgd_ptr = kvm_mmu_get_boot_httbr(); + pgd_ptr = kvm_mmu_get_httbr(); stack_page = __get_cpu_var(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; vector_ptr = (unsigned long)__kvm_hyp_vector; diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 8eea97be1ed..4a519907043 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -180,6 +180,10 @@ static const struct coproc_reg cp15_regs[] = { NULL, reset_unknown, c6_DFAR }, { CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32, NULL, reset_unknown, c6_IFAR }, + + /* PAR swapped by interrupt.S */ + { CRn( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR }, + /* * DC{C,I,CI}SW operations: */ diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 3d74a0be47d..df4c82d47ad 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -52,9 +52,6 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { - if (kvm_psci_call(vcpu)) - return 1; - kvm_inject_undefined(vcpu); return 1; } diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index f7793df62f5..16cd4ba5d7f 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -49,6 +49,7 @@ __kvm_hyp_code_start: ENTRY(__kvm_tlb_flush_vmid_ipa) push {r2, r3} + dsb ishst add r0, r0, #KVM_VTTBR ldrd r2, r3, [r0] mcrr p15, 6, r2, r3, c2 @ Write VTTBR @@ -291,6 +292,7 @@ THUMB( orr r2, r2, #PSR_T_BIT ) ldr r2, =BSYM(panic) msr ELR_hyp, r2 ldr r0, =\panic_str + clrex @ Clear exclusive monitor eret .endm @@ -414,6 +416,10 @@ guest_trap: mrcne p15, 4, r2, c6, c0, 4 @ HPFAR bne 3f + /* Preserve PAR */ + mrrc p15, 0, r0, r1, c7 @ PAR + push {r0, r1} + /* Resolve IPA using the xFAR */ mcr p15, 0, r2, c7, c8, 0 @ ATS1CPR isb @@ -424,13 +430,20 @@ guest_trap: lsl r2, r2, #4 orr r2, r2, r1, lsl #24 + /* Restore PAR */ + pop {r0, r1} + mcrr p15, 0, r0, r1, c7 @ PAR + 3: load_vcpu @ Load VCPU pointer to r0 str r2, [r0, #VCPU_HPFAR] 1: mov r1, #ARM_EXCEPTION_HVC b __kvm_vcpu_return -4: pop {r0, r1, r2} @ Failed translation, return to guest +4: pop {r0, r1} @ Failed translation, return to guest + mcrr p15, 0, r0, r1, c7 @ PAR + clrex + pop {r0, r1, r2} eret /* @@ -456,6 +469,7 @@ switch_to_guest_vfp: pop {r3-r7} pop {r0-r2} + clrex eret #endif diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S index d43cfb5b37c..6f18695a09c 100644 --- a/arch/arm/kvm/interrupts_head.S +++ b/arch/arm/kvm/interrupts_head.S @@ -302,11 +302,14 @@ vcpu .req r0 @ vcpu pointer always in r0 .endif mrc p15, 0, r2, c14, c1, 0 @ CNTKCTL + mrrc p15, 0, r4, r5, c7 @ PAR .if \store_to_vcpu == 0 - push {r2} + push {r2,r4-r5} .else str r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)] + add r12, vcpu, #CP15_OFFSET(c7_PAR) + strd r4, r5, [r12] .endif .endm @@ -319,12 +322,15 @@ vcpu .req r0 @ vcpu pointer always in r0 */ .macro write_cp15_state read_from_vcpu .if \read_from_vcpu == 0 - pop {r2} + pop {r2,r4-r5} .else ldr r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)] + add r12, vcpu, #CP15_OFFSET(c7_PAR) + ldrd r4, r5, [r12] .endif mcr p15, 0, r2, c14, c1, 0 @ CNTKCTL + mcrr p15, 0, r4, r5, c7 @ PAR .if \read_from_vcpu == 0 pop {r2-r12} diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 72a12f2171b..b8e06b7a283 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -86,12 +86,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, sign_extend = kvm |