diff options
71 files changed, 1914 insertions, 519 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 2c994837946..bf33aaa4c59 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1946,6 +1946,40 @@ the guest using the specified gsi pin. The irqfd is removed using the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd and kvm_irqfd.gsi. +4.76 KVM_PPC_ALLOCATE_HTAB + +Capability: KVM_CAP_PPC_ALLOC_HTAB +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to u32 containing hash table order (in/out) +Returns: 0 on success, -1 on error + +This requests the host kernel to allocate an MMU hash table for a +guest using the PAPR paravirtualization interface. This only does +anything if the kernel is configured to use the Book 3S HV style of +virtualization. Otherwise the capability doesn't exist and the ioctl +returns an ENOTTY error. The rest of this description assumes Book 3S +HV. + +There must be no vcpus running when this ioctl is called; if there +are, it will do nothing and return an EBUSY error. + +The parameter is a pointer to a 32-bit unsigned integer variable +containing the order (log base 2) of the desired size of the hash +table, which must be between 18 and 46. On successful return from the +ioctl, it will have been updated with the order of the hash table that +was allocated. + +If no hash table has been allocated when any vcpu is asked to run +(with the KVM_RUN ioctl), the host kernel will allocate a +default-sized hash table (16 MB). + +If this ioctl is called when a hash table has already been allocated, +the kernel will clear out the existing hash table (zero all HPTEs) and +return the hash table order in the parameter. (If the guest is using +the virtualized real-mode area (VRMA) facility, the kernel will +re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) + 5. The kvm_run structure ------------------------ diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 3b4cd3bf563..41b7ac9884b 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -6,7 +6,129 @@ KVM Lock Overview (to be written) -2. Reference +2: Exception +------------ + +Fast page fault: + +Fast page fault is the fast path which fixes the guest page fault out of +the mmu-lock on x86. Currently, the page fault can be fast only if the +shadow page table is present and it is caused by write-protect, that means +we just need change the W bit of the spte. + +What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and +SPTE_MMU_WRITEABLE bit on the spte: +- SPTE_HOST_WRITEABLE means the gfn is writable on host. +- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when + the gfn is writable on guest mmu and it is not write-protected by shadow + page write-protection. + +On fast page fault path, we will use cmpxchg to atomically set the spte W +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this +is safe because whenever changing these bits can be detected by cmpxchg. + +But we need carefully check these cases: +1): The mapping from gfn to pfn +The mapping from gfn to pfn may be changed since we can only ensure the pfn +is not changed during cmpxchg. This is a ABA problem, for example, below case +will happen: + +At the beginning: +gpte = gfn1 +gfn1 is mapped to pfn1 on host +spte is the shadow page table entry corresponding with gpte and +spte = pfn1 + + VCPU 0 VCPU0 +on fast page fault path: + + old_spte = *spte; + pfn1 is swapped out: + spte = 0; + + pfn1 is re-alloced for gfn2. + + gpte is changed to point to + gfn2 by the guest: + spte = pfn1; + + if (cmpxchg(spte, old_spte, old_spte+W) + mark_page_dirty(vcpu->kvm, gfn1) + OOPS!!! + +We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. + +For direct sp, we can easily avoid it since the spte of direct sp is fixed +to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() +to pin gfn to pfn, because after gfn_to_pfn_atomic(): +- We have held the refcount of pfn that means the pfn can not be freed and + be reused for another gfn. +- The pfn is writable that means it can not be shared between different gfns + by KSM. + +Then, we can ensure the dirty bitmaps is correctly set for a gfn. + +Currently, to simplify the whole things, we disable fast page fault for +indirect shadow page. + +2): Dirty bit tracking +In the origin code, the spte can be fast updated (non-atomically) if the +spte is read-only and the Accessed bit has already been set since the +Accessed bit and Dirty bit can not be lost. + +But it is not true after fast page fault since the spte can be marked +writable between reading spte and updating spte. Like below case: + +At the beginning: +spte.W = 0 +spte.Accessed = 1 + + VCPU 0 VCPU0 +In mmu_spte_clear_track_bits(): + + old_spte = *spte; + + /* 'if' condition is satisfied. */ + if (old_spte.Accssed == 1 && + old_spte.W == 0) + spte = 0ull; + on fast page fault path: + spte.W = 1 + memory write on the spte: + spte.Dirty = 1 + + + else + old_spte = xchg(spte, 0ull) + + + if (old_spte.Accssed == 1) + kvm_set_pfn_accessed(spte.pfn); + if (old_spte.Dirty == 1) + kvm_set_pfn_dirty(spte.pfn); + OOPS!!! + +The Dirty bit is lost in this case. + +In order to avoid this kind of issue, we always treat the spte as "volatile" +if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, +the spte is always atomicly updated in this case. + +3): flush tlbs due to spte updated +If the spte is updated from writable to readonly, we should flush all TLBs, +otherwise rmap_write_protect will find a read-only spte, even though the +writable spte might be cached on a CPU's TLB. + +As mentioned before, the spte can be updated to writable out of mmu-lock on +fast page fault path, in order to easily audit the path, we see if TLBs need +be flushed caused by this reason in mmu_spte_update() since this is a common +function to update spte (present -> present). + +Since the spte is "volatile" if it can be updated out of mmu-lock, we always +atomicly update the spte, the race caused by fast page fault can be avoided, +See the comments in spte_has_volatile_bits() and mmu_spte_update(). + +3. Reference ------------ Name: kvm_lock @@ -23,3 +145,9 @@ Arch: x86 Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} - tsc offset in vmcb Comment: 'raw' because updating the tsc offsets must not be preempted. + +Name: kvm->mmu_lock +Type: spinlock_t +Arch: any +Protects: -shadow page/shadow tlb entry +Comment: it is a spinlock since it is used in mmu notifier. diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 96b41bd9752..73047104858 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -223,3 +223,36 @@ MSR_KVM_STEAL_TIME: 0x4b564d03 steal: the amount of time in which this vCPU did not run, in nanoseconds. Time during which the vcpu is idle, will not be reported as steal time. + +MSR_KVM_EOI_EN: 0x4b564d04 + data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 + when disabled. Bit 1 is reserved and must be zero. When PV end of + interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned + physical address of a 4 byte memory area which must be in guest RAM and + must be zeroed. + + The first, least significant bit of 4 byte memory location will be + written to by the hypervisor, typically at the time of interrupt + injection. Value of 1 means that guest can skip writing EOI to the apic + (using MSR or MMIO write); instead, it is sufficient to signal + EOI by clearing the bit in guest memory - this location will + later be polled by the hypervisor. + Value of 0 means that the EOI write is required. + + It is always safe for the guest to ignore the optimization and perform + the APIC EOI write anyway. + + Hypervisor is guaranteed to only modify this least + significant bit while in the current VCPU context, this means that + guest does not need to use either lock prefix or memory ordering + primitives to synchronise with the hypervisor. + + However, hypervisor can set and clear this memory bit at any time: + therefore to make sure hypervisor does not interrupt the + guest and clear the least significant bit in the memory area + in the window between guest testing it to detect + whether it can skip EOI apic write and between guest + clearing it to signal EOI to the hypervisor, + guest must both read the least significant bit in the memory area and + clear it using a single CPU instruction, such as test and clear, or + compare and exchange. diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 6e7c3705093..4911cf95c67 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt @@ -109,8 +109,6 @@ The following bits are safe to be set inside the guest: MSR_EE MSR_RI - MSR_CR - MSR_ME If any other bit changes in the MSR, please still use mtmsr(d). diff --git a/MAINTAINERS b/MAINTAINERS index 7316ab62e5a..cda045337a9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4002,8 +4002,8 @@ F: arch/ia64/include/asm/kvm* F: arch/ia64/kvm/ KERNEL VIRTUAL MACHINE for s390 (KVM/s390) -M: Carsten Otte <cotte@de.ibm.com> M: Christian Borntraeger <borntraeger@de.ibm.com> +M: Cornelia Huck <cornelia.huck@de.ibm.com> M: linux390@de.ibm.com L: linux-s390@vger.kernel.org W: http://www.ibm.com/developerworks/linux/linux390/ diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index b9f82c84f09..ec6c6b30123 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -26,6 +26,7 @@ /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_DEVICE_ASSIGNMENT /* Architectural interrupt line count. */ diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 9806e55f91b..df5351e3eed 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -19,6 +19,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" + depends on BROKEN depends on HAVE_KVM && MODULES && EXPERIMENTAL # for device assignment: depends on PCI diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index 976835d8f22..bf2c06c3387 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -153,6 +153,8 @@ #define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5" #define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4" +extern bool epapr_paravirt_enabled; +extern u32 epapr_hypercall_start[]; /* * We use "uintptr_t" to define a register because it's guaranteed to be a diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 0554ab062bd..e45c4947a77 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -34,6 +34,8 @@ extern void __replay_interrupt(unsigned int vector); extern void timer_interrupt(struct pt_regs *); extern void performance_monitor_exception(struct pt_regs *regs); +extern void WatchdogException(struct pt_regs *regs); +extern void unknown_exception(struct pt_regs *regs); #ifdef CONFIG_PPC64 #include <asm/paca.h> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index b0c08b14277..0dd1d86d3e3 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -36,11 +36,8 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #define SPAPR_TCE_SHIFT 12 #ifdef CONFIG_KVM_BOOK3S_64_HV -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) +#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ +extern int kvm_hpt_order; /* order of preallocated HPTs */ #endif #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d848cdc4971..50ea12fd7bf 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -237,6 +237,10 @@ struct kvm_arch { unsigned long vrma_slb_v; int rma_setup_done; int using_mmu_notifiers; + u32 hpt_order; + atomic_t vcpus_running; + unsigned long hpt_npte; + unsigned long hpt_mask; spinlock_t slot_phys_lock; unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; int slot_npages[KVM_MEM_SLOTS_NUM]; @@ -414,7 +418,9 @@ struct kvm_vcpu_arch { ulong mcsrr1; ulong mcsr; u32 dec; +#ifdef CONFIG_BOOKE u32 decar; +#endif u32 tbl; u32 tbu; u32 tcr; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index f68c22fa2fc..0124937a23b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -119,7 +119,8 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); -extern long kvmppc_alloc_hpt(struct kvm *kvm); +extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); +extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); extern void kvmppc_free_hpt(struct kvm *kvm); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 83afacd3ba7..bb282dd8161 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -128,6 +128,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),) obj-y += ppc_save_regs.o endif +obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o # Disable GCOV in odd or sensitive code diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S new file mode 100644 index 00000000000..697b390ebfd --- /dev/null +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2012 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/threads.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +/* Hypercall entry point. Will be patched with device tree instructions. */ +.global epapr_hypercall_start +epapr_hypercall_start: + li r3, -1 + nop + nop + nop + blr diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c new file mode 100644 index 00000000000..028aeae370b --- /dev/null +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -0,0 +1,52 @@ +/* + * ePAPR para-virtualization support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) 2012 Freescale Semiconductor, Inc. + */ + +#include <linux/of.h> +#include <asm/epapr_hcalls.h> +#include <asm/cacheflush.h> +#include <asm/code-patching.h> + +bool epapr_paravirt_enabled; + +static int __init epapr_paravirt_init(void) +{ + struct device_node *hyper_node; + const u32 *insts; + int len, i; + + hyper_node = of_find_node_by_path("/hypervisor"); + if (!hyper_node) + return -ENODEV; + + insts = of_get_property(hyper_node, "hcall-instructions", &len); + if (!insts) + return -ENODEV; + + if (len % 4 || len > (4 * 4)) + return -ENODEV; + + for (i = 0; i < (len / 4); i++) + patch_instruction(epapr_hypercall_start + i, insts[i]); + + epapr_paravirt_enabled = true; + + return 0; +} + +early_initcall(epapr_paravirt_init); diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 02c167db6ba..867db1de894 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -31,6 +31,7 @@ #include <asm/cacheflush.h> #include <asm/disassemble.h> #include <asm/ppc-opcode.h> +#include <asm/epapr_hcalls.h> #define KVM_MAGIC_PAGE (-4096L) #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) @@ -726,7 +727,7 @@ unsigned long kvm_hypercall( |