aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-05-21 17:16:21 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2010-05-21 17:16:21 -0700
commit98edb6ca4174f17a64890a02f44c211c8b44fb3c (patch)
tree033bc5f7da410046d28dd1cefcd2d63cda33d25b
parenta8251096b427283c47e7d8f9568be6b388dd68ec (diff)
parent8fbf065d625617bbbf6b72d5f78f84ad13c8b547 (diff)
Merge branch 'kvm-updates/2.6.35' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.35' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (269 commits) KVM: x86: Add missing locking to arch specific vcpu ioctls KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls KVM: MMU: Segregate shadow pages with different cr0.wp KVM: x86: Check LMA bit before set_efer KVM: Don't allow lmsw to clear cr0.pe KVM: Add cpuid.txt file KVM: x86: Tell the guest we'll warn it about tsc stability x86, paravirt: don't compute pvclock adjustments if we trust the tsc x86: KVM guest: Try using new kvm clock msrs KVM: x86: export paravirtual cpuid flags in KVM_GET_SUPPORTED_CPUID KVM: x86: add new KVMCLOCK cpuid feature KVM: x86: change msr numbers for kvmclock x86, paravirt: Add a global synchronization point for pvclock x86, paravirt: Enable pvclock flags in vcpu_time_info structure KVM: x86: Inject #GP with the right rip on efer writes KVM: SVM: Don't allow nested guest to VMMCALL into host KVM: x86: Fix exception reinjection forced to true KVM: Fix wallclock version writing race KVM: MMU: Don't read pdptrs with mmu spinlock held in mmu_alloc_roots KVM: VMX: enable VMXON check with SMX enabled (Intel TXT) ...
-rw-r--r--Documentation/kvm/api.txt208
-rw-r--r--Documentation/kvm/cpuid.txt42
-rw-r--r--Documentation/kvm/mmu.txt304
-rw-r--r--arch/ia64/kvm/kvm-ia64.c8
-rw-r--r--arch/ia64/kvm/vmm.c2
-rw-r--r--arch/powerpc/include/asm/asm-compat.h2
-rw-r--r--arch/powerpc/include/asm/kvm.h10
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h2
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h157
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_32.h42
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h28
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h (renamed from arch/powerpc/include/asm/kvm_book3s_64_asm.h)25
-rw-r--r--arch/powerpc/include/asm/kvm_booke.h96
-rw-r--r--arch/powerpc/include/asm/kvm_fpu.h85
-rw-r--r--arch/powerpc/include/asm/kvm_host.h38
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h97
-rw-r--r--arch/powerpc/include/asm/mmu_context.h2
-rw-r--r--arch/powerpc/include/asm/paca.h10
-rw-r--r--arch/powerpc/include/asm/processor.h3
-rw-r--r--arch/powerpc/include/asm/reg.h10
-rw-r--r--arch/powerpc/kernel/asm-offsets.c102
-rw-r--r--arch/powerpc/kernel/head_32.S14
-rw-r--r--arch/powerpc/kernel/head_64.S4
-rw-r--r--arch/powerpc/kernel/ppc_ksyms.c4
-rw-r--r--arch/powerpc/kvm/44x.c2
-rw-r--r--arch/powerpc/kvm/Kconfig24
-rw-r--r--arch/powerpc/kvm/Makefile20
-rw-r--r--arch/powerpc/kvm/book3s.c503
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu.c54
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu_host.c483
-rw-r--r--arch/powerpc/kvm/book3s_32_sr.S143
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c36
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c102
-rw-r--r--arch/powerpc/kvm/book3s_64_slb.S183
-rw-r--r--arch/powerpc/kvm/book3s_emulate.c (renamed from arch/powerpc/kvm/book3s_64_emulate.c)245
-rw-r--r--arch/powerpc/kvm/book3s_exports.c (renamed from arch/powerpc/kvm/book3s_64_exports.c)0
-rw-r--r--arch/powerpc/kvm/book3s_interrupts.S (renamed from arch/powerpc/kvm/book3s_64_interrupts.S)204
-rw-r--r--arch/powerpc/kvm/book3s_paired_singles.c1289
-rw-r--r--arch/powerpc/kvm/book3s_rmhandlers.S (renamed from arch/powerpc/kvm/book3s_64_rmhandlers.S)135
-rw-r--r--arch/powerpc/kvm/book3s_segment.S259
-rw-r--r--arch/powerpc/kvm/booke.c21
-rw-r--r--arch/powerpc/kvm/e500.c2
-rw-r--r--arch/powerpc/kvm/emulate.c55
-rw-r--r--arch/powerpc/kvm/fpu.S273
-rw-r--r--arch/powerpc/kvm/powerpc.c110
-rw-r--r--arch/powerpc/mm/mmu_context_hash32.c29
-rw-r--r--arch/s390/kvm/kvm-s390.c6
-rw-r--r--arch/s390/kvm/kvm-s390.h2
-rw-r--r--arch/x86/include/asm/kvm.h17
-rw-r--r--arch/x86/include/asm/kvm_emulate.h46
-rw-r--r--arch/x86/include/asm/kvm_host.h80
-rw-r--r--arch/x86/include/asm/kvm_para.h13
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/pvclock-abi.h4
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/svm.h9
-rw-r--r--arch/x86/include/asm/vmx.h12
-rw-r--r--arch/x86/kernel/kvmclock.c56
-rw-r--r--arch/x86/kernel/pvclock.c37
-rw-r--r--arch/x86/kernel/tboot.c1
-rw-r--r--arch/x86/kvm/emulate.c1247
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_timer.h4
-rw-r--r--arch/x86/kvm/mmu.c225
-rw-r--r--arch/x86/kvm/mmutrace.h84
-rw-r--r--arch/x86/kvm/paging_tmpl.h46
-rw-r--r--arch/x86/kvm/svm.c944
-rw-r--r--arch/x86/kvm/timer.c3
-rw-r--r--arch/x86/kvm/trace.h165
-rw-r--r--arch/x86/kvm/vmx.c378
-rw-r--r--arch/x86/kvm/x86.c1599
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--include/linux/kvm.h26
-rw-r--r--include/linux/kvm_host.h16
-rw-r--r--include/linux/tboot.h1
-rw-r--r--include/trace/events/kvm.h1
-rw-r--r--virt/kvm/assigned-dev.c8
-rw-r--r--virt/kvm/coalesced_mmio.c6
-rw-r--r--virt/kvm/iommu.c4
-rw-r--r--virt/kvm/kvm_main.c63
81 files changed, 7826 insertions, 2811 deletions
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index c6416a39816..a237518e51b 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -656,6 +656,7 @@ struct kvm_clock_data {
4.29 KVM_GET_VCPU_EVENTS
Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
Architectures: x86
Type: vm ioctl
Parameters: struct kvm_vcpu_event (out)
@@ -676,7 +677,7 @@ struct kvm_vcpu_events {
__u8 injected;
__u8 nr;
__u8 soft;
- __u8 pad;
+ __u8 shadow;
} interrupt;
struct {
__u8 injected;
@@ -688,9 +689,13 @@ struct kvm_vcpu_events {
__u32 flags;
};
+KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+interrupt.shadow contains a valid state. Otherwise, this field is undefined.
+
4.30 KVM_SET_VCPU_EVENTS
Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
Architectures: x86
Type: vm ioctl
Parameters: struct kvm_vcpu_event (in)
@@ -709,6 +714,183 @@ current in-kernel state. The bits are:
KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
+If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
+the flags field to signal that interrupt.shadow contains a valid state and
+shall be written into the VCPU.
+
+4.32 KVM_GET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (out)
+Returns: 0 on success, -1 on error
+
+Reads debug registers from the vcpu.
+
+struct kvm_debugregs {
+ __u64 db[4];
+ __u64 dr6;
+ __u64 dr7;
+ __u64 flags;
+ __u64 reserved[9];
+};
+
+4.33 KVM_SET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (in)
+Returns: 0 on success, -1 on error
+
+Writes debug registers into the vcpu.
+
+See KVM_GET_DEBUGREGS for the data structure. The flags field is unused
+yet and must be cleared on entry.
+
+4.34 KVM_SET_USER_MEMORY_REGION
+
+Capability: KVM_CAP_USER_MEM
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_userspace_memory_region (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_userspace_memory_region {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+};
+
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES 1UL
+
+This ioctl allows the user to create or modify a guest physical memory
+slot. When changing an existing slot, it may be moved in the guest
+physical memory space, or its flags may be modified. It may not be
+resized. Slots may not overlap in guest physical address space.
+
+Memory for the region is taken starting at the address denoted by the
+field userspace_addr, which must point at user addressable memory for
+the entire memory slot size. Any object may back this memory, including
+anonymous memory, ordinary files, and hugetlbfs.
+
+It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
+be identical. This allows large pages in the guest to be backed by large
+pages in the host.
+
+The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
+instructs kvm to keep track of writes to memory within the slot. See
+the KVM_GET_DIRTY_LOG ioctl.
+
+When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
+region are automatically reflected into the guest. For example, an mmap()
+that affects the region will be made visible immediately. Another example
+is madvise(MADV_DROP).
+
+It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
+The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
+allocation and is deprecated.
+
+4.35 KVM_SET_TSS_ADDR
+
+Capability: KVM_CAP_SET_TSS_ADDR
+Architectures: x86
+Type: vm ioctl
+Parameters: unsigned long tss_address (in)
+Returns: 0 on success, -1 on error
+
+This ioctl defines the physical address of a three-page region in the guest
+physical address space. The region must be within the first 4GB of the
+guest physical address space and must not conflict with any memory slot
+or any mmio address. The guest may malfunction if it accesses this memory
+region.
+
+This ioctl is required on Intel-based hosts. This is needed on Intel hardware
+because of a quirk in the virtualization implementation (see the internals
+documentation when it pops into existence).
+
+4.36 KVM_ENABLE_CAP
+
+Capability: KVM_CAP_ENABLE_CAP
+Architectures: ppc
+Type: vcpu ioctl
+Parameters: struct kvm_enable_cap (in)
+Returns: 0 on success; -1 on error
+
++Not all extensions are enabled by default. Using this ioctl the application
+can enable an extension, making it available to the guest.
+
+On systems that do not support this ioctl, it always fails. On systems that
+do support it, it only works for extensions that are supported for enablement.
+
+To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should
+be used.
+
+struct kvm_enable_cap {
+ /* in */
+ __u32 cap;
+
+The capability that is supposed to get enabled.
+
+ __u32 flags;
+
+A bitfield indicating future enhancements. Has to be 0 for now.
+
+ __u64 args[4];
+
+Arguments for enabling a feature. If a feature needs initial values to
+function properly, this is the place to put them.
+
+ __u8 pad[64];
+};
+
+4.37 KVM_GET_MP_STATE
+
+Capability: KVM_CAP_MP_STATE
+Architectures: x86, ia64
+Type: vcpu ioctl
+Parameters: struct kvm_mp_state (out)
+Returns: 0 on success; -1 on error
+
+struct kvm_mp_state {
+ __u32 mp_state;
+};
+
+Returns the vcpu's current "multiprocessing state" (though also valid on
+uniprocessor guests).
+
+Possible values are:
+
+ - KVM_MP_STATE_RUNNABLE: the vcpu is currently running
+ - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP)
+ which has not yet received an INIT signal
+ - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is
+ now ready for a SIPI
+ - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and
+ is waiting for an interrupt
+ - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector
+ accesible via KVM_GET_VCPU_EVENTS)
+
+This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
+irqchip, the multiprocessing state must be maintained by userspace.
+
+4.38 KVM_SET_MP_STATE
+
+Capability: KVM_CAP_MP_STATE
+Architectures: x86, ia64
+Type: vcpu ioctl
+Parameters: struct kvm_mp_state (in)
+Returns: 0 on success; -1 on error
+
+Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for
+arguments.
+
+This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
+irqchip, the multiprocessing state must be maintained by userspace.
5. The kvm_run structure
@@ -820,6 +1002,13 @@ executed a memory-mapped I/O instruction which could not be satisfied
by kvm. The 'data' member contains the written data if 'is_write' is
true, and should be filled by application code otherwise.
+NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding
+operations are complete (and guest state is consistent) only after userspace
+has re-entered the kernel with KVM_RUN. The kernel side will first finish
+incomplete operations and then check for pending signals. Userspace
+can re-enter the guest with an unmasked signal pending to complete
+pending operations.
+
/* KVM_EXIT_HYPERCALL */
struct {
__u64 nr;
@@ -829,7 +1018,9 @@ true, and should be filled by application code otherwise.
__u32 pad;
} hypercall;
-Unused.
+Unused. This was once used for 'hypercall to userspace'. To implement
+such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390).
+Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO.
/* KVM_EXIT_TPR_ACCESS */
struct {
@@ -870,6 +1061,19 @@ s390 specific.
powerpc specific.
+ /* KVM_EXIT_OSI */
+ struct {
+ __u64 gprs[32];
+ } osi;
+
+MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch
+hypercalls and exit with this exit struct that contains all the guest gprs.
+
+If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall.
+Userspace can now handle the hypercall and when it's done modify the gprs as
+necessary. Upon guest entry all guest GPRs will then be replaced by the values
+in this struct.
+
/* Fix the size of the union. */
char padding[256];
};
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt
new file mode 100644
index 00000000000..14a12ea92b7
--- /dev/null
+++ b/Documentation/kvm/cpuid.txt
@@ -0,0 +1,42 @@
+KVM CPUID bits
+Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
+=====================================================
+
+A guest running on a kvm host, can check some of its features using
+cpuid. This is not always guaranteed to work, since userspace can
+mask-out some, or even all KVM-related cpuid features before launching
+a guest.
+
+KVM cpuid functions are:
+
+function: KVM_CPUID_SIGNATURE (0x40000000)
+returns : eax = 0,
+ ebx = 0x4b4d564b,
+ ecx = 0x564b4d56,
+ edx = 0x4d.
+Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
+This function queries the presence of KVM cpuid leafs.
+
+
+function: define KVM_CPUID_FEATURES (0x40000001)
+returns : ebx, ecx, edx = 0
+ eax = and OR'ed group of (1 << flag), where each flags is:
+
+
+flag || value || meaning
+=============================================================================
+KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs
+ || || 0x11 and 0x12.
+------------------------------------------------------------------------------
+KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays
+ || || on PIO operations.
+------------------------------------------------------------------------------
+KVM_FEATURE_MMU_OP || 2 || deprecated.
+------------------------------------------------------------------------------
+KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
+ || || 0x4b564d00 and 0x4b564d01
+------------------------------------------------------------------------------
+KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
+ || || per-cpu warps are expected in
+ || || kvmclock.
+------------------------------------------------------------------------------
diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
new file mode 100644
index 00000000000..aaed6ab9d7a
--- /dev/null
+++ b/Documentation/kvm/mmu.txt
@@ -0,0 +1,304 @@
+The x86 kvm shadow mmu
+======================
+
+The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible
+for presenting a standard x86 mmu to the guest, while translating guest
+physical addresses to host physical addresses.
+
+The mmu code attempts to satisfy the following requirements:
+
+- correctness: the guest should not be able to determine that it is running
+ on an emulated mmu except for timing (we attempt to comply
+ with the specification, not emulate the characteristics of
+ a particular implementation such as tlb size)
+- security: the guest must not be able to touch host memory not assigned
+ to it
+- performance: minimize the performance penalty imposed by the mmu
+- scaling: need to scale to large memory and large vcpu guests
+- hardware: support the full range of x86 virtualization hardware
+- integration: Linux memory management code must be in control of guest memory
+ so that swapping, page migration, page merging, transparent
+ hugepages, and similar features work without change
+- dirty tracking: report writes to guest memory to enable live migration
+ and framebuffer-based displays
+- footprint: keep the amount of pinned kernel memory low (most memory
+ should be shrinkable)
+- reliablity: avoid multipage or GFP_ATOMIC allocations
+
+Acronyms
+========
+
+pfn host page frame number
+hpa host physical address
+hva host virtual address
+gfn guest frame number
+gpa guest physical address
+gva guest virtual address
+ngpa nested guest physical address
+ngva nested guest virtual address
+pte page table entry (used also to refer generically to paging structure
+ entries)
+gpte guest pte (referring to gfns)
+spte shadow pte (referring to pfns)
+tdp two dimensional paging (vendor neutral term for NPT and EPT)
+
+Virtual and real hardware supported
+===================================
+
+The mmu supports first-generation mmu hardware, which allows an atomic switch
+of the current paging mode and cr3 during guest entry, as well as
+two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware
+it exposes is the traditional 2/3/4 level x86 mmu, with support for global
+pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support
+exposing NPT capable hardware on NPT capable hosts.
+
+Translation
+===========
+
+The primary job of the mmu is to program the processor's mmu to translate
+addresses for the guest. Different translations are required at different
+times:
+
+- when guest paging is disabled, we translate guest physical addresses to
+ host physical addresses (gpa->hpa)
+- when guest paging is enabled, we translate guest virtual addresses, to
+ guest physical addresses, to host physical addresses (gva->gpa->hpa)
+- when the guest launches a guest of its own, we translate nested guest
+ virtual addresses, to nested guest physical addresses, to guest physical
+ addresses, to host physical addresses (ngva->ngpa->gpa->hpa)
+
+The primary challenge is to encode between 1 and 3 translations into hardware
+that support only 1 (traditional) and 2 (tdp) translations. When the
+number of required translations matches the hardware, the mmu operates in
+direct mode; otherwise it operates in shadow mode (see belo