-rw-r--r--  Documentation/virtual/kvm/api.txt | 10
-rw-r--r--  Documentation/virtual/kvm/hypercalls.txt | 66
-rw-r--r--  Documentation/virtual/kvm/ppc-pv.txt | 22
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 33
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 3
-rw-r--r--  arch/powerpc/kvm/44x_tlb.c | 1
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 51
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 4
-rw-r--r--  arch/powerpc/kvm/e500_tlb.c | 3
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 8
-rw-r--r--  arch/s390/include/asm/processor.h | 1
-rw-r--r--  arch/s390/kernel/dis.c | 27
-rw-r--r--  arch/s390/kvm/Kconfig | 1
-rw-r--r--  arch/s390/kvm/diag.c | 4
-rw-r--r--  arch/s390/kvm/intercept.c | 11
-rw-r--r--  arch/s390/kvm/interrupt.c | 23
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 10
-rw-r--r--  arch/s390/kvm/priv.c | 9
-rw-r--r--  arch/s390/kvm/sigp.c | 2
-rw-r--r--  arch/s390/kvm/trace-s390.h | 210
-rw-r--r--  arch/s390/kvm/trace.h | 341
-rw-r--r--  arch/x86/Kconfig | 21
-rw-r--r--  arch/x86/include/asm/kvm.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 5
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 6
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/kvm.c | 3
-rw-r--r--  arch/x86/kernel/setup.c | 2
-rw-r--r--  arch/x86/kvm/Kconfig | 1
-rw-r--r--  arch/x86/kvm/Makefile | 2
-rw-r--r--  arch/x86/kvm/cpuid.c | 2
-rw-r--r--  arch/x86/kvm/emulate.c | 48
-rw-r--r--  arch/x86/kvm/i8254.c | 64
-rw-r--r--  arch/x86/kvm/i8254.h | 6
-rw-r--r--  arch/x86/kvm/i8259.c | 18
-rw-r--r--  arch/x86/kvm/irq.h | 2
-rw-r--r--  arch/x86/kvm/kvm_timer.h | 18
-rw-r--r--  arch/x86/kvm/lapic.c | 264
-rw-r--r--  arch/x86/kvm/lapic.h | 58
-rw-r--r--  arch/x86/kvm/mmu.c | 144
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 8
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 8
-rw-r--r--  arch/x86/kvm/pmu.c | 2
-rw-r--r--  arch/x86/kvm/svm.c | 5
-rw-r--r--  arch/x86/kvm/timer.c | 47
-rw-r--r--  arch/x86/kvm/vmx.c | 19
-rw-r--r--  arch/x86/kvm/x86.c | 160
-rw-r--r--  arch/x86/kvm/x86.h | 1
-rw-r--r--  include/linux/kvm.h | 13
-rw-r--r--  include/linux/kvm_host.h | 126
-rw-r--r--  kernel/jump_label.c | 1
-rw-r--r--  virt/kvm/Kconfig | 3
-rw-r--r--  virt/kvm/async_pf.c | 11
-rw-r--r--  virt/kvm/ioapic.c | 37
-rw-r--r--  virt/kvm/iommu.c | 16
-rw-r--r--  virt/kvm/irq_comm.c | 4
-rw-r--r--  virt/kvm/kvm_main.c | 520
58 files changed, 1763 insertions(+), 730 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index bf33aaa4c59..b91bfd43f00 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -857,7 +857,8 @@ struct kvm_userspace_memory_region {
};
/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES 1UL
+#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
+#define KVM_MEM_READONLY (1UL << 1)
This ioctl allows the user to create or modify a guest physical memory
slot. When changing an existing slot, it may be moved in the guest
@@ -873,9 +874,12 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
be identical. This allows large pages in the guest to be backed by large
pages in the host.
-The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
+The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES, which
instructs kvm to keep track of writes to memory within the slot. See
-the KVM_GET_DIRTY_LOG ioctl.
+the KVM_GET_DIRTY_LOG ioctl. The second flag, KVM_MEM_READONLY, is available
+when the KVM_CAP_READONLY_MEM capability is present; it indicates that the
+guest memory is read-only, i.e. the guest is only allowed to read it.
+Writes will be posted to userspace as KVM_EXIT_MMIO exits.
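+
+For example (an illustrative userspace snippet, not part of the ABI), a VMM
+could register a read-only ROM slot like this:
+
+	struct kvm_userspace_memory_region rom = {
+		.slot = 1,
+		.flags = KVM_MEM_READONLY,
+		.guest_phys_addr = 0xffff0000,
+		.memory_size = 0x10000,
+		.userspace_addr = (__u64)rom_backing,	/* host buffer; assumed mapped */
+	};
+
+	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &rom);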
When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of the memory
region are automatically reflected into the guest. For example, an mmap()
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
new file mode 100644
index 00000000000..ea113b5d87a
--- /dev/null
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -0,0 +1,66 @@
+Linux KVM Hypercall:
+===================
+X86:
+ A KVM hypercall is a three-byte sequence: either the vmcall or the vmmcall
+ instruction. The hypervisor can replace it with instructions that are
+ guaranteed to be supported.
+
+ Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ The hypercall number should be placed in rax and the return value will be
+ placed in rax. No other registers will be clobbered unless explicitly stated
+ by the particular hypercall.
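+
+ As an illustration, a four-argument hypercall wrapper following this
+ convention might look like the sketch below (modelled on the kernel's
+ kvm_hypercall helpers, not a normative definition):
+
+	static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+					  unsigned long p2, unsigned long p3,
+					  unsigned long p4)
+	{
+		long ret;
+
+		/* vmcall; the hypervisor may patch this to vmmcall */
+		asm volatile(".byte 0x0f,0x01,0xc1"
+			     : "=a"(ret)
+			     : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
+			     : "memory");
+		return ret;
+	}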
+
+S390:
+ R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall
+ number. The return value is written to R2.
+
+ S390 uses the diagnose instruction (0x500) as its hypercall, with the
+ hypercall number in R1.
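+
+ A sketch modelled on the kernel's s390 kvm_hypercall helpers (the exact
+ register constraints here are an approximation, not a reference):
+
+	static inline long kvm_hypercall1(unsigned long nr, unsigned long p1)
+	{
+		register unsigned long __nr asm("1") = nr;	/* hypercall number */
+		register unsigned long __p1 asm("2") = p1;	/* parameter 1 */
+		register long __rc asm("2");			/* return value */
+
+		asm volatile ("diag 2,4,0x500\n"
+			      : "=d" (__rc)
+			      : "d" (__nr), "0" (__p1)
+			      : "memory", "cc");
+		return __rc;
+	}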
+
+ PowerPC:
+ Parameters are passed in R3-R10 and the hypercall number in R11. R4-R11
+ are used as output registers. The return value is placed in R3.
+
+ KVM hypercalls use a 4-byte opcode that is patched in via the
+ 'hypercall-instructions' property of the device tree's /hypervisor node.
+ For more information refer to Documentation/virtual/kvm/ppc-pv.txt
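+
+ The /hypervisor node might look like this (the instruction words are
+ supplied by the hypervisor at patch time; the zeroed cells below are
+ placeholders, not real encodings):
+
+	hypervisor {
+		compatible = "linux,kvm";
+		hypercall-instructions = <0x0 0x0 0x0 0x0>;
+	};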
+
+KVM Hypercalls Documentation
+===========================
+The template for each hypercall is:
+1. Hypercall name.
+2. Architecture(s)
+3. Status (deprecated, obsolete, active)
+4. Purpose
+
+1. KVM_HC_VAPIC_POLL_IRQ
+------------------------
+Architecture: x86
+Status: active
+Purpose: Trigger guest exit so that the host can check for pending
+interrupts on reentry.
+
+2. KVM_HC_MMU_OP
+------------------------
+Architecture: x86
+Status: deprecated.
+Purpose: Support MMU operations such as writing to a PTE,
+flushing the TLB, and releasing a PT.
+
+3. KVM_HC_FEATURES
+------------------------
+Architecture: PPC
+Status: active
+Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid
+is used to enumerate which hypercalls are available. On PPC, either a device
+tree based lookup (which is also what ePAPR dictates) or a KVM-specific
+enumeration mechanism (this hypercall) can be used.
+
+4. KVM_HC_PPC_MAP_MAGIC_PAGE
+------------------------
+Architecture: PPC
+Status: active
+Purpose: To enable communication between the hypervisor and guest there is a
+shared page that contains parts of supervisor-visible register state.
+The guest can map this shared page using this hypercall and then access its
+supervisor registers through memory.
diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt
index 4911cf95c67..4cd076febb0 100644
--- a/Documentation/virtual/kvm/ppc-pv.txt
+++ b/Documentation/virtual/kvm/ppc-pv.txt
@@ -174,3 +174,25 @@ following:
That way we can inject an arbitrary amount of code as replacement for a single
instruction. This allows us to check for pending interrupts when setting EE=1
for example.
+
+Hypercall ABIs in KVM on PowerPC
+=================================
+1) KVM hypercalls (ePAPR)
+
+This is the ePAPR-compliant hypercall implementation (mentioned above). Even
+generic hypercalls are implemented here, like the ePAPR idle hcall. These are
+available on all targets.
+
+2) PAPR hypercalls
+
+PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU).
+These are the same hypercalls that pHyp, the POWER hypervisor, implements. Some of
+them are handled in the kernel, some are handled in user space. This is only
+available on book3s_64.
+
+3) OSI hypercalls
+
+Mac-on-Linux is another user of KVM on PowerPC, which has had its own hypercall
+ABI since long before KVM. It is supported to maintain compatibility. All these hypercalls get
+forwarded to user space. This is only useful on book3s_32, but can be used with
+book3s_64 as well.
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bd77cb507c1..eac65380bd2 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -924,6 +924,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
return 0;
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -963,29 +973,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
}
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip chip;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a8bf5c673a3..28e8f5e5c63 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -53,6 +53,8 @@
struct kvm;
extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range(struct kvm *kvm,
+ unsigned long start, unsigned long end);
extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
@@ -220,6 +222,7 @@ struct revmap_entry {
#define KVMPPC_GOT_PAGE 0x80
struct kvm_arch_memory_slot {
+ unsigned long *rmap;
};
struct kvm_arch {
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 33aa715dab2..5dd3ab46997 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
if (is_error_page(new_page)) {
printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
(unsigned long long)gfn);
- kvm_release_page_clean(new_page);
return;
}
hpaddr = page_to_phys(new_page);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d03eb6f7b05..d95d11322a1 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -705,7 +705,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_unlock;
hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
- rmap = &memslot->rmap[gfn - memslot->base_gfn];
+ rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
lock_rmap(rmap);
/* Check if we might have been invalidated; let the guest retry if so */
@@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long gfn))
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ int (*handler)(struct kvm *kvm,
+ unsigned long *rmapp,
+ unsigned long gfn))
{
int ret;
int retval = 0;
@@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
+ unsigned long hva_start, hva_end;
+ gfn_t gfn, gfn_end;
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn, gfn+1, ..., gfn_end-1}.
+ */
+ gfn = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+ for (; gfn < gfn_end; ++gfn) {
+ gfn_t gfn_offset = gfn - memslot->base_gfn;
- ret = handler(kvm, &memslot->rmap[gfn_offset],
- memslot->base_gfn + gfn_offset);
+ ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
retval |= ret;
}
}
@@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
return retval;
}
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long gfn))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
+}
+
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
@@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return 0;
}
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ if (kvm->arch.using_mmu_notifiers)
+ kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+ return 0;
+}
+
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
@@ -1009,7 +1036,7 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
unsigned long *rmapp, *map;
preempt_disable();
- rmapp = memslot->rmap;
+ rmapp = memslot->arch.rmap;
map = memslot->dirty_bitmap;
for (i = 0; i < memslot->npages; ++i) {
if (kvm_test_clear_dirty(kvm, rmapp))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5c70d19494f..56ac1a5d991 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -84,7 +84,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
return;
- rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]);
+ rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
lock_rmap(rmap);
head = *rmap & KVMPPC_RMAP_INDEX;
@@ -180,7 +180,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
if (!slot_is_aligned(memslot, psize))
return H_PARAMETER;
slot_fn = gfn - memslot->base_gfn;
- rmap = &memslot->rmap[slot_fn];
+ rmap = &memslot->arch.rmap[slot_fn];
if (!kvm->arch.using_mmu_notifiers) {
physp = kvm->arch.slot_phys[memslot->id];
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index a1baec340f7..05c28f59f77 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -242,10 +242,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
int i;
hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
- if (is_error_page(hpage)) {
- kvm_release_page_clean(hpage);
+ if (is_error_page(hpage))
return;
- }
hpage_offset = pte->raddr & ~PAGE_MASK;
hpage_offset &= ~0xFFFULL;
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index a2b66717813..ff38b664195 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -520,11 +520,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
if (likely(!pfnmap)) {
unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
- pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
if (is_error_pfn(pfn)) {
printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
(long)gfn);
- kvm_release_pfn_clean(pfn);
return;
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc88607..879b14a6140 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -302,10 +302,18 @@ long kvm_arch_dev_ioctl(struct file *filp,
void kvm_arch_free_memslot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
+ if (!dont || free->arch.rmap != dont->arch.rmap) {
+ vfree(free->arch.rmap);
+ free->arch.rmap = NULL;
+ }
}
int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
{
+ slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
+ if (!slot->arch.rmap)
+ return -ENOMEM;
+
return 0;
}
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 11e4e323693..eac4fb5fb82 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -140,6 +140,7 @@ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
extern unsigned long thread_saved_pc(struct task_struct *t);
extern void show_code(struct pt_regs *regs);
+extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]);
unsigned long get_wchan(struct task_struct *p);
#define task_pt_regs(tsk) ((struct pt_regs *) \
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index 619c5d35072..ffb622b16ab 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -1468,6 +1468,33 @@ static struct insn *find_insn(unsigned char *code)
return NULL;
}
+/**
+ * insn_to_mnemonic - decode an s390 instruction
+ * @instruction: instruction to decode
+ * @buf: buffer to fill with mnemonic
+ *
+ * Decode the instruction at @instruction and store the corresponding
+ * mnemonic into @buf.
+ * @buf is left unchanged if the instruction could not be decoded.
+ * Returns:
+ * %0 on success, %-ENOENT if the instruction was not found.
+ */
+int insn_to_mnemonic(unsigned char *instruction, char buf[8])
+{
+ struct insn *insn;
+
+ insn = find_insn(instruction);
+ if (!insn)
+ return -ENOENT;
+	/* buf is a pointer here, so sizeof(buf) would not be 8 */
+	if (insn->name[0] == '\0')
+		snprintf(buf, 8, "%s",
+			 long_insn_name[(int) insn->name[1]]);
+	else
+		snprintf(buf, 8, "%.5s", insn->name);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(insn_to_mnemonic);
+
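+/*
+ * Illustrative use (a hypothetical caller, not from this patch):
+ *
+ *	char mnemonic[8];
+ *
+ *	if (!insn_to_mnemonic(code, mnemonic))
+ *		pr_debug("decoded: %s\n", mnemonic);
+ */
+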
static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
{
struct insn *insn;
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 78eb9847008..a6e2677724e 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index c88bb779339..a390687feb1 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -14,6 +14,8 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include "kvm-s390.h"
+#include "trace.h"
+#include "trace-s390.h"
static int diag_release_pages(struct kvm_vcpu *vcpu)
{
@@ -98,6 +100,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_S390_RESET;
VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx",
vcpu->run->s390_reset_flags);
+ trace_kvm_s390_request_resets(vcpu->run->s390_reset_flags);
return -EREMOTE;
}
@@ -105,6 +108,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
{
int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
+ trace_kvm_s390_handle_diag(vcpu, code);
switch (code) {
case 0x10:
return diag_release_pages(vcpu);
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index adae539f12e..22798ec33fd 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -19,6 +19,8 @@
#include "kvm-s390.h"
#include "gaccess.h"
+#include "trace.h"
+#include "trace-s390.h"
static int handle_lctlg(struct kvm_vcpu *vcpu)
{
@@ -45,6 +47,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
disp2);
+ trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
do {
rc = get_guest_u64(vcpu, useraddr,
@@ -82,6 +85,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
disp2);
+ trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
reg = reg1;
do {
@@ -135,6 +139,8 @@ static int handle_stop(struct kvm_vcpu *vcpu)
vcpu->stat.exit_stop_request++;
spin_lock_bh(&vcpu->arch.local_int.lock);
+ trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
+
if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
rc = SIE_INTERCEPT_RERUNVCPU;
@@ -171,6 +177,7 @@ static int handle_validity(struct kvm_vcpu *vcpu)
int rc;
vcpu->stat.exit_validity++;
+ trace_kvm_s390_intercept_validity(vcpu, viwhy);
if (viwhy == 0x37) {
vmaddr = gmap_fault(vcpu->arch.sie_block->prefix,
vcpu->arch.gmap);
@@ -213,6 +220,9 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
intercept_handler_t handler;
vcpu->stat.exit_instruction++;
+ trace_kvm_s390_intercept_instruction(vcpu,
+ vcpu->arch.sie_block->ipa,
+ vcpu->arch.sie_block->ipb);
handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8];
if (handler)
return handler(vcpu);
@@ -222,6 +232,7 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
static int handle_prog(struct kvm_vcpu *vcpu)
{
vcpu->stat.exit_program_interruption++;
+ trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc);
return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc);
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index b7bc1aac8ed..7556231fb07 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -19,6 +19,7 @@
#include <asm/uaccess.h>
#include "kvm-s390.h"
#include "gaccess.h"
+#include "trace-s390.h"
static int psw_extint_disabled(struct kvm_vcpu *vcpu)
{
@@ -130,6 +131,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
case KVM_S390_INT_EMERGENCY:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg");
vcpu->stat.deliver_emergency_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->emerg.code, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201);
if (rc == -EFAULT)
exception = 1;
@@ -152,6 +155,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
case KVM_S390_INT_EXTERNAL_CALL:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
vcpu->stat.deliver_external_call++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->extcall.code, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
if (rc == -EFAULT)
exception = 1;
@@ -175,6 +180,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
inti->ext.ext_params);
vcpu->stat.deliver_service_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->ext.ext_params, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401);
if (rc == -EFAULT)
exception = 1;
@@ -198,6 +205,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
inti->ext.ext_params, inti->ext.ext_params2);
vcpu->stat.deliver_virtio_interrupt++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->ext.ext_params,
+ inti->ext.ext_params2);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603);
if (rc == -EFAULT)
exception = 1;
@@ -229,6 +239,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
case KVM_S390_SIGP_STOP:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
vcpu->stat.deliver_stop_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ 0, 0);
__set_intercept_indicator(vcpu, inti);
break;
@@ -236,12 +248,16 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
inti->prefix.address);
vcpu->stat.deliver_prefix_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->prefix.address, 0);
kvm_s390_set_prefix(vcpu, inti->prefix.address);
break;
case KVM_S390_RESTART:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart");
vcpu->stat.deliver_restart_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ 0, 0);
rc = copy_to_guest(vcpu, offsetof(struct _lowcore,
restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
if (rc == -EFAULT)
@@ -259,6 +275,8 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
inti->pgm.code,
table[vcpu->arch.sie_block->ipa >> 14]);
vcpu->stat.deliver_program_int++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->pgm.code, 0);
rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code);
if (rc == -EFAULT)
exception = 1;
@@ -515,6 +533,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
inti->pgm.code = code;
VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code);
+ trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1);
spin_lock_bh(&li->lock);
list_add(&inti->list, &li->list);
atomic_set(&li->active, 1);
@@ -556,6 +575,8 @@ int kvm_s390_inject_vm(struct kvm *kvm,
kfree(inti);
return -EINVAL;
}
+ trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
+ 2);
mutex_lock(&kvm->lock);
fi = &kvm->arch.float_int;
@@ -621,6 +642,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
kfree(inti);
return -EINVAL;
}
+ trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm,
+ s390int->parm64, 2);
mutex_lock(&vcpu->kvm->lock);
li = &vcpu->arch.local_int;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d470ccbfaba..e83df7f0fed 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -32,6 +32,10 @@
#include "kvm-s390.h"
#include "gaccess.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+#include "trace-s390.h"
+
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
struct kvm_stats_debugfs_item debugfs_entries[] = {
@@ -242,6 +246,7 @@ out_err:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+ trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
if (!kvm_is_ucontrol(vcpu->kvm)) {
clear_bit(63 - vcpu->vcpu_id,
(unsigned long *) &vcpu->kvm->arch.sca->mcn);
@@ -417,6 +422,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
goto out_free_sie_block;
VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
vcpu->arch.sie_block);
+ trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block);
return vcpu;
out_free_sie_block:
@@ -607,18 +613,22 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
local_irq_enable();
VCPU_EVENT(vcpu, 6, "entering sie flags %x",
atomic_read(&vcpu->arch.sie_block->cpuflags));
+ trace_kvm_s390_sie_enter(vcpu,
+ atomic_read(&vcpu->arch.sie_block->cpuflags));
rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
if (rc) {
if (kvm_is_ucontrol(vcpu->kvm)) {
rc = SIE_INTERCEPT_UCONTROL;
} else {
VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
+ trace_kvm_s390_sie_fault(vcpu);
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
rc = 0;
}
}
VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
vcpu->arch.sie_block->icptcode);
+ trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
local_irq_disable();
kvm_guest_exit();
local_irq_enable();
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 60da903d6f3..ed256fdd7b5 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -20,6 +20,7 @@
#include <asm/sysinfo.h>
#include "gaccess.h"
#include "kvm-s390.h"
+#include "trace.h"
static int handle_set_prefix(struct kvm_vcpu *vcpu)
{
@@ -59,6 +60,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
kvm_s390_set_prefix(vcpu, address);
VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
+ trace_kvm_s390_handle_prefix(vcpu, 1, address);
out:
return 0;
}
@@ -91,6 +93,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
}
VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
+ trace_kvm_s390_handle_prefix(vcpu, 0, address);
out:
return 0;
}
@@ -119,6 +122,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
}
VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
+ trace_kvm_s390_handle_stap(vcpu, useraddr);
out:
return 0;
}
@@ -164,9 +168,11 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
&facility_list, sizeof(facility_list));
if (rc == -EFAULT)
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- else
+ else {
VCPU_EVENT(vcpu, 5, "store facility list value %x",
facility_list);
+ trace_kvm_s390_handle_stfl(vcpu, facility_list);
+ }
return 0;
}
@@ -278,6 +284,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
goto out_mem;
}
+ trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
free_page(mem);
vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
vcpu->run->s.regs.gprs[0] = 0;
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 56f80e1f98f..566ddf6e8df 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -18,6 +18,7 @@
#include <asm/sigp.h>
#include "gaccess.h"
#include "kvm-s390.h"
+#include "trace.h"
static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
u64 *reg)
@@ -344,6 +345,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
else
parameter = vcpu->run->s.regs.gprs[r1 + 1];
+ trace_kvm_s390_handle_sigp(vcpu, order_code, cpu_addr, parameter);
switch (order_code) {
case SIGP_SENSE:
vcpu->stat.instruction_sigp_sense++;
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
new file mode 100644
index 00000000000..90fdf85b5ff
--- /dev/null
+++ b/arch/s390/kvm/trace-s390.h
@@ -0,0 +1,210 @@
+#if !defined(_TRACE_KVMS390_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMS390_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm-s390
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace-s390
+
+/*
+ * Trace point for the creation of the kvm instance.
+ */
+TRACE_EVENT(kvm_s390_create_vm,
+ TP_PROTO(unsigned long type),
+ TP_ARGS(type),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, type)
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ ),
+
+ TP_printk("create vm%s",
+ __entry->type & KVM_VM_S390_UCONTROL ? " (UCONTROL)" : "")
+ );
+
+/*
+ * Trace points for creation and destruction of vcpus.
+ */
+TRACE_EVENT(kvm_s390_create_vcpu,
+ TP_PROTO(unsigned int id, struct kvm_vcpu *vcpu,
+ struct kvm_s390_sie_block *sie_block),
+ TP_ARGS(id, vcpu, sie_block),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ __field(struct kvm_vcpu *, vcpu)
+ __field(struct kvm_s390_sie_block *, sie_block)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->vcpu = vcpu;
+ __entry->sie_block = sie_block;
+ ),
+
+ TP_printk("create cpu %d at %p, sie block at %p", __entry->id,
+ __entry->vcpu, __entry->sie_block)
+ );
+
+TRACE_EVENT(kvm_s390_destroy_vcpu,
+ TP_PROTO(unsigned int id),
+ TP_ARGS(id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ ),
+
+ TP_printk("destroy cpu %d", __entry->id)
+ );
+
+/*
+ * Trace points for injection of interrupts, either per machine or
+ * per vcpu.
+ */
+
+#define kvm_s390_int_type \
+ {KVM_S390_SIGP_STOP, "sigp stop"}, \
+ {KVM_S390_PROGRAM_INT, "program interrupt"}, \
+ {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \
+ {KVM_S390_RESTART, "sigp restart"}, \
+ {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \
+ {KVM_S390_INT_SERVICE, "sclp interrupt"}, \
+ {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \
+ {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"}
+
+TRACE_EVENT(kvm_s390_inject_vm,
+ TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who),
+ TP_ARGS(type, parm, parm64, who),
+
+ TP_STRUCT__entry(
+ __field(__u32, inttype)
+ __field(__u32, parm)
+ __field(__u64, parm64)
+ __field(int, who)
+ ),
+
+ TP_fast_assign(
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->parm = parm;
+ __entry->parm64 = parm64;
+ __entry->who = who;
+ ),
+
+ TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx",
+ (__entry->who == 1) ? " (from kernel)" :
+ (__entry->who == 2) ? " (from user)" : "",
+ __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->parm, __entry->parm64)
+ );
+
+TRACE_EVENT(kvm_s390_inject_vcpu,
+ TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \
+ int who),
+ TP_ARGS(id, type, parm, parm64, who),
+
+ TP_STRUCT__entry(
+ __field(int, id)
+ __field(__u32, inttype)
+ __field(__u32, parm)
+ __field(__u64, parm64)
+ __field(int, who)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->parm = parm;
+ __entry->parm64 = parm64;
+ __entry->who = who;
+ ),
+
+ TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx",
+ (__entry->who == 1) ? " (from kernel)" :
+ (__entry->who == 2) ? " (from user)" : "",
+ __entry->id, __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->parm, __entry->parm64)
+ );
+
+/*
+ * Trace point for the actual delivery of interrupts.
+ */
+TRACE_EVENT(kvm_s390_deliver_interrupt,
+ TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1),
+ TP_ARGS(id, type, data0, data1),
+
+ TP_STRUCT__entry(
+ __field(int, id)
+ __field(__u32, inttype)
+ __field(__u32, data0)
+ __field(__u64, data1)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->data0 = data0;
+ __entry->data1 = data1;
+ ),
+
+ TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \
+ "data:%08x %016llx",
+ __entry->id, __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->data0, __entry->data1)
+ );
+
+/*
+ * Trace point for resets that may be requested from userspace.
+ */
+TRACE_EVENT(kvm_s390_request_resets,
+ TP_PROTO(__u64 resets),
+ TP_ARGS(resets),
+
+ TP_STRUCT__entry(
+ __field(__u64, resets)
+ ),
+
+ TP_fast_assign(
+ __entry->resets = resets;
+ ),
+
+ TP_printk("requesting userspace resets %llx",
+ __entry->resets)
+ );
+
+/*
+ * Trace point for a vcpu's stop requests.
+ */
+TRACE_EVENT(kvm_s390_stop_request,
+ TP_PROTO(unsigned int action_bits),
+ TP_ARGS(action_bits),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, action_bits)
+ ),
+
+ TP_fast_assign(
+ __entry->action_bits = action_bits;
+ ),
+
+ TP_printk("stop request, action_bits = %08x",
+ __entry->action_bits)
+ );
+
+
+#endif /* _TRACE_KVMS390_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
new file mode 100644
index 00000000000..2b29e62351d
--- /dev/null
+++ b/arch/s390/kvm/trace.h
@@ -0,0 +1,341 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+#include <asm/sigp.h>
+#include <asm/debug.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Helpers for vcpu-specific tracepoints containing the same information
+ * as s390dbf VCPU_EVENTs.
+ */
+#define VCPU_PROTO_COMMON struct kvm_vcpu *vcpu
+#define VCPU_ARGS_COMMON vcpu
+#define VCPU_FIELD_COMMON __field(int, id) \
+ __field(unsigned long, pswmask) \
+ __field(unsigned long, pswaddr)
+#define VCPU_ASSIGN_COMMON do { \
+ __entry->id = vcpu->vcpu_id; \
+ __entry->pswmask = vcpu->arch.sie_block->gpsw.mask; \
+ __entry->pswaddr = vcpu->arch.sie_block->gpsw.addr; \
+ } while (0);
+#define VCPU_TP_PRINTK(p_str, p_args...) \
+ TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \
+ __entry->pswmask, __entry->pswaddr, p_args)
+
+/*
+ * Tracepoints for SIE entry and exit.
+ */
+TRACE_EVENT(kvm_s390_sie_enter,
+ TP_PROTO(VCPU_PROTO_COMMON, int cpuflags),
+ TP_ARGS(VCPU_ARGS_COMMON, cpuflags),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, cpuflags)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->cpuflags = cpuflags;
+ ),
+
+ VCPU_TP_PRINTK("entering sie flags %x", __entry->cpuflags)
+ );
+
+TRACE_EVENT(kvm_s390_sie_fault,
+ TP_PROTO(VCPU_PROTO_COMMON),
+ TP_ARGS(VCPU_ARGS_COMMON),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ ),
+
+ VCPU_TP_PRINTK("%s", "fault in sie instruction")
+ );
+
+#define sie_intercept_code \
+ {0x04, "Instruction"}, \
+ {0x08, "Program interruption"}, \
+	{0x0C, "Instruction and program interruption"}, \
+ {0x10, "External request"}, \
+ {0x14, "External interruption"}, \
+ {0x18, "I/O request"}, \
+ {0x1C, "Wait state"}, \
+ {0x20, "Validity"}, \
+ {0x28, "Stop request"}
+
+TRACE_EVENT(kvm_s390_sie_exit,
+ TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode),
+ TP_ARGS(VCPU_ARGS_COMMON, icptcode),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u8, icptcode)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->icptcode = icptcode;
+ ),
+
+ VCPU_TP_PRINTK("exit sie icptcode %d (%s)", __entry->icptcode,
+ __print_symbolic(__entry->icptcode,
+ sie_intercept_code))
+ );
+
+/*
+ * Trace point for intercepted instructions.
+ */
+TRACE_EVENT(kvm_s390_intercept_instruction,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+ TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u64, instruction)
+ __field(char, insn[8])
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->instruction = ((__u64)ipa << 48) |
+ ((__u64)ipb << 16);
+ ),
+
+ VCPU_TP_PRINTK("intercepted instruction %016llx (%s)",
+ __entry->instruction,
+ insn_to_mnemonic((unsigned char *)
+ &__entry->instruction,
+ __entry->insn) ?
+ "unknown" : __entry->insn)
+ );
+
+/*
+ * Trace point for intercepted program interruptions.
+ */
+TRACE_EVENT(kvm_s390_intercept_prog,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
+ TP_ARGS(VCPU_ARGS_COMMON, code),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, code)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ ),
+
+ VCPU_TP_PRINTK("intercepted program interruption %04x",
+ __entry->code)
+ );
+
+/*
+ * Trace point for validity intercepts.
+ */
+TRACE_EVENT(kvm_s390_intercept_validity,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 viwhy),
+ TP_ARGS(VCPU_ARGS_COMMON, viwhy),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, viwhy)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->viwhy = viwhy;
+ ),
+
+ VCPU_TP_PRINTK("got validity intercept %04x", __entry->viwhy)
+ );
+
+/*
+ * Trace points for instructions that are of special interest.
+ */
+
+#define sigp_order_codes \
+ {SIGP_SENSE, "sense"}, \
+ {SIGP_EXTERNAL_CALL, "external call"}, \
+ {SIGP_EMERGENCY_SIGNAL, "emergency signal"}, \
+ {SIGP_STOP, "stop"}, \
+ {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \
+ {SIGP_SET_ARCHITECTURE, "set architecture"}, \
+ {SIGP_SET_PREFIX, "set prefix"}, \
+ {SIGP_SENSE_RUNNING, "sense running"}, \
+ {SIGP_RESTART, "restart"}
+
+TRACE_EVENT(kvm_s390_handle_sigp,
+ TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \
+ __u32 parameter),
+ TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr, parameter),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u8, order_code)
+ __field(__u16, cpu_addr)
+ __field(__u32, parameter)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->order_code = order_code;
+ __entry->cpu_addr = cpu_addr;
+ __entry->parameter = parameter;
+ ),
+
+ VCPU_TP_PRINTK("handle sigp order %02x (%s), cpu address %04x, " \
+ "parameter %08x", __entry->order_code,
+ __print_symbolic(__entry->order_code,
+ sigp_order_codes),
+ __entry->cpu_addr, __entry->parameter)
+ );
+
+#define diagnose_codes \
+ {0x10, "release pages"}, \
+ {0x44, "time slice end"}, \
+ {0x308, "ipl functions"}, \
+ {0x500, "kvm hypercall"}, \
+ {0x501, "kvm breakpoint"}
+
+TRACE_EVENT(kvm_s390_handle_diag,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
+ TP_ARGS(VCPU_ARGS_COMMON, code),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, code)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ ),
+
+ VCPU_TP_PRINTK("handle diagnose call %04x (%s)", __entry->code,
+ __print_symbolic(__entry->code, diagnose_codes))
+ );
+
+TRACE_EVENT(kvm_s390_handle_lctl,
+ TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, g)
+ __field(int, reg1)
+ __field(int, reg3)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->g = g;
+ __entry->reg1 = reg1;
+ __entry->reg3 = reg3;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx",
+ __entry->g ? "lctlg" : "lctl",
+ __entry->reg1, __entry->reg3, __entry->addr)
+ );
+
+TRACE_EVENT(kvm_s390_handle_prefix,
+ TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address),
+ TP_ARGS(VCPU_ARGS_COMMON, set, address),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, set)
+ __field(u32, address)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->set = set;
+ __entry->address = address;
+ ),
+
+ VCPU_TP_PRINTK("%s prefix to %08x",
+ __entry->set ? "setting" : "storing",
+ __entry->address)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stap,
+ TP_PROTO(VCPU_PROTO_COMMON, u64 address),
+ TP_ARGS(VCPU_ARGS_COMMON, address),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u64, address)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->address = address;
+ ),
+
+ VCPU_TP_PRINTK("storing cpu address to %016llx",
+ __entry->address)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stfl,
+ TP_PROTO(VCPU_PROTO_COMMON, unsigned int facility_list),
+ TP_ARGS(VCPU_ARGS_COMMON, facility_list),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(unsigned int, facility_list)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->facility_list = facility_list;
+ ),
+
+ VCPU_TP_PRINTK("store facility list value %08x",
+ __entry->facility_list)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stsi,
+ TP_PROTO(VCPU_PROTO_COMMON, int fc, int sel1, int sel2, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, fc, sel1, sel2, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, fc)
+ __field(int, sel1)
+ __field(int, sel2)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->fc = fc;
+ __entry->sel1 = sel1;
+ __entry->sel2 = sel2;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("STSI %d.%d.%d information stored to %016llx",
+ __entry->fc, __entry->sel1, __entry->sel2,
+ __entry->addr)
+ );
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8ec3a1aa4ab..a42e2e99caa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -573,23 +573,18 @@ config PARAVIRT_TIME_ACCOUNTING
source "arch/x86/xen/Kconfig"
-config KVM_CLOCK
- bool "KVM paravirtualized clock"
- select PARAVIRT
- select PARAVIRT_CLOCK
- ---help---
- Turning on this option will allow you to run a paravirtualized clock
- when running over the KVM hypervisor. Instead of relying on a PIT
- (or probably other) emulation by the underlying device model, the host
- provides the guest with timing infrastructure such as time of day, and
- system time
-
config KVM_GUEST
- bool "KVM Guest support"
+ bool "KVM Guest support (including kvmclock)"
select PARAVIRT
+ select PARAVIRT_CLOCK
+ default y if PARAVIRT_GUEST
---help---
This option enables various optimizations for running under the KVM
- hypervisor.
+ hypervisor. It includes a paravirtualized clock, so that instead
+ of relying on a PIT (or probably other) emulation by the
+ underlying device model, the host provides the guest with
+ timing infrastructure such as time of day, and system time
source "arch/x86/lguest/Kconfig"
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 246617efd67..521bf252e34 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -25,6 +25,7 @@
#define __KVM_HAVE_DEBUGREGS
#define __KVM_HAVE_XSAVE
#define __KVM_HAVE_XCRS
+#define __KVM_HAVE_READONLY_MEM
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09155d64cf7..fc0e752e756 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -420,6 +420,8 @@ struct kvm_vcpu_arch {
unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
+ /* set guest stopped flag in pvclock flags field */
+ bool pvclock_set_guest_stopped_request;
struct {
u64 msr_val;
@@ -500,11 +502,11 @@ struct kvm_vcpu_arch {
};
struct kvm_lpage_info {
- unsigned long rmap_pde;
int write_count;
};
struct kvm_arch_memory_slot {
+ unsigned long *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
@@ -957,6 +959,7 @@ extern bool kvm_rebooting;
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 2f7712e08b1..eb3e9d85e1f 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -102,21 +102,21 @@ struct kvm_vcpu_pv_apf_data {
extern void kvmclock_init(void);
extern int kvm_register_clock(char *txt);
-#ifdef CONFIG_KVM_CLOCK
+#ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void);
#else
static inline bool kvm_check_and_clear_guest_paused(void)
{
return false;
}
-#endif /* CONFIG_KVMCLOCK */
+#endif /* CONFIG_KVM_GUEST */
/* This instruction is vmcall. On non-VT architectures, it will generate a
* trap that we will then rewrite to the appropriate instruction.
*/
#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
* instructions are guaranteed to be supported.
*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8215e5652d9..7203298e0b8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
-obj-$(CONFIG_KVM_GUEST) += kvm.o
-obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c1d61ee4b4f..b3e5e51bc90 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
+ kvm_disable_steal_time();
}
static int kvm_pv_reboot_notify(struct notifier_block *nb,
@@ -396,9 +397,7 @@ void kvm_disable_steal_time(void)
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
-#ifdef CONFIG_KVM_CLOCK
WARN_ON(kvm_register_clock("primary cpu clock"));
-#endif
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f4b9b80e1b9..b3386ae3438 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -957,7 +957,7 @@ void __init setup_arch(char **cmdline_p)
initmem_init();
memblock_find_dma_reserve();
-#ifdef CONFIG_KVM_CLOCK
+#ifdef CONFIG_KVM_GUEST
kvmclock_init();
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a28f338843e..45c044f0fff 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -37,6 +37,7 @@ config KVM
select TASK_DELAY_ACCT
select PERF_EVENTS
select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4f579e8dcac..04d30401c5c 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o cpuid.o pmu.o
+ i8254.o cpuid.o pmu.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f1397b7..b496da684bd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
case 7: {
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* Mask ebx against host capbability word 9 */
+ /* Mask ebx against host capability word 9 */
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3b57a27be8..e8fb6c5c6c0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -655,7 +655,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad;
} else {
- /* exapand-down segment */
+ /* expand-down segment */
if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
goto bad;
lim = desc.d ? 0xffffffff : 0xffff;
@@ -1179,24 +1179,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
int rc;
struct read_cache *mc = &ctxt->mem_read;
- while (size) {
- int n = min(size, 8u);
- size -= n;
- if (mc->pos < mc->end)
- goto read_cached;
+ if (mc->pos < mc->end)
+ goto read_cached;
- rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
- &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- mc->end += n;
+ WARN_ON((mc->end + size) >= sizeof(mc->data));
- read_cached:
- memcpy(dest, mc->data + mc->pos, n);
- mc->pos += n;
- dest += n;
- addr += n;
- }
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ mc->end += size;
+
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
return X86EMUL_CONTINUE;
}
@@ -1396,7 +1393,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
err_code = selector & 0xfffc;
err_vec = GP_VECTOR;
- /* can't load system descriptor into segment selecor */
+ /* can't load system descriptor into segment selector */
if (seg <= VCPU_SREG_GS && !seg_desc.s)
goto exception;
@@ -2050,12 +2047,6 @@ static void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *cs, struct desc_struct *ss)
{
- u16 selector;
-
- memset(cs, 0, sizeof(struct desc_struct));
- ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct desc_struct));
-
cs->l = 0; /* will be adjusted later */
set_desc_base(cs, 0); /* flat segment */
cs->g = 1; /* 4kb granularity */
@@ -2065,6 +2056,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
cs->dpl = 0; /* will be adjusted later */
cs->p = 1;
cs->d = 1;
+ cs->avl = 0;
set_desc_base(ss, 0); /* flat segment */
set_desc_limit(ss, 0xfffff); /* 4GB limit */
@@ -2074,6 +2066,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
ss->d = 1; /* 32bit stack segment */
ss->dpl = 0;
ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
}
static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
@@ -2410,7 +2404,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
/*
- * Now load segment descriptors. If fault happenes at this stage
+ * Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
@@ -2652,7 +2646,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
*
* 1. jmp/call/int to task gate: Check against DPL of the task gate
* 2. Exception/IRQ/iret: No check is performed
- * 3. jmp/call to TSS: Check agains DPL of the TSS
+ * 3. jmp/call to TSS: Check against DPL of the TSS
*/
if (reason == TASK_SWITCH_GATE) {
if (idt_index != -1) {
@@ -2693,7 +2687,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
/* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
+ note that old_tss_sel is not used after this point */
if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
old_tss_sel = 0xffff;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index adba28f88d1..11300d2fa71 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
ktime_t remaining;
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- if (!ps->pit_timer.period)
+ if (!ps->period)
return 0;
/*
@@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
* itself with the initial count and continues counting
* from there.
*/
- remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
- elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->pit_timer.period);
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
+ elapsed = mod_64(elapsed, ps->period);
return elapsed;
}
@@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
int value;
spin_lock(&ps->inject_lock);
- value = atomic_dec_return(&ps->pit_timer.pending);
+ value = atomic_dec_return(&ps->pending);
if (value < 0)
/* spurious acks can be generated if, for example, the
* PIC is being reset. Handle it gracefully here
*/
- atomic_inc(&ps->pit_timer.pending);
+ atomic_inc(&ps->pending);
else if (value > 0)
/* in this case, we had multiple outstanding pit interrupts
* that we needed to inject. Reinject
@@ -261,28 +261,17 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_is_bsp(vcpu) || !pit)
return;
- timer = &pit->pit_state.pit_timer.timer;
+ timer = &pit->pit_state.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static void destroy_pit_timer(struct kvm_pit *pit)
{
- hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+ hrtimer_cancel(&pit->pit_state.timer);
flush_kthread_work(&pit->expired);
}
-static bool kpit_is_periodic(struct kvm_timer *ktimer)
-{
- struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
- pit_timer);
- return ps->is_periodic;
-}
-
-static struct kvm_timer_ops kpit_ops = {
- .is_periodic = kpit_is_periodic,
-};
-
static void pit_do_work(struct kthread_work *work)
{
struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
@@ -322,16 +311,16 @@ static void pit_do_work(struct kthread_work *work)
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = ps->kvm->arch.vpit;
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
+ if (ps->reinject || !atomic_read(&ps->pending)) {
+ atomic_inc(&ps->pending);
queue_kthread_work(&pt->worker, &pt->expired);
}
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
return HRTIMER_RESTART;
} else
return HRTIMER_NORESTART;
@@ -340,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
{
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- struct kvm_timer *pt = &ps->pit_timer;
s64 interval;
if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
@@ -351,19 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO The new value only affected after the retriggered */
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&ps->timer);
flush_kthread_work(&ps->pit->expired);
- pt->period = interval;
+ ps->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = pit_timer_fn;
- pt->t_ops = &kpit_ops;
- pt->kvm = ps->pit->kvm;
+ ps->timer.function = pit_timer_fn;
+ ps->kvm = ps->pit->kvm;
- atomic_set(&pt->pending, 0);
+ atomic_set(&ps->pending, 0);
ps->irq_ack = 1;
- hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
}
@@ -639,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
}
mutex_unlock(&pit->pit_state.lock);
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ atomic_set(&pit->pit_state.pending, 0);
pit->pit_state.irq_ack = 1;
}
@@ -648,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
if (!mask) {
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ atomic_set(&pit->pit_state.pending, 0);
pit->pit_state.irq_ack = 1;
}
}
@@ -706,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pit_state = &pit->pit_state;
pit_state->pit = pit;
- hrtimer_init(&pit_state->pit_timer.timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
+ pit_state->reinject = true;
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
@@ -761,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm)
kvm_unregister_irq_ack_notifier(kvm,
&kvm->arch.vpit->pit_state.irq_ack_notifier);
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+ timer = &kvm->arch.vpit->pit_state.timer;
hrtimer_cancel(timer);
flush_kthread_work(&kvm->arch.vpit->expired);
kthread_stop(kvm->arch.vpit->worker_task);
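The net effect of the i8254.c changes above is that the PIT no longer goes through the generic kvm_timer/kvm_timer_ops indirection; the hrtimer now lives directly in kvm_kpit_state and the callback recovers its state with container_of(). A minimal userspace sketch of that pattern, with illustrative names and a stub hrtimer (not kernel code):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct hrtimer { int dummy; };		/* stub for the kernel type */

struct kpit_state {			/* illustrative, cf. kvm_kpit_state */
	int is_periodic;
	long long period_ns;
	struct hrtimer timer;		/* embedded, no separate kvm_timer */
};

static void timer_fn(struct hrtimer *t)
{
	/* recover the enclosing state directly; no ->t_ops indirection */
	struct kpit_state *ps = container_of(t, struct kpit_state, timer);

	printf("periodic=%d period=%lldns\n", ps->is_periodic, ps->period_ns);
}

int main(void)
{
	struct kpit_state ps = { .is_periodic = 1, .period_ns = 1000000 };

	timer_fn(&ps.timer);		/* what the hrtimer core would do */
	return 0;
}

The same pattern is what apic_timer_fn() uses for the LAPIC further down.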
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index fdf40425ea1..dd1b16b611b 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -24,8 +24,12 @@ struct kvm_kpit_channel_state {
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
u32 flags;
- struct kvm_timer pit_timer;
bool is_periodic;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+ atomic_t pending; /* accumulated triggered timers */
+ bool reinject;
+ struct kvm *kvm;
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e498b18f010..90c84f947d4 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s)
int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
{
- int ret = -1;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
pic_lock(s);
- if (irq >= 0 && irq < PIC_NUM_PINS) {
- int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
- irq_source_id, level);
- ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
- pic_update_irq(s);
- trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
- s->pics[irq >> 3].imr, ret == 0);
- }
+ irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
pic_unlock(s);
return ret;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2086f2bfba3..2d03568e949 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -70,7 +70,7 @@ struct kvm_pic {
struct kvm_io_device dev_slave;
struct kvm_io_device dev_eclr;
void (*ack_notifier)(void *opaque, int irq);
- unsigned long irq_states[16];
+ unsigned long irq_states[PIC_NUM_PINS];
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
deleted file mode 100644
index 497dbaa366d..00000000000
--- a/arch/x86/kvm/kvm_timer.h
+++ /dev/null
@@ -1,18 +0,0 @@
-
-struct kvm_timer {
- struct hrtimer timer;
- s64 period; /* unit: ns */
- u32 timer_mode_mask;
- u64 tscdeadline;
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm_timer_ops *t_ops;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
-};
-
-struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
-};
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ce878788a39..18d149d8020 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -34,6 +34,7 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <linux/atomic.h>
+#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
#include "irq.h"
#include "trace.h"
@@ -72,11 +73,6 @@
static unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
-{
- return *((u32 *) (apic->regs + reg_off));
-}
-
static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
*((u32 *) (apic->regs + reg_off)) = val;
@@ -117,19 +113,23 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
-{
- return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
-}
+struct static_key_deferred apic_hw_disabled __read_mostly;
+struct static_key_deferred apic_sw_disabled __read_mostly;
-static inline int apic_sw_enabled(struct kvm_lapic *apic)
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
{
- return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+ if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
+ if (val & APIC_SPIV_APIC_ENABLED)
+ static_key_slow_dec_deferred(&apic_sw_disabled);
+ else
+ static_key_slow_inc(&apic_sw_disabled.key);
+ }
+ apic_set_reg(apic, APIC_SPIV, val);
}
static inline int apic_enabled(struct kvm_lapic *apic)
{
- return apic_sw_enabled(apic) && apic_hw_enabled(apic);
+ return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
}
#define LVT_MASK \
@@ -141,34 +141,34 @@ static inline int apic_enabled(struct kvm_lapic *apic)
static inline int kvm_apic_id(struct kvm_lapic *apic)
{
- return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+ return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
- return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+ return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
}
static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
{
- return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+ return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
}
static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
}
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
}
static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
{
- return ((apic_get_reg(apic, APIC_LVTT) &
+ return ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) ==
APIC_LVT_TIMER_TSCDEADLINE);
}
@@ -184,7 +184,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *feat;
u32 v = APIC_VERSION;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -285,7 +285,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
/* This may race with setting of irr in __apic_accept_irq() and
@@ -293,9 +292,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
* will cause vmexit immediately and the value will be recalculated
* on the next vmentry.
*/
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return 0;
- highest_irr = apic_find_highest_irr(apic);
+ highest_irr = apic_find_highest_irr(vcpu->arch.apic);
return highest_irr;
}
@@ -378,8 +377,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
u32 tpr, isrv, ppr, old_ppr;
int isr;
- old_ppr = apic_get_reg(apic, APIC_PROCPRI);
- tpr = apic_get_reg(apic, APIC_TASKPRI);
+ old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI);
+ tpr = kvm_apic_get_reg(apic, APIC_TASKPRI);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
@@ -415,13 +414,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
u32 logical_id;
if (apic_x2apic_mode(apic)) {
- logical_id = apic_get_reg(apic, APIC_LDR);
+ logical_id = kvm_apic_get_reg(apic, APIC_LDR);
return logical_id & mda;
}
- logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+ logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
- switch (apic_get_reg(apic, APIC_DFR)) {
+ switch (kvm_apic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
if (logical_id & mda)
result = 1;
@@ -433,7 +432,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
break;
default:
apic_debug("Bad DFR vcpu %d: %08x\n",
- apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+ apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
break;
}
@@ -591,7 +590,7 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+ if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
int trigger_mode;
if (apic_test_vector(vector, apic->regs + APIC_TMR))
@@ -606,8 +605,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
static void apic_send_ipi(struct kvm_lapic *apic)
{
- u32 icr_low = apic_get_reg(apic, APIC_ICR);
- u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+ u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
+ u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2);
struct kvm_lapic_irq irq;
irq.vector = icr_low & APIC_VECTOR_MASK;
@@ -642,7 +641,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
ASSERT(apic != NULL);
/* if initial count is 0, current count should also be 0 */
- if (apic_get_reg(apic, APIC_TMICT) == 0)
+ if (kvm_apic_get_reg(apic, APIC_TMICT) == 0)
return 0;
remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
@@ -696,13 +695,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
val = apic_get_tmcct(apic);
break;
-
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = kvm_apic_get_reg(apic, offset);
+ break;
case APIC_TASKPRI:
report_tpr_access(apic, false);
/* fall thru */
default:
- apic_update_ppr(apic);
- val = apic_get_reg(apic, offset);
+ val = kvm_apic_get_reg(apic, offset);
break;
}
@@ -719,7 +720,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
{
unsigned char alignment = offset & 0xf;
u32 result;
- /* this bitmask has a bit cleared for each reserver register */
+ /* this bitmask has a bit cleared for each reserved register */
static const u64 rmask = 0x43ff01ffffffe70cULL;
if ((alignment + len) > 4) {
@@ -754,7 +755,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
{
- return apic_hw_enabled(apic) &&
+ return kvm_apic_hw_enabled(apic) &&
addr >= apic->base_address &&
addr < apic->base_address + LAPIC_MMIO_LENGTH;
}
@@ -777,7 +778,7 @@ static void update_divide_count(struct kvm_lapic *apic)
{
u32 tmp1, tmp2, tdcr;
- tdcr = apic_get_reg(apic, APIC_TDCR);
+ tdcr = kvm_apic_get_reg(apic, APIC_TDCR);
tmp1 = tdcr & 0xf;
tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
apic->divide_count = 0x1 << (tmp2 & 0x7);
@@ -792,9 +793,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- /* lapic timer in oneshot or peroidic mode */
+ /* lapic timer in oneshot or periodic mode */
now = apic->lapic_timer.timer.base->get_time();
- apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+ apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
if (!apic->lapic_timer.period)
@@ -826,7 +827,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
"timer initial count 0x%x, period %lldns, "
"expire @ 0x%016" PRIx64 ".\n", __func__,
APIC_BUS_CYCLE_NS, ktime_to_ns(now),
- apic_get_reg(apic, APIC_TMICT),
+ kvm_apic_get_reg(apic, APIC_TMICT),
apic->lapic_timer.period,
ktime_to_ns(ktime_add_ns(now,
apic->lapic_timer.period)));
@@ -858,7 +859,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
{
- int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+ int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0));
if (apic_lvt_nmi_mode(lvt0_val)) {
if (!nmi_wd_enabled) {
@@ -909,15 +910,15 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_SPIV: {
u32 mask = 0x3ff;
- if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
mask |= APIC_SPIV_DIRECTED_EOI;
- apic_set_reg(apic, APIC_SPIV, val & mask);
+ apic_set_spiv(apic, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
u32 lvt_val;
for (i = 0; i < APIC_LVT_NUM; i++) {
- lvt_val = apic_get_reg(apic,
+ lvt_val = kvm_apic_get_reg(apic,
APIC_LVTT + 0x10 * i);
apic_set_reg(apic, APIC_LVTT + 0x10 * i,
lvt_val | APIC_LVT_MASKED);
@@ -946,7 +947,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVT1:
case APIC_LVTERR:
/* TODO: Check vector */
- if (!apic_sw_enabled(apic))
+ if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
@@ -955,12 +956,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_LVTT:
- if ((apic_get_reg(apic, APIC_LVTT) &
+ if ((kvm_apic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask) !=
(val & apic->lapic_timer.timer_mode_mask))
hrtimer_cancel(&apic->lapic_timer.timer);
- if (!apic_sw_enabled(apic))
+ if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
apic_set_reg(apic, APIC_LVTT, val);
@@ -1039,24 +1040,30 @@ static int apic_mmio_write(struct kvm_io_device *this,
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- if (apic)
+ if (kvm_vcpu_has_lapic(vcpu))
apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
if (!vcpu->arch.apic)
return;
- hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+
+ if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
+ static_key_slow_dec_deferred(&apic_sw_disabled);
- if (vcpu->arch.apic->regs)
- free_page((unsigned long)vcpu->arch.apic->regs);
+ if (apic->regs)
+ free_page((unsigned long)apic->regs);
- kfree(vcpu->arch.apic);
+ kfree(apic);
}
/*
@@ -1068,10 +1075,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- return 0;
- if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ apic_lvtt_period(apic))
return 0;
return apic->lapic_timer.tscdeadline;
@@ -1080,10 +1086,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- return;
- if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ apic_lvtt_period(apic))
return;
hrtimer_cancel(&apic->lapic_timer.timer);
@@ -1095,20 +1100,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
+
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (apic_get_reg(apic, APIC_TASKPRI) & 4));
+ | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
u64 tpr;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return 0;
- tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+
+ tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
return (tpr & 0xf0) >> 4;
}
@@ -1123,6 +1129,14 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
return;
}
+ /* update jump label if enable bit changes */
+ if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if (value & MSR_IA32_APICBASE_ENABLE)
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+ else
+ static_key_slow_inc(&apic_hw_disabled.key);
+ }
+
if (!kvm_vcpu_is_bsp(apic->vcpu))
value &= ~MSR_IA32_APICBASE_BSP;
@@ -1164,7 +1178,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_set_reg(apic, APIC_DFR, 0xffffffffU);
- apic_set_reg(apic, APIC_SPIV, 0xff);
+ apic_set_spiv(apic, 0xff);
apic_set_reg(apic, APIC_TASKPRI, 0);
apic_set_reg(apic, APIC_LDR, 0);
apic_set_reg(apic, APIC_ESR, 0);
@@ -1183,7 +1197,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
if (kvm_vcpu_is_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ kvm_lapic_set_base(vcpu,
+ vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
@@ -1196,45 +1211,34 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base, apic->base_address);
}
-bool kvm_apic_present(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
-}
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
- return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
-}
-
/*
*----------------------------------------------------------------------
* timer interface
*----------------------------------------------------------------------
*/
-static bool lapic_is_periodic(struct kvm_timer *ktimer)
+static bool lapic_is_periodic(struct kvm_lapic *apic)
{
- struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
- lapic_timer);
return apic_lvtt_period(apic);
}
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *lapic = vcpu->arch.apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
- return atomic_read(&lapic->lapic_timer.pending);
+ if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
+ apic_lvt_enabled(apic, APIC_LVTT))
+ return atomic_read(&apic->lapic_timer.pending);
return 0;
}
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
- u32 reg = apic_get_reg(apic, lvt_type);
+ u32 reg = kvm_apic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
- if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
vector = reg & APIC_VECTOR_MASK;
mode = reg & APIC_MODE_MASK;
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
@@ -1251,15 +1255,40 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
kvm_apic_local_deliver(apic, APIC_LVT0);
}
-static struct kvm_timer_ops lapic_timer_ops = {
- .is_periodic = lapic_is_periodic,
-};
-
static const struct kvm_io_device_ops apic_mmio_ops = {
.read = apic_mmio_read,
.write = apic_mmio_write,
};
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially losing timer events in the !reinject
+ * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+ * in vcpu_enter_guest.
+ */
+ if (!atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ /* FIXME: this code should not know anything about vcpus */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ }
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+
+ if (lapic_is_periodic(apic)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -1283,14 +1312,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
- apic->lapic_timer.timer.function = kvm_timer_fn;
- apic->lapic_timer.t_ops = &lapic_timer_ops;
- apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu = vcpu;
+ apic->lapic_timer.timer.function = apic_timer_fn;
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+ /*
+ * APIC is created enabled. This will prevent kvm_lapic_set_base from
+ * thinking that APIC state has changed.
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+ kvm_lapic_set_base(vcpu,
+ APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
+ static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_lapic_reset(vcpu);
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@ -1306,23 +1338,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
- if (!apic || !apic_enabled(apic))
+ if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
return -1;
apic_update_ppr(apic);
highest_irr = apic_find_highest_irr(apic);
if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI)))
return -1;
return highest_irr;
}
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
- u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
+ u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (!apic_hw_enabled(vcpu->arch.apic))
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1334,7 +1366,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (atomic_read(&apic->lapic_timer.pending) > 0) {
if (kvm_apic_local_deliver(apic, APIC_LVTT))
atomic_dec(&apic->lapic_timer.pending);
}
@@ -1354,12 +1389,15 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
return vector;
}
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- apic->base_address = vcpu->arch.apic_base &
- MSR_IA32_APICBASE_BASE;
+ kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
@@ -1374,13 +1412,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
struct hrtimer *timer;
- if (!apic)
+ if (!kvm_vcpu_has_lapic(vcpu))
return;
- timer = &apic->lapic_timer.timer;
+ timer = &vcpu->arch.apic->lapic_timer.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
@@ -1478,7 +1515,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff;
max_irr = apic_find_highest_irr(apic);
if (max_irr < 0)
max_irr = 0;
@@ -1537,7 +1574,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return 1;
/* if this is ICR write vector before command */
@@ -1551,7 +1588,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 low, high = 0;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!kvm_vcpu_has_lapic(vcpu))
return 1;
if (apic_reg_read(apic, reg, 4, &low))
@@ -1576,3 +1613,10 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
addr);
}
+
+void kvm_lapic_init(void)
+{
+ /* do not patch jump label more than once per second */
+ jump_label_rate_limit(&apic_hw_disabled, HZ);
+ jump_label_rate_limit(&apic_sw_disabled, HZ);
+}
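apic_set_spiv() above keeps a deferred static key in step with the SPIV enable bit, so the inc/dec calls must balance across kvm_create_lapic(), guest toggles, and kvm_free_lapic(). A hedged userspace model of that bookkeeping, with the jump label reduced to a plain counter:

#include <stdio.h>

#define APIC_SPIV_APIC_ENABLED (1u << 8)

/* models the deferred static key: a plain counter here, a
 * code-patching jump label in the kernel */
static int apic_sw_disabled;
static unsigned int spiv_reg;

static void apic_set_spiv(unsigned int val)
{
	/* touch the key only when the enable bit actually flips */
	if ((spiv_reg ^ val) & APIC_SPIV_APIC_ENABLED) {
		if (val & APIC_SPIV_APIC_ENABLED)
			apic_sw_disabled--;  /* static_key_slow_dec_deferred() */
		else
			apic_sw_disabled++;  /* static_key_slow_inc() */
	}
	spiv_reg = val;
}

int main(void)
{
	apic_sw_disabled++;	/* kvm_create_lapic(): sw disabled at reset */
	apic_set_spiv(0xff | APIC_SPIV_APIC_ENABLED);	/* guest enables */
	apic_set_spiv(0xff);				/* guest disables */
	printf("sw-disabled count: %d\n", apic_sw_disabled);
	return 0;
}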
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4af5405ae1e..615a8b03016 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,10 +2,17 @@
#define __KVM_X86_LAPIC_H
#include "iodev.h"
-#include "kvm_timer.h"
#include <linux/kvm_host.h>
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ atomic_t pending; /* accumulated triggered timers */
+};
+
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
@@ -47,9 +54,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -71,4 +77,48 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
}
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
+void kvm_lapic_init(void);
+
+static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return *((u32 *) (apic->regs + reg_off));
+}
+
+extern struct static_key kvm_no_apic_vcpu;
+
+static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+{
+ if (static_key_false(&kvm_no_apic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+extern struct static_key_deferred apic_hw_disabled;
+
+static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_hw_disabled.key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return MSR_IA32_APICBASE_ENABLE;
+}
+
+extern struct static_key_deferred apic_sw_disabled;
+
+static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_sw_disabled.key))
+ return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+ return APIC_SPIV_APIC_ENABLED;
+}
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
#endif
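On the read side, kvm_apic_hw_enabled() and kvm_apic_sw_enabled() fold to constants while the corresponding key is unset, that is, while no APIC in the system is disabled. A rough userspace model of that short-circuit, with a plain bool standing in for static_key_false():

#include <stdbool.h>
#include <stdio.h>

#define MSR_IA32_APICBASE_ENABLE (1u << 11)

/* stands in for static_key_false(&apic_hw_disabled.key): false on the
 * fast path, flipped only while some vcpu runs with APIC hw-disabled */
static bool apic_hw_disabled_key;

static unsigned int apic_base = MSR_IA32_APICBASE_ENABLE;

static unsigned int apic_hw_enabled(void)
{
	if (apic_hw_disabled_key)		/* slow path: really check */
		return apic_base & MSR_IA32_APICBASE_ENABLE;
	return MSR_IA32_APICBASE_ENABLE;	/* fast path: constant */
}

int main(void)
{
	printf("enabled=%u\n", apic_hw_enabled());
	apic_hw_disabled_key = true;	/* some vcpu disabled its APIC */
	apic_base = 0;
	printf("enabled=%u\n", apic_hw_enabled());
	return 0;
}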
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7fbd0d273ea..399c177212b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
return 0;
pfn = spte_to_pfn(old_spte);
+
+ /*
+ * KVM does not hold a refcount on the pages used by the
+ * kvm mmu; before reclaiming a page, we should unmap it
+ * from the mmu first.
+ */
+ WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
@@ -960,13 +968,10 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
- struct kvm_lpage_info *linfo;
-
- if (likely(level == PT_PAGE_TABLE_LEVEL))
- return &slot->rmap[gfn - slot->base_gfn];
+ unsigned long idx;
- linfo = lpage_info_slot(gfn, slot, level);
- return &linfo->rmap_pde;
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}
/*
@@ -1173,7 +1178,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
unsigned long *rmapp;
while (mask) {
- rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+ rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
/* clear the first set bit */
@@ -1200,7 +1206,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1218,7 +1224,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1259,43 +1265,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
return 0;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data))
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ unsigned long *rmapp,
+ struct kvm_memory_slot *slot,
+ unsigned long data))
{
int j;
- int ret;
- int retval = 0;
+ int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
+ unsigned long hva_start, hva_end;
+ gfn_t gfn_start, gfn_end;
+
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+ */
+ gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- gfn_t gfn = memslot->base_gfn + gfn_offset;
+ for (j = PT_PAGE_TABLE_LEVEL;
+ j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+ unsigned long idx, idx_end;
+ unsigned long *rmapp;
- ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+ /*
+ * {idx(page_j) | page_j intersects with
+ * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+ */
+ idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
+ idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
- for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- struct kvm_lpage_info *linfo;
+ rmapp = __gfn_to_rmap(gfn_start, j, memslot);
- linfo = lpage_info_slot(gfn, memslot,
- PT_DIRECTORY_LEVEL + j);
- ret |= handler(kvm, &linfo->rmap_pde, data);
- }
- trace_kvm_age_page(hva, memslot, ret);
- retval |= ret;
+ for (; idx <= idx_end; ++idx)
+ ret |= handler(kvm, rmapp++, memslot, data);
}
}
- return retval;
+ return ret;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ struct kvm_memory_slot *slot,
+ unsigned long data))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
@@ -1303,13 +1333,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
@@ -1323,8 +1358,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
* This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages.
*/
- if (!shadow_accessed_mask)
- return kvm_unmap_rmapp(kvm, rmapp, data);
+ if (!shadow_accessed_mask) {
+ young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+ goto out;
+ }
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
@@ -1336,12 +1373,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
(unsigned long *)sptep);
}
}
-
+out:
+ /* @data is the hva passed to kvm_age_hva(). */
+ trace_kvm_age_page(data, slot, young);
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1379,13 +1418,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+ return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
@@ -2457,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
rmap_recycle(vcpu, sptep, gfn);
}
}
- kvm_release_pfn_clean(pfn);
+
+ if (!is_error_pfn(pfn))
+ kvm_release_pfn_clean(pfn);
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2469,17 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
struct kvm_memory_slot *slot;
- unsigned long hva;
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
- if (!slot) {
- get_page(fault_page);
- return page_to_pfn(fault_page);
- }
-
- hva = gfn_to_hva_memslot(slot, gfn);
+ if (!slot)
+ return KVM_PFN_ERR_FAULT;
- return hva_to_pfn_atomic(vcpu->kvm, hva);
+ return gfn_to_pfn_memslot_atomic(slot, gfn);
}
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
@@ -2611,8 +2647,16 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
{
- kvm_release_pfn_clean(pfn);
- if (is_hwpoison_pfn(pfn)) {
+ /*
+ * Do not cache the mmio info caused by writing the readonly gfn
+ * into the spte, otherwise a read access on the readonly gfn can
+ * also cause an mmio page fault and be treated as mmio access.
+ * Return 1 to tell kvm to emulate it.
+ */
+ if (pfn == KVM_PFN_ERR_RO_FAULT)
+ return 1;
+
+ if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
}
@@ -3236,8 +3280,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (!async)
return false; /* *pfn has correct page already */
- put_page(pfn_to_page(*pfn));
-
if (!prefault && can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
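The heart of kvm_handle_hva_range() above is the hva-to-gfn-to-index arithmetic that picks, per page-size level, exactly the rmap slots a host range overlaps. A worked example under the x86 assumption of 9 gfn bits per level; all values are made up:

#include <stdio.h>

#define PAGE_SHIFT 12
#define HPAGE_GFN_SHIFT(level) (((level) - 1) * 9)	/* x86 assumption */

typedef unsigned long long u64;

static u64 hva_to_gfn(u64 hva, u64 slot_hva, u64 base_gfn)
{
	return base_gfn + ((hva - slot_hva) >> PAGE_SHIFT);
}

static u64 gfn_to_idx(u64 gfn, u64 base_gfn, int level)
{
	return (gfn >> HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	u64 slot_hva = 0x7f0000000000ULL, base_gfn = 0x100000;
	u64 start = slot_hva + 0x1ff000, end = slot_hva + 0x202000;

	u64 gfn_start = hva_to_gfn(start, slot_hva, base_gfn);
	/* round the exclusive end up so a partially covered page counts */
	u64 gfn_end = hva_to_gfn(end + (1ULL << PAGE_SHIFT) - 1,
				 slot_hva, base_gfn);

	for (int level = 1; level <= 3; level++)
		printf("level %d: rmap idx %llu..%llu\n", level,
		       gfn_to_idx(gfn_start, base_gfn, level),
		       gfn_to_idx(gfn_end - 1, base_gfn, level));
	return 0;
}

At the 4K level this walks three slots; at the 2M and 1G levels it collapses to two and one, which is why the handler is called once per index rather than once per hva.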
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 7d7d0b9e23e..daff69e2115 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (is_error_pfn(pfn))
return;
- }
hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
@@ -190,7 +188,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_memory_slot *slot;
unsigned long *rmapp;
u64 *sptep;
struct rmap_iterator iter;
@@ -198,8 +195,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
if (sp->role.direct || sp->unsync || sp->role.invalid)
return;
- slot = gfn_to_memslot(kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+ rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bb7cf01cae7..bf8c42bf50f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -370,10 +370,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (mmu_invalid_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (mmu_invalid_pfn(pfn))
return;
- }
/*
* we call mmu_set_spte() with host_writable = true because that
@@ -448,10 +446,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gfn = gpte_to_gfn(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
pte_access & ACC_WRITE_MASK);
- if (mmu_invalid_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
+ if (mmu_invalid_pfn(pfn))
break;
- }
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
NULL, PT_PAGE_TABLE_LEVEL, gfn,
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 9b7ec1150ab..cfc258a6bf9 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,5 +1,5 @@
/*
- * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
*
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
*
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index baead950d6c..31be4a55744 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2063,7 +2063,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
if (svm->nested.intercept & 1ULL) {
/*
* The #vmexit can't be emulated here directly because this
- * code path runs with irqs and preemtion disabled. A
+ * code path runs with irqs and preemption disabled. A
* #vmexit emulation might sleep. Only signal request for
* the #vmexit here.
*/
@@ -2105,7 +2105,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
return kmap(page);
error:
- kvm_release_page_clean(page);
kvm_inject_gp(&svm->vcpu, 0);
return NULL;
@@ -2409,7 +2408,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
/*
* This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is omptimized in that it only merges the parts where
+ * nested vmcb. It is optimized in that it only merges the parts where
* the kvm msr permission bitmap may contain zero bits
*/
int i;
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
deleted file mode 100644
index 6b85cc647f3..00000000000
--- a/arch/x86/kvm/timer.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * timer support
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/hrtimer.h>
-#include <linux/atomic.h>
-#include "kvm_timer.h"
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_vcpu *vcpu = ktimer->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
-
- /*
- * There is a race window between reading and incrementing, but we do
- * not care about potentially losing timer events in the !reinject
- * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
- * in vcpu_enter_guest.
- */
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- }
-
- if (waitqueue_active(q))
- wake_up_interruptible(q);
-
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- return HRTIMER_RESTART;
- } else
- return HRTIMER_NORESTART;
-}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c00f03de1b7..13e0296cea4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -596,10 +596,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
return NULL;
- }
+
return page;
}
@@ -1343,7 +1342,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer = vmx->vcpu.arch.efer;
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+ * NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
@@ -1991,7 +1990,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
#endif
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
- CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
@@ -3254,7 +3253,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
* qemu binaries.
* IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu
- * is setting it to 0 in the usedland code. This causes invalid guest
+ * is setting it to 0 in the userland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this
@@ -4439,7 +4438,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
if (to_vmx(vcpu)->nested.vmxon &&
@@ -6223,6 +6222,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long debugctlmsr;
if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6262,6 +6262,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
atomic_switch_perf_msrs(vmx);
+ debugctlmsr = get_debugctlmsr();
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
@@ -6363,6 +6364,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (debugctlmsr)
+ update_debugctlmsr(debugctlmsr);
+
#ifndef CONFIG_X86_64
/*
* The sysexit path does not restore ds/es, so we must set them to
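The debugctlmsr handling added to vmx_vcpu_run() is a plain save/conditional-restore: hardware zeroes MSR_IA32_DEBUGCTLMSR on vmexit, and the host value is written back only when it was nonzero, keeping the common case to a single read. A toy model of the control flow, with the MSR reduced to a variable:

#include <stdio.h>

static unsigned long msr_debugctl = 0x1;	/* pretend LBR is enabled */

static void vmenter_vmexit(void)
{
	msr_debugctl = 0;	/* hardware zeroes DEBUGCTL on vmexit */
}

int main(void)
{
	unsigned long debugctlmsr = msr_debugctl;	/* get_debugctlmsr() */

	vmenter_vmexit();

	if (debugctlmsr)			/* restore only if in use */
		msr_debugctl = debugctlmsr;	/* update_debugctlmsr() */

	printf("DEBUGCTL after exit: %#lx\n", msr_debugctl);
	return 0;
}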
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dce75b76031..42bbf4187d2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore)
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->arch.apic_base;
- else
- return vcpu->arch.apic_base;
+ return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
/* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- else
- vcpu->arch.apic_base = data;
+ kvm_lapic_set_base(vcpu, data);
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
@@ -1097,7 +1091,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
- * exact software computaion in compute_guest_tsc()
+ * exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
@@ -1140,6 +1134,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
unsigned long this_tsc_khz;
s64 kernel_ns, max_kernel_ns;
u64 tsc_timestamp;
+ u8 pvclock_flags;
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -1221,7 +1216,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
- vcpu->hv_clock.flags = 0;
+
+ pvclock_flags = 0;
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ vcpu->hv_clock.flags = pvclock_flags;
/*
* The interface expects us to write an even number signaling that the
@@ -1504,7 +1506,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
- /* Bits 2:5 are resrved, Should be zero */
+ /* Bits 2:5 are reserved, should be zero */
if (data & 0x3c)
return 1;
@@ -1639,10 +1641,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
- if (is_error_page(vcpu->arch.time_page)) {
- kvm_release_page_clean(vcpu->arch.time_page);
+ if (is_error_page(vcpu->arch.time_page))
vcpu->arch.time_page = NULL;
- }
+
break;
}
case MSR_KVM_ASYNC_PF_EN:
@@ -1727,7 +1728,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
* Ignore all writes to this no longer documented MSR.
* Writes are only relevant for old K7 processors,
* all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to speicify the
+ * AMD for these chips. It is possible to specify the
* affected processor models on the command line, hence
* the need to ignore the workaround.
*/
@@ -2174,6 +2175,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_READONLY_MEM:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2355,8 +2357,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
+ kvm_apic_post_state_restore(vcpu, s);
update_cr8_intercept(vcpu);
return 0;
@@ -2632,11 +2633,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
- struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
if (!vcpu->arch.time_page)
return -EINVAL;
- src->flags |= PVCLOCK_GUEST_STOPPED;
- mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+ vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0;
}
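kvm_set_guest_paused() now only records a request; PVCLOCK_GUEST_STOPPED is published by the next kvm_guest_time_update(), which rebuilds hv_clock.flags from scratch, so the bit self-clears after one update. A small model of that handshake (the flag value is assumed for illustration, not taken from the headers here):

#include <stdbool.h>
#include <stdio.h>

#define PVCLOCK_GUEST_STOPPED (1 << 1)	/* value assumed for illustration */

static bool stopped_request;
static unsigned char hv_clock_flags;

/* KVM_KVMCLOCK_CTRL path: record the request, ask for a clock update */
static void set_guest_paused(void)
{
	stopped_request = true;
	/* kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu) in the real code */
}

/* kvm_guest_time_update(): flags are rebuilt from scratch each time,
 * so the bit is published exactly once */
static void guest_time_update(void)
{
	unsigned char flags = 0;

	if (stopped_request) {
		flags |= PVCLOCK_GUEST_STOPPED;
		stopped_request = false;
	}
	hv_clock_flags = flags;
}

int main(void)
{
	set_guest_paused();
	guest_time_update();
	printf("flags=%#x\n", hv_clock_flags);
	guest_time_update();
	printf("flags=%#x\n", hv_clock_flags);	/* bit self-clears */
	return 0;
}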
@@ -3087,7 +3086,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
if (!kvm->arch.vpit)
return -ENXIO;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -3170,6 +3169,16 @@ out:
return r;
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -3276,29 +3285,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
create_pit_unlock:
mutex_unlock(&kvm->slots_lock);
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip *chip;
@@ -3959,10 +3945,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
goto emul_write;
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
goto emul_write;
- }
kaddr = kmap_atomic(page);
kaddr += offset_in_page(gpa);
@@ -4490,13 +4474,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa;
+ pfn_t pfn;
if (tdp_enabled)
return false;
/*
* if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-entetr the
+ * and it failed, try to unshadow the page and re-enter the
* guest to let CPU execute the instruction.
*/
if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@ -4507,8 +4492,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA)
return true; /* let cpu generate fault */
- if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+ /*
+ * Do not retry the unhandleable instruction if it faults on the
+ * readonly host memory, otherwise it will go into an infinite loop:
+ * retry instruction -> write #PF -> emulation fail -> retry
+ * instruction -> ...
+ */
+ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+ if (!is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
return true;
+ }
return false;
}
@@ -4926,6 +4920,7 @@ int kvm_arch_init(void *opaque)
if (cpu_has_xsave)
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_lapic_init();
return 0;
out:
@@ -5592,7 +5587,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
/*
* We are here if userspace calls get_regs() in the middle of
* instruction emulation. Registers state needs to be copied
- * back from emulation context to vcpu. Usrapace shouldn't do
+ * back from emulation context to vcpu. Userspace shouldn't do
* that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work
*/
@@ -6121,7 +6116,7 @@ int kvm_arch_hardware_enable(void *garbage)
* as we reset last_host_tsc on all VCPUs to stop this from being
* called multiple times (one for each physical CPU bringup).
*
- * Platforms with unnreliable TSCs don't have to deal with this, they
+ * Platforms with unreliable TSCs don't have to deal with this, they
* will be compensated by the logic in vcpu_load, which sets the TSC to
* catchup mode. This will catchup all VCPUs to real time, but cannot
* guarantee that they stay in perfect synchronization.
@@ -6174,6 +6169,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
}
+struct static_key kvm_no_apic_vcpu __read_mostly;
+
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
@@ -6206,7 +6203,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
- }
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
GFP_KERNEL);
@@ -6246,6 +6244,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
+ if (!irqchip_in_kernel(vcpu->kvm))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -6317,10 +6317,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
{
int i;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
- kvm_kvfree(free->arch.lpage_info[i]);
- free->arch.lpage_info[i] = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+ kvm_kvfree(free->arch.rmap[i]);
+ free->arch.rmap[i] = NULL;
+ }
+ if (i == 0)
+ continue;
+
+ if (!dont || free->arch.lpage_info[i - 1] !=
+ dont->arch.lpage_info[i - 1]) {
+ kvm_kvfree(free->arch.lpage_info[i - 1]);
+ free->arch.lpage_info[i - 1] = NULL;
}
}
}
@@ -6329,23 +6337,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
{
int i;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
unsigned long ugfn;
int lpages;
- int level = i + 2;
+ int level = i + 1;
lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1;
- slot->arch.lpage_info[i] =
- kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
- if (!slot->arch.lpage_info[i])
+ slot->arch.rmap[i] =
+ kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+ if (!slot->arch.rmap[i])
+ goto out_free;
+ if (i == 0)
+ continue;
+
+ slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
+ sizeof(*slot->arch.lpage_info[i - 1]));
+ if (!slot->arch.lpage_info[i - 1])
goto out_free;
if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][0].write_count = 1;
+ slot->arch.lpage_info[i - 1][0].write_count = 1;
if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+ slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
@@ -6357,16 +6372,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
unsigned long j;
for (j = 0; j < lpages; ++j)
- slot->arch.lpage_info[i][j].write_count = 1;
+ slot->arch.lpage_info[i - 1][j].write_count = 1;
}
}
return 0;
out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- kvm_kvfree(slot->arch.lpage_info[i]);
- slot->arch.lpage_info[i] = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvm_kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+ if (i == 0)
+ continue;
+
+ kvm_kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
}
return -ENOMEM;
}
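The allocation loop above sizes one rmap array per page-size level but lpage_info only for the large levels, hence the i - 1 indexing. A hedged example of the per-level sizing arithmetic, again assuming the x86 9-bits-per-level layout:

#include <stdio.h>

#define HPAGE_GFN_SHIFT(level) (((level) - 1) * 9)	/* x86 assumption */

static unsigned long gfn_to_idx(unsigned long gfn, unsigned long base,
				int level)
{
	return (gfn >> HPAGE_GFN_SHIFT(level)) -
	       (base >> HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	unsigned long base_gfn = 0xfe000, npages = 0x3000;

	/* rmap[] gets one array per level (i == 0 is the 4K level);
	 * lpage_info[] only exists for the large-page levels */
	for (int i = 0; i < 3; i++) {
		int level = i + 1;
		unsigned long lpages =
			gfn_to_idx(base_gfn + npages - 1, base_gfn, level) + 1;

		printf("level %d: %lu rmap entries%s\n", level, lpages,
		       i ? ", same count of lpage_info" : "");
	}
	return 0;
}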
@@ -6385,10 +6405,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
- *x86 needs to hanlde !user_alloc case.
+ *x86 needs to handle !user_alloc case.
*/
if (!user_alloc) {
- if (npages && !old.rmap) {
+ if (npages && !old.npages) {
unsigned long userspace_addr;
userspace_addr = vm_mmap(NULL, 0,
@@ -6416,7 +6436,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
- if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+ if (!user_alloc && !old.user_alloc && old.npages && !npages) {
int ret;
ret = vm_munmap(old.userspace_addr,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3d1134ddb88..2b5219c12ac 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
extern u64 host_xcr0;
+extern struct static_key kvm_no_apic_vcpu;
#endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa7d3b..d808694673f 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -101,9 +101,13 @@ struct kvm_userspace_memory_region {
__u64 userspace_addr; /* start of the userspace allocated memory */
};
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES 1UL
-#define KVM_MEMSLOT_INVALID (1UL << 1)
+/*
+ * Bits 0 ~ 15 of kvm_memory_region::flags are visible to userspace;
+ * the other bits are reserved for kvm internal use and are defined
+ * in include/linux/kvm_host.h.
+ */
+#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
+#define KVM_MEM_READONLY (1UL << 1)
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
@@ -618,6 +622,9 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_PPC_GET_SMMU_INFO 78
#define KVM_CAP_S390_COW 79
#define KVM_CAP_PPC_ALLOC_HTAB 80
+#ifdef __KVM_HAVE_READONLY_MEM
+#define KVM_CAP_READONLY_MEM 81
+#endif
#ifdef KVM_CAP_IRQ_ROUTING
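With the flag space split, a quick way to see the contract is a mask check: userspace may only pass bits 0 ~ 15, while bits like KVM_MEMSLOT_INVALID live above. A sketch; the mask name is made up, not a kvm define:

#include <stdio.h>

#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)	/* userspace-visible */
#define KVM_MEMSLOT_INVALID	(1UL << 16)	/* kernel-internal */

#define USER_FLAGS_MASK		0xffffUL	/* illustrative name */

int main(void)
{
	unsigned long flags = KVM_MEM_LOG_DIRTY_PAGES | KVM_MEMSLOT_INVALID;

	if (flags & ~USER_FLAGS_MASK)
		printf("flags %#lx carry kernel-internal bits\n", flags);
	return 0;
}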
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b70b48b0109..5972c9845dd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -21,6 +21,7 @@
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/ratelimit.h>
+#include <linux/err.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -35,6 +36,13 @@
#endif
/*
+ * Bits 16 ~ 31 of kvm_memory_region::flags are used internally by
+ * kvm; the other bits are visible to userspace and are defined in
+ * include/linux/kvm.h.
+ */
+#define KVM_MEMSLOT_INVALID (1UL << 16)
+
+/*
* If we support unaligned MMIO, at most one fragment will be split into two:
*/
#ifdef KVM_UNALIGNED_MMIO
@@ -49,6 +57,47 @@
(KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
/*
+ * For a normal pfn, the highest 12 bits should be zero, so we
+ * can use these bits to encode the error.
+ */
+#define KVM_PFN_ERR_MASK (0xfffULL << 52)
+
+#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK)
+#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
+#define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2)
+#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3)
+
+static inline bool is_error_pfn(pfn_t pfn)
+{
+ return !!(pfn & KVM_PFN_ERR_MASK);
+}
+
+static inline bool is_noslot_pfn(pfn_t pfn)
+{
+ return pfn == KVM_PFN_ERR_BAD;
+}
+
+static inline bool is_invalid_pfn(pfn_t pfn)
+{
+ return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
+}
+
+#define KVM_HVA_ERR_BAD (PAGE_OFFSET)
+#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE)
+
+static inline bool kvm_is_error_hva(unsigned long addr)
+{
+ return addr >= PAGE_OFFSET;
+}
+
+#define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT))
+
+static inline bool is_error_page(struct page *page)
+{
+ return IS_ERR(page);
+}
+
+/*
* vcpu->requests bit members
*/
#define KVM_REQ_TLB_FLUSH 0
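
A compact userspace restatement of the encoding introduced above, with PFN_ERR_* constants standing in for the KVM_PFN_ERR_* values: any pfn with a bit set in the high-12-bit mask is an error, the "bad" value alone means no slot, and the remaining error values count as invalid.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t pfn_t;

/* a valid pfn never has the top 12 bits set, so they can carry error codes */
#define PFN_ERR_MASK      (0xfffULL << 52)
#define PFN_ERR_FAULT     (PFN_ERR_MASK)
#define PFN_ERR_HWPOISON  (PFN_ERR_MASK + 1)
#define PFN_ERR_BAD       (PFN_ERR_MASK + 2)

static bool is_error_pfn(pfn_t pfn)  { return pfn & PFN_ERR_MASK; }
static bool is_noslot_pfn(pfn_t pfn) { return pfn == PFN_ERR_BAD; }
static bool is_invalid_pfn(pfn_t pfn)
{
	return is_error_pfn(pfn) && !is_noslot_pfn(pfn);
}

int main(void)
{
	assert(!is_error_pfn(0x1234));          /* ordinary pfn */
	assert(is_error_pfn(PFN_ERR_HWPOISON)); /* any error value matches the mask */
	assert(is_noslot_pfn(PFN_ERR_BAD) && !is_invalid_pfn(PFN_ERR_BAD));
	assert(is_invalid_pfn(PFN_ERR_FAULT));
	return 0;
}
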
@@ -183,6 +232,18 @@ struct kvm_vcpu {
} async_pf;
#endif
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+ /*
+	 * Cpu relax intercept or pause loop exit optimization.
+	 * in_spin_loop: set when a vcpu does a pause loop exit
+	 *  or has its cpu relax intercepted.
+	 * dy_eligible: indicates whether the vcpu is eligible for directed yield.
+ */
+ struct {
+ bool in_spin_loop;
+ bool dy_eligible;
+ } spin_loop;
+#endif
struct kvm_vcpu_arch arch;
};
@@ -201,7 +262,6 @@ struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
unsigned long flags;
- unsigned long *rmap;
unsigned long *dirty_bitmap;
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr;
@@ -378,23 +438,6 @@ id_to_memslot(struct kvm_memslots *slots, int id)
return slot;
}
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-
-extern struct page *bad_page;
-extern struct page *fault_page;
-
-extern pfn_t bad_pfn;
-extern pfn_t fault_pfn;
-
-int is_error_page(struct page *page);
-int is_error_pfn(pfn_t pfn);
-int is_hwpoison_pfn(pfn_t pfn);
-int is_fault_pfn(pfn_t pfn);
-int is_noslot_pfn(pfn_t pfn);
-int is_invalid_pfn(pfn_t pfn);
-int kvm_is_error_hva(unsigned long addr);
int kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc);
@@ -422,21 +465,22 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn);
-void kvm_release_pfn_dirty(pfn_t);
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+
+void kvm_release_pfn_dirty(pfn_t pfn);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
void kvm_set_pfn_accessed(pfn_t pfn);
@@ -494,6 +538,7 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
struct
kvm_userspace_memory_region *mem,
int user_alloc);
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
@@ -573,7 +618,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_is_mmio_pfn(pfn_t pfn);
+bool kvm_is_mmio_pfn(pfn_t pfn);
struct kvm_irq_ack_notifier {
struct hlist_node link;
@@ -740,10 +785,12 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
-static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
- gfn_t gfn)
+static inline gfn_t
+hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
{
- return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+ gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+
+ return slot->base_gfn + gfn_offset;
}
static inline gpa_t gfn_to_gpa(gfn_t gfn)
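
A small self-contained check that the new hva_to_gfn_memslot() really is the inverse of the gfn_to_hva_memslot() arithmetic it replaces in this header; the slot layout values are made up.

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct slot { uint64_t base_gfn; unsigned long userspace_addr; };

static unsigned long gfn_to_hva(struct slot *s, uint64_t gfn)
{
	return s->userspace_addr + (gfn - s->base_gfn) * PAGE_SIZE;
}

static uint64_t hva_to_gfn(struct slot *s, unsigned long hva)
{
	return s->base_gfn + ((hva - s->userspace_addr) >> PAGE_SHIFT);
}

int main(void)
{
	struct slot s = { .base_gfn = 0x100, .userspace_addr = 0x7f0000000000UL };

	/* the two conversions are inverses for any gfn inside the slot */
	assert(hva_to_gfn(&s, gfn_to_hva(&s, 0x123)) == 0x123);
	return 0;
}
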
@@ -899,5 +946,32 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
}
}
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+ vcpu->spin_loop.in_spin_loop = val;
+}
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+ vcpu->spin_loop.dy_eligible = val;
+}
+
+#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+ return true;
+}
+
+#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
#endif
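
The #else stubs above follow the usual Kconfig-gating idiom: when the option is off, empty inline functions keep call sites free of #ifdefs (and the eligibility check degenerates to "always eligible"). A minimal sketch of the pattern, with HAVE_SPIN_LOOP_TRACKING standing in for the config symbol:

#include <stdbool.h>

#define HAVE_SPIN_LOOP_TRACKING 1	/* mimic the config option being set */

struct vcpu {
#if HAVE_SPIN_LOOP_TRACKING
	struct { bool in_spin_loop; bool dy_eligible; } spin_loop;
#endif
};

#if HAVE_SPIN_LOOP_TRACKING
static inline void set_in_spin_loop(struct vcpu *v, bool val)
{
	v->spin_loop.in_spin_loop = val;
}
#else
/* no state to track: the empty stub keeps call sites unconditional */
static inline void set_in_spin_loop(struct vcpu *v, bool val) { }
#endif

int main(void)
{
	struct vcpu v = { 0 };

	set_in_spin_loop(&v, true);	/* compiles either way */
	return 0;
}
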
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 43049192b5e..60f48fa0fd0 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key,
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
+EXPORT_SYMBOL_GPL(jump_label_rate_limit);
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 28694f4a913..d01b24b72c6 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
config HAVE_KVM_MSI
bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+ bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4c2ee..ea475cd0351 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
list_entry(vcpu->async_pf.done.next,
typeof(*work), link);
list_del(&work->link);
- if (work->page)
- put_page(work->page);
+ if (!is_error_page(work->page))
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
list_del(&work->queue);
vcpu->async_pf.queued--;
- if (work->page)
- put_page(work->page);
+ if (!is_error_page(work->page))
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
}
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
if (!work)
return -ENOMEM;
- work->page = bad_page;
- get_page(bad_page);
+ work->page = KVM_ERR_PTR_BAD_PAGE;
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ef61d529a6c..cfb7e4d52dc 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -197,28 +197,29 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
u32 old_irr;
u32 mask = 1 << irq;
union kvm_ioapic_redirect_entry entry;
- int ret = 1;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
spin_lock(&ioapic->lock);
old_irr = ioapic->irr;
- if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
- int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
- irq_source_id, level);
- entry = ioapic->redirtbl[irq];
- irq_level ^= entry.fields.polarity;
- if (!irq_level)
- ioapic->irr &= ~mask;
- else {
- int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
- ioapic->irr |= mask;
- if ((edge && old_irr != ioapic->irr) ||
- (!edge && !entry.fields.remote_irr))
- ret = ioapic_service(ioapic, irq);
- else
- ret = 0; /* report coalesced interrupt */
- }
- trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ entry = ioapic->redirtbl[irq];
+ irq_level ^= entry.fields.polarity;
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ } else {
+ int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+ ioapic->irr |= mask;
+ if ((edge && old_irr != ioapic->irr) ||
+ (!edge && !entry.fields.remote_irr))
+ ret = ioapic_service(ioapic, irq);
+ else
+ ret = 0; /* report coalesced interrupt */
}
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
spin_unlock(&ioapic->lock);
return ret;
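
A simplified restatement of the delivery decision in the rewritten function: edge-triggered pins fire only when the IRR actually changes, level-triggered pins are held off while remote_irr marks the previous interrupt as unacknowledged, and everything else is reported as coalesced. The helper below models only that decision, not the full redirection-table state.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool should_service(bool edge, uint32_t old_irr, uint32_t new_irr,
			   bool remote_irr)
{
	if (edge)
		return old_irr != new_irr;	/* fire only on an IRR transition */
	return !remote_irr;			/* level: wait for the EOI */
}

int main(void)
{
	assert(should_service(true, 0x0, 0x1, false));   /* new edge */
	assert(!should_service(true, 0x1, 0x1, false));  /* coalesced edge */
	assert(!should_service(false, 0x0, 0x1, true));  /* level, unacked */
	assert(should_service(false, 0x1, 0x1, false));  /* level, acked */
	return 0;
}
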
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index e9fff9830bf..037cb6730e6 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, unsigned long size)
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+ unsigned long size)
{
gfn_t end_gfn;
pfn_t pfn;
- pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + (size >> PAGE_SHIFT);
gfn += 1;
@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
return pfn;
while (gfn < end_gfn)
- gfn_to_pfn_memslot(kvm, slot, gfn++);
+ gfn_to_pfn_memslot(slot, gfn++);
return pfn;
}
@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
* Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later.
*/
- pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+ pfn = kvm_pin_pages(slot, gfn, page_size);
if (is_error_pfn(pfn)) {
gfn += 1;
continue;
@@ -300,6 +300,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
/* Get physical address */
phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+
+ if (!phys) {
+ gfn++;
+ continue;
+ }
+
pfn = phys >> PAGE_SHIFT;
/* Unmap address from IO address space */
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 83402d74a76..7118be0f2f2 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -321,11 +321,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
break;
case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 246852397e3..3416f8a31f6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-struct page *fault_page;
-pfn_t fault_pfn;
-
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
int reserved;
@@ -236,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
}
vcpu->run = page_address(page);
+ kvm_vcpu_set_in_spin_loop(vcpu, false);
+ kvm_vcpu_set_dy_eligible(vcpu, false);
+
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
@@ -332,8 +329,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- for (; start < end; start += PAGE_SIZE)
- need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -551,16 +547,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
- if (!dont || free->rmap != dont->rmap)
- vfree(free->rmap);
-
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kvm_destroy_dirty_bitmap(free);
kvm_arch_free_memslot(free, dont);
free->npages = 0;
- free->rmap = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
@@ -686,6 +678,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
slots->generation++;
}
+static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+{
+ u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+#ifdef KVM_CAP_READONLY_MEM
+ valid_flags |= KVM_MEM_READONLY;
+#endif
+
+ if (mem->flags & ~valid_flags)
+ return -EINVAL;
+
+ return 0;
+}
+
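
check_memory_region_flags() uses the standard mask-validation idiom: build the set of bits we understand, then reject anything outside it so future flags fail cleanly on old kernels instead of being silently ignored. A tiny standalone version, with SUPPORT_READONLY standing in for the KVM_CAP_READONLY_MEM conditional:

#include <assert.h>
#include <errno.h>
#include <stdint.h>

#define FLAG_LOG_DIRTY (1u << 0)
#define FLAG_READONLY  (1u << 1)
#define SUPPORT_READONLY 1	/* stands in for KVM_CAP_READONLY_MEM */

static int check_flags(uint32_t flags)
{
	uint32_t valid = FLAG_LOG_DIRTY;

#if SUPPORT_READONLY
	valid |= FLAG_READONLY;
#endif
	/* any bit outside the known set fails cleanly with -EINVAL */
	if (flags & ~valid)
		return -EINVAL;
	return 0;
}

int main(void)
{
	assert(check_flags(FLAG_LOG_DIRTY | FLAG_READONLY) == 0);
	assert(check_flags(1u << 5) == -EINVAL);
	return 0;
}
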
/*
* Allocate some memory and give it an address in the guest physical address
* space.
@@ -706,6 +712,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_memory_slot old, new;
struct kvm_memslots *slots, *old_memslots;
+ r = check_memory_region_flags(mem);
+ if (r)
+ goto out;
+
r = -EINVAL;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
@@ -769,11 +779,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (npages && !old.npages) {
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
-#ifndef CONFIG_S390
- new.rmap = vzalloc(npages * sizeof(*new.rmap));
- if (!new.rmap)
- goto out_free;
-#endif /* not defined CONFIG_S390 */
+
if (kvm_arch_create_memslot(&new, npages))
goto out_free;
}
@@ -832,7 +838,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* actual memory is freed via old in kvm_free_physmem_slot below */
if (!npages) {
- new.rmap = NULL;
new.dirty_bitmap = NULL;
memset(&new.arch, 0, sizeof(new.arch));
}
@@ -932,53 +937,6 @@ void kvm_disable_largepages(void)
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);
-int is_error_page(struct page *page)
-{
- return page == bad_page || page == hwpoison_page || page == fault_page;
-}
-EXPORT_SYMBOL_GPL(is_error_page);
-
-int is_error_pfn(pfn_t pfn)
-{
- return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_error_pfn);
-
-int is_hwpoison_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
-int is_fault_pfn(pfn_t pfn)
-{
- return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
-int is_noslot_pfn(pfn_t pfn)
-{
- return pfn == bad_pfn;
-}
-EXPORT_SYMBOL_GPL(is_noslot_pfn);
-
-int is_invalid_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_invalid_pfn);
-
-static inline unsigned long bad_hva(void)
-{
- return PAGE_OFFSET;
-}
-
-int kvm_is_error_hva(unsigned long addr)
-{
- return addr == bad_hva();
-}
-EXPORT_SYMBOL_GPL(kvm_is_error_hva);
-
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1021,28 +979,68 @@ out:
return size;
}
-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
- gfn_t *nr_pages)
+static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+{
+ return slot->flags & KVM_MEM_READONLY;
+}
+
+static unsigned long __gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
+static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return bad_hva();
+ return KVM_HVA_ERR_BAD;
+
+ if (memslot_is_readonly(slot) && write)
+ return KVM_HVA_ERR_RO_BAD;
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);
- return gfn_to_hva_memslot(slot, gfn);
+ return __gfn_to_hva_memslot(slot, gfn);
+}
+
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages)
+{
+ return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ return gfn_to_hva_many(slot, gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
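
The two KVM_HVA_ERR_* sentinels rely on userspace addresses always lying below PAGE_OFFSET, so a single range check classifies both error values while their exact values stay distinguishable. A sketch of the scheme, assuming a 64-bit build and an illustrative x86-64 PAGE_OFFSET:

#include <assert.h>
#include <stdbool.h>

#define PAGE_OFFSET    0xffff880000000000UL
#define PAGE_SIZE      4096UL
#define HVA_ERR_BAD    (PAGE_OFFSET)		/* no slot */
#define HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE)	/* write to read-only slot */

static bool is_error_hva(unsigned long addr)
{
	return addr >= PAGE_OFFSET;	/* one range check covers both */
}

int main(void)
{
	assert(is_error_hva(HVA_ERR_BAD));
	assert(is_error_hva(HVA_ERR_RO_BAD));
	assert(!is_error_hva(0x7f0000001000UL));	/* ordinary userspace hva */
	return 0;
}
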
-static pfn_t get_fault_pfn(void)
+/*
+ * The hva returned by this function may only be read from.
+ * It should be paired with kvm_read_hva() or kvm_read_hva_atomic().
+ */
+static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
{
- get_page(fault_page);
- return fault_pfn;
+ return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
+}
+
+static int kvm_read_hva(void *data, void __user *hva, int len)
+{
+ return __copy_from_user(data, hva, len);
+}
+
+static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
+{
+ return __copy_from_user_inatomic(data, hva, len);
}
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1065,108 +1063,186 @@ static inline int check_user_page_hwpoison(unsigned long addr)
return rc == -EHWPOISON;
}
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
- bool *async, bool write_fault, bool *writable)
+/*
+ * The fast, atomic path to get a writable pfn, stored in @pfn on success.
+ * Returns true on success, false otherwise.
+ */
+static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable, pfn_t *pfn)
{
struct page *page[1];
- int npages = 0;
- pfn_t pfn;
-
- /* we can do it either atomically or asynchronously, not both */
- BUG_ON(atomic && async);
-
- BUG_ON(!write_fault && !writable);
+ int npages;
- if (writable)
- *writable = true;
+ if (!(async || atomic))
+ return false;
- if (atomic || async)
- npages = __get_user_pages_fast(addr, 1, 1, page);
+ /*
+	 * Fast-pin a writable pfn only if this is a write fault request
+	 * or the caller allows a writable pfn to be mapped for a read
+	 * fault request.
+ */
+ if (!(write_fault || writable))
+ return false;
- if (unlikely(npages != 1) && !atomic) {
- might_sleep();
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (npages == 1) {
+ *pfn = page_to_pfn(page[0]);
if (writable)
- *writable = write_fault;
-
- if (async) {
- down_read(&current->mm->mmap_sem);
- npages = get_user_page_nowait(current, current->mm,
- addr, write_fault, page);
- up_read(&current->mm->mmap_sem);
- } else
- npages = get_user_pages_fast(addr, 1, write_fault,
- page);
-
- /* map read fault as writable if possible */
- if (unlikely(!write_fault) && npages == 1) {
- struct page *wpage[1];
-
- npages = __get_user_pages_fast(addr, 1, 1, wpage);
- if (npages == 1) {
- *writable = true;
- put_page(page[0]);
- page[0] = wpage[0];
- }
- npages = 1;
- }
+ *writable = true;
+ return true;
}
- if (unlikely(npages != 1)) {
- struct vm_area_struct *vma;
+ return false;
+}
- if (atomic)
- return get_fault_pfn();
+/*
+ * The slow path to get the pfn of the specified host virtual address.
+ * Returns 1 on success, or -errno if an error is detected.
+ */
+static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+ bool *writable, pfn_t *pfn)
+{
+ struct page *page[1];
+ int npages = 0;
- down_read(&current->mm->mmap_sem);
- if (npages == -EHWPOISON ||
- (!async && check_user_page_hwpoison(addr))) {
- up_read(&current->mm->mmap_sem);
- get_page(hwpoison_page);
- return page_to_pfn(hwpoison_page);
- }
+ might_sleep();
- vma = find_vma_intersection(current->mm, addr, addr+1);
-
- if (vma == NULL)
- pfn = get_fault_pfn();
- else if ((vma->vm_flags & VM_PFNMAP)) {
- pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- BUG_ON(!kvm_is_mmio_pfn(pfn));
- } else {
- if (async && (vma->vm_flags & VM_WRITE))
- *async = true;
- pfn = get_fault_pfn();
- }
+ if (writable)
+ *writable = write_fault;
+
+ if (async) {
+ down_read(&current->mm->mmap_sem);
+ npages = get_user_page_nowait(current, current->mm,
+ addr, write_fault, page);
up_read(&current->mm->mmap_sem);
} else
- pfn = page_to_pfn(page[0]);
+ npages = get_user_pages_fast(addr, 1, write_fault,
+ page);
+ if (npages != 1)
+ return npages;
+
+ /* map read fault as writable if possible */
+ if (unlikely(!write_fault) && writable) {
+ struct page *wpage[1];
+
+ npages = __get_user_pages_fast(addr, 1, 1, wpage);
+ if (npages == 1) {
+ *writable = true;
+ put_page(page[0]);
+ page[0] = wpage[0];
+ }
+ npages = 1;
+ }
+ *pfn = page_to_pfn(page[0]);
+ return npages;
+}
+
+static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+{
+ if (unlikely(!(vma->vm_flags & VM_READ)))
+ return false;
+
+ if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+ return false;
+
+ return true;
+}
+
+/*
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: if true, this function must not sleep
+ * @async: whether this function needs to wait for IO to complete if the
+ * host page is not in memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether mapping a writable host page is allowed for !@write_fault
+ *
+ * The function will map a writable host page in two cases:
+ * 1) @write_fault = true
+ * 2) @write_fault = false && @writable != NULL, in which case *@writable
+ * tells the caller whether the mapping is writable.
+ */
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable)
+{
+ struct vm_area_struct *vma;
+ pfn_t pfn = 0;
+ int npages;
+
+ /* we can do it either atomically or asynchronously, not both */
+ BUG_ON(atomic && async);
+
+ if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+ return pfn;
+
+ if (atomic)
+ return KVM_PFN_ERR_FAULT;
+
+ npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+ if (npages == 1)
+ return pfn;
+
+ down_read(&current->mm->mmap_sem);
+ if (npages == -EHWPOISON ||
+ (!async && check_user_page_hwpoison(addr))) {
+ pfn = KVM_PFN_ERR_HWPOISON;
+ goto exit;
+ }
+
+ vma = find_vma_intersection(current->mm, addr, addr + 1);
+
+ if (vma == NULL)
+ pfn = KVM_PFN_ERR_FAULT;
+ else if ((vma->vm_flags & VM_PFNMAP)) {
+ pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ BUG_ON(!kvm_is_mmio_pfn(pfn));
+ } else {
+ if (async && vma_is_valid(vma, write_fault))
+ *async = true;
+ pfn = KVM_PFN_ERR_FAULT;
+ }
+exit:
+ up_read(&current->mm->mmap_sem);
return pfn;
}
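
The refactored hva_to_pfn() is the classic fast-then-slow lookup: try the lockless path, fail hard if the caller is atomic, and only then fall back to the sleeping path. A skeleton of that control flow, with stub resolvers in place of the get_user_pages calls:

#include <stdbool.h>
#include <stdio.h>

static bool lookup_fast(unsigned long addr, unsigned long *pfn)
{
	(void)addr;
	(void)pfn;
	return false;			/* pretend the fast path missed */
}

static int lookup_slow(unsigned long addr, unsigned long *pfn)
{
	*pfn = addr >> 12;		/* pretend the page was faulted in */
	return 1;
}

static long addr_to_pfn(unsigned long addr, bool atomic)
{
	unsigned long pfn;

	if (lookup_fast(addr, &pfn))
		return pfn;
	if (atomic)
		return -1;		/* stands in for KVM_PFN_ERR_FAULT */
	if (lookup_slow(addr, &pfn) == 1)
		return pfn;
	return -1;
}

int main(void)
{
	printf("%ld\n", addr_to_pfn(0x7f0000001000UL, false));
	return 0;
}
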
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+static pfn_t
+__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable)
{
- return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+ unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+
+ if (addr == KVM_HVA_ERR_RO_BAD)
+ return KVM_PFN_ERR_RO_FAULT;
+
+ if (kvm_is_error_hva(addr))
+ return KVM_PFN_ERR_BAD;
+
+	/* Do not map a writable pfn in a read-only memslot. */
+ if (writable && memslot_is_readonly(slot)) {
+ *writable = false;
+ writable = NULL;
+ }
+
+ return hva_to_pfn(addr, atomic, async, write_fault,
+ writable);
}
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
bool write_fault, bool *writable)
{
- unsigned long addr;
+ struct kvm_memory_slot *slot;
if (async)
*async = false;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr)) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
+ slot = gfn_to_memslot(kvm, gfn);
- return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+ return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+ writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1195,12 +1271,16 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+}
+
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
- unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages)
@@ -1219,30 +1299,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+static struct page *kvm_pfn_to_page(pfn_t pfn)
+{
+ if (is_error_pfn(pfn))
+ return KVM_ERR_PTR_BAD_PAGE;
+
+ if (kvm_is_mmio_pfn(pfn)) {
+ WARN_ON(1);
+ return KVM_ERR_PTR_BAD_PAGE;
+ }
+
+ return pfn_to_page(pfn);
+}
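
kvm_pfn_to_page() now returns KVM_ERR_PTR_BAD_PAGE, i.e. an ERR_PTR-encoded errno, so callers test the pointer itself instead of comparing against a global bad_page. A userspace re-implementation of that encoding trick: small negative errno values occupy the top of the address space, which no valid pointer ever uses, so the error travels inside the pointer.

#include <assert.h>
#include <errno.h>
#include <stdbool.h>

static inline void *err_ptr(long err) { return (void *)err; }
static inline bool is_err(const void *p)
{
	return (unsigned long)p >= (unsigned long)-4095;
}
static inline long ptr_err(const void *p) { return (long)p; }

int main(void)
{
	void *bad_page = err_ptr(-ENOENT);	/* mirrors KVM_ERR_PTR_BAD_PAGE */
	int ok;

	assert(is_err(bad_page) && ptr_err(bad_page) == -ENOENT);
	assert(!is_err(&ok));			/* a real pointer is never an error */
	return 0;
}
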
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn);
- if (!kvm_is_mmio_pfn(pfn))
- return pfn_to_page(pfn);
- WARN_ON(kvm_is_mmio_pfn(pfn));
-
- get_page(bad_page);
- return bad_page;
+ return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(pfn_t pfn)
{
+ WARN_ON(is_error_pfn(pfn));
+
if (!kvm_is_mmio_pfn(pfn))
put_page(pfn_to_page(pfn));
}
@@ -1250,6 +1342,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
void kvm_release_page_dirty(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1305,10 +1399,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int r;
unsigned long addr;
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
- r = __copy_from_user(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
@@ -1343,11 +1437,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
gfn_t gfn = gpa >> PAGE_SHIFT;
int offset = offset_in_page(gpa);
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
pagefault_disable();
- r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
pagefault_enable();
if (r)
return -EFAULT;
@@ -1580,6 +1674,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * The most eligible candidate to yield to is decided by the following
+ * heuristics:
+ *
+ * (a) A VCPU which has not recently done a pl-exit or had its cpu relax
+ * intercepted (a preempted lock holder), indicated by @in_spin_loop.
+ * Set at the beginning and cleared at the end of the interception/PLE
+ * handler.
+ *
+ * (b) A VCPU which has done a pl-exit or had its cpu relax intercepted
+ * but did not get a chance last time (it has most likely become eligible
+ * now, since we probably yielded to the lock holder in the last
+ * iteration). This is done by toggling @dy_eligible each time a VCPU is
+ * checked for eligibility.
+ *
+ * Yielding to a recently pl-exited/cpu-relax-intercepted VCPU before
+ * yielding to a preempted lock holder could result in wrong VCPU
+ * selection and CPU burning. Giving priority to a potential lock holder
+ * increases lock progress.
+ *
+ * Since the algorithm is based on heuristics, accessing another VCPU's
+ * data without locking does no harm. It may result in trying to yield to
+ * the same VCPU, failing, and continuing with the next VCPU, and so on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+ bool eligible;
+
+ eligible = !vcpu->spin_loop.in_spin_loop ||
+ (vcpu->spin_loop.in_spin_loop &&
+ vcpu->spin_loop.dy_eligible);
+
+ if (vcpu->spin_loop.in_spin_loop)
+ kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+ return eligible;
+}
+#endif
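
A standalone simulation of the toggle behaviour, using the logically equivalent simplification !in_spin_loop || dy_eligible of the expression above: a spinning vcpu is eligible on every other check, so it is neither always skipped nor always yielded to.

#include <stdbool.h>
#include <stdio.h>

struct vcpu { bool in_spin_loop; bool dy_eligible; };

static bool eligible_for_yield(struct vcpu *v)
{
	bool eligible = !v->in_spin_loop || v->dy_eligible;

	if (v->in_spin_loop)
		v->dy_eligible = !v->dy_eligible;	/* flip for next time */
	return eligible;
}

int main(void)
{
	struct vcpu v = { .in_spin_loop = true, .dy_eligible = false };
	int i;

	/* prints 0101: every other check the spinning vcpu qualifies */
	for (i = 0; i < 4; i++)
		printf("%d", eligible_for_yield(&v));
	printf("\n");
	return 0;
}
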
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
struct kvm *kvm = me->kvm;
@@ -1589,6 +1720,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
int pass;
int i;
+ kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something
@@ -1607,6 +1739,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (waitqueue_active(&vcpu->wq))
continue;
+ if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+ continue;
if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i;
yielded = 1;
@@ -1614,6 +1748,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
}
}
}
+ kvm_vcpu_set_in_spin_loop(me, false);
+
+ /* Ensure vcpu is not eligible during next spinloop */
+ kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
@@ -2093,6 +2231,29 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif
+#ifdef __KVM_HAVE_IRQ_LINE
+ case KVM_IRQ_LINE_STATUS:
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ goto out;
+
+ r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ if (copy_to_user(argp, &irq_event, sizeof irq_event))
+ goto out;
+ }
+
+ r = 0;
+ break;
+ }
+#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
if (r == -ENOTTY)
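
For completeness, a sketch of how userspace drives the now-common ioctl path, assuming an in-kernel irqchip has been created; KVM_IRQ_LINE_STATUS would additionally copy the delivery status back out of the same structure.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* pulse GSI 4: assert the line, then deassert it */
static int pulse_irq(int vm_fd, unsigned irq)
{
	struct kvm_irq_level event = { .irq = irq, .level = 1 };

	if (ioctl(vm_fd, KVM_IRQ_LINE, &event) < 0)
		return -1;
	event.level = 0;
	return ioctl(vm_fd, KVM_IRQ_LINE, &event);
}

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);

	if (kvm < 0 || vm < 0)
		return 1;
	if (ioctl(vm, KVM_CREATE_IRQCHIP, 0) < 0) {	/* in-kernel PIC/IOAPIC */
		perror("KVM_CREATE_IRQCHIP");
		return 1;
	}
	return pulse_irq(vm, 4) < 0 ? 1 : 0;
}
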
@@ -2697,9 +2858,6 @@ static struct syscore_ops kvm_syscore_ops = {
.resume = kvm_resume,
};
-struct page *bad_page;
-pfn_t bad_pfn;
-
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
@@ -2731,33 +2889,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
if (r)
goto out_fail;
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (bad_page == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- bad_pfn = page_to_pfn(bad_page);
-
- hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (hwpoison_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- hwpoison_pfn = page_to_pfn(hwpoison_page);
-
- fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (fault_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- fault_pfn = page_to_pfn(fault_page);
-
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
@@ -2832,12 +2963,6 @@ out_free_1:
out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
- if (fault_page)
- __free_page(fault_page);
- if (hwpoison_page)
- __free_page(hwpoison_page);
- __free_page(bad_page);
-out:
kvm_arch_exit();
out_fail:
return r;
@@ -2857,8 +2982,5 @@ void kvm_exit(void)
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);
- __free_page(fault_page);
- __free_page(hwpoison_page);
- __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);