diff options
62 files changed, 3006 insertions, 1466 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index bf33aaa4c59..f6ec3a92e62 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -857,7 +857,8 @@ struct kvm_userspace_memory_region { }; /* for kvm_memory_region::flags */ -#define KVM_MEM_LOG_DIRTY_PAGES 1UL +#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) +#define KVM_MEM_READONLY (1UL << 1) This ioctl allows the user to create or modify a guest physical memory slot. When changing an existing slot, it may be moved in the guest @@ -873,14 +874,17 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr be identical. This allows large pages in the guest to be backed by large pages in the host. -The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which -instructs kvm to keep track of writes to memory within the slot. See -the KVM_GET_DIRTY_LOG ioctl. +The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs +kvm to keep track of writes to memory within the slot. See KVM_GET_DIRTY_LOG +ioctl. The KVM_CAP_READONLY_MEM capability indicates the availability of the +KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM only +allows read accesses. Writes will be posted to userspace as KVM_EXIT_MMIO +exits. -When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory -region are automatically reflected into the guest. For example, an mmap() -that affects the region will be made visible immediately. Another example -is madvise(MADV_DROP). +When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of +the memory region are automatically reflected into the guest. For example, an +mmap() that affects the region will be made visible immediately. Another +example is madvise(MADV_DROP). It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. The KVM_SET_MEMORY_REGION does not allow fine grained control over memory @@ -1946,6 +1950,19 @@ the guest using the specified gsi pin. The irqfd is removed using the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd and kvm_irqfd.gsi. +With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify +mechanism allowing emulation of level-triggered, irqfd-based +interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an +additional eventfd in the kvm_irqfd.resamplefd field. When operating +in resample mode, posting of an interrupt through kvm_irq.fd asserts +the specified gsi in the irqchip. When the irqchip is resampled, such +as from an EOI, the gsi is de-asserted and the user is notifed via +kvm_irqfd.resamplefd. It is the user's responsibility to re-queue +the interrupt if the device making use of it still requires service. +Note that closing the resamplefd is not sufficient to disable the +irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment +and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. + 4.76 KVM_PPC_ALLOCATE_HTAB Capability: KVM_CAP_PPC_ALLOC_HTAB diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt new file mode 100644 index 00000000000..ea113b5d87a --- /dev/null +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -0,0 +1,66 @@ +Linux KVM Hypercall: +=================== +X86: + KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall + instruction. The hypervisor can replace it with instructions that are + guaranteed to be supported. + + Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. + The hypercall number should be placed in rax and the return value will be + placed in rax. No other registers will be clobbered unless explicitly stated + by the particular hypercall. + +S390: + R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall + number. The return value is written to R2. + + S390 uses diagnose instruction as hypercall (0x500) along with hypercall + number in R1. + + PowerPC: + It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. + Return value is placed in R3. + + KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' + property inside the device tree's /hypervisor node. + For more information refer to Documentation/virtual/kvm/ppc-pv.txt + +KVM Hypercalls Documentation +=========================== +The template for each hypercall is: +1. Hypercall name. +2. Architecture(s) +3. Status (deprecated, obsolete, active) +4. Purpose + +1. KVM_HC_VAPIC_POLL_IRQ +------------------------ +Architecture: x86 +Status: active +Purpose: Trigger guest exit so that the host can check for pending +interrupts on reentry. + +2. KVM_HC_MMU_OP +------------------------ +Architecture: x86 +Status: deprecated. +Purpose: Support MMU operations such as writing to PTE, +flushing TLB, release PT. + +3. KVM_HC_FEATURES +------------------------ +Architecture: PPC +Status: active +Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid +used to enumerate which hypercalls are available. On PPC, either device tree +based lookup ( which is also what EPAPR dictates) OR KVM specific enumeration +mechanism (which is this hypercall) can be used. + +4. KVM_HC_PPC_MAP_MAGIC_PAGE +------------------------ +Architecture: PPC +Status: active +Purpose: To enable communication between the hypervisor and guest there is a +shared page that contains parts of supervisor visible register state. +The guest can map this shared page to access its supervisor register through +memory using this hypercall. diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 73047104858..6d470ae7b07 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -34,9 +34,12 @@ MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 time information and check that they are both equal and even. An odd version indicates an in-progress update. - sec: number of seconds for wallclock. + sec: number of seconds for wallclock at time of boot. - nsec: number of nanoseconds for wallclock. + nsec: number of nanoseconds for wallclock at time of boot. + + In order to get the current wallclock time, the system_time from + MSR_KVM_SYSTEM_TIME_NEW needs to be added. Note that although MSRs are per-CPU entities, the effect of this particular MSR is global. @@ -82,20 +85,25 @@ MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 time at the time this structure was last updated. Unit is nanoseconds. - tsc_to_system_mul: a function of the tsc frequency. One has - to multiply any tsc-related quantity by this value to get - a value in nanoseconds, besides dividing by 2^tsc_shift + tsc_to_system_mul: multiplier to be used when converting + tsc-related quantity to nanoseconds - tsc_shift: cycle to nanosecond divider, as a power of two, to - allow for shift rights. One has to shift right any tsc-related - quantity by this value to get a value in nanoseconds, besides - multiplying by tsc_to_system_mul. + tsc_shift: shift to be used when converting tsc-related + quantity to nanoseconds. This shift will ensure that + multiplication with tsc_to_system_mul does not overflow. + A positive value denotes a left shift, a negative value + a right shift. - With this information, guests can derive per-CPU time by - doing: + The conversion from tsc to nanoseconds involves an additional + right shift by 32 bits. With this information, guests can + derive per-CPU time by doing: time = (current_tsc - tsc_timestamp) - time = (time * tsc_to_system_mul) >> tsc_shift + if (tsc_shift >= 0) + time <<= tsc_shift; + else + time >>= -tsc_shift; + time = (time * tsc_to_system_mul) >> 32 time = time + system_time flags: bits in this field indicate extended capabilities diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 4911cf95c67..4cd076febb0 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt @@ -174,3 +174,25 @@ following: That way we can inject an arbitrary amount of code as replacement for a single instruction. This allows us to check for pending interrupts when setting EE=1 for example. + +Hypercall ABIs in KVM on PowerPC +================================= +1) KVM hypercalls (ePAPR) + +These are ePAPR compliant hypercall implementation (mentioned above). Even +generic hypercalls are implemented here, like the ePAPR idle hcall. These are +available on all targets. + +2) PAPR hypercalls + +PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU). +These are the same hypercalls that pHyp, the POWER hypervisor implements. Some of +them are handled in the kernel, some are handled in user space. This is only +available on book3s_64. + +3) OSI hypercalls + +Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long +before KVM). This is supported to maintain compatibility. All these hypercalls get +forwarded to user space. This is only useful on book3s_32, but can be used with +book3s_64 as well. diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index bd77cb507c1..8b3a9c0e771 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -924,6 +924,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -963,29 +973,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; } break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip chip; @@ -1626,11 +1613,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); } +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_arch_flush_shadow_all(); +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a8bf5c673a3..28e8f5e5c63 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -53,6 +53,8 @@ struct kvm; extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end); extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); @@ -220,6 +222,7 @@ struct revmap_entry { #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch_memory_slot { + unsigned long *rmap; }; struct kvm_arch { diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 33aa715dab2..5dd3ab46997 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, if (is_error_page(new_page)) { printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", (unsigned long long)gfn); - kvm_release_page_clean(new_page); return; } hpaddr = page_to_phys(new_page); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d03eb6f7b05..d95d11322a1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -705,7 +705,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; - rmap = &memslot->rmap[gfn - memslot->base_gfn]; + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; lock_rmap(rmap); /* Check if we might have been invalidated; let the guest retry if so */ @@ -756,9 +756,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long gfn)) +static int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + unsigned long *rmapp, + unsigned long gfn)) { int ret; int retval = 0; @@ -767,15 +770,25 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { - unsigned long start = memslot->userspace_addr; - unsigned long end; + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; - end = start + (memslot->npages << PAGE_SHIFT); - if (hva >= start && hva < end) { - gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + for (; gfn < gfn_end; ++gfn) { + gfn_t gfn_offset = gfn - memslot->base_gfn; - ret = handler(kvm, &memslot->rmap[gfn_offset], - memslot->base_gfn + gfn_offset); + ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); retval |= ret; } } @@ -783,6 +796,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return retval; } +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn)) +{ + return kvm_handle_hva_range(kvm, hva, hva + 1, handler); +} + static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long gfn) { @@ -850,6 +870,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) return 0; } +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + if (kvm->arch.using_mmu_notifiers) + kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); + return 0; +} + static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long gfn) { @@ -1009,7 +1036,7 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) unsigned long *rmapp, *map; preempt_disable(); - rmapp = memslot->rmap; + rmapp = memslot->arch.rmap; map = memslot->dirty_bitmap; for (i = 0; i < memslot->npages; ++i) { if (kvm_test_clear_dirty(kvm, rmapp)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5c70d19494f..fb0e821622d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -84,7 +84,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return; - rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]); + rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); lock_rmap(rmap); head = *rmap & KVMPPC_RMAP_INDEX; @@ -180,7 +180,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, if (!slot_is_aligned(memslot, psize)) return H_PARAMETER; slot_fn = gfn - memslot->base_gfn; - rmap = &memslot->rmap[slot_fn]; + rmap = &memslot->arch.rmap[slot_fn]; if (!kvm->arch.using_mmu_notifiers) { physp = kvm->arch.slot_phys[memslot->id]; @@ -197,7 +197,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pa &= PAGE_MASK; } else { /* Translate to host virtual address */ - hva = gfn_to_hva_memslot(memslot, gfn); + hva = __gfn_to_hva_memslot(memslot, gfn); /* Look up the Linux PTE for the backing page */ pte_size = psize; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index a1baec340f7..05c28f59f77 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -242,10 +242,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) int i; hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); - if (is_error_page(hpage)) { - kvm_release_page_clean(hpage); + if (is_error_page(hpage)) return; - } hpage_offset = pte->raddr & ~PAGE_MASK; hpage_offset &= ~0xFFFULL; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index a2b66717813..ff38b664195 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -520,11 +520,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); - pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); + pfn = gfn_to_pfn_memslot(slot, gfn); if (is_error_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", (long)gfn); - kvm_release_pfn_clean(pfn); return; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 87f4dc88607..4d213b8b0fb 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -302,10 +302,18 @@ long kvm_arch_dev_ioctl(struct file *filp, void kvm_arch_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + if (!dont || free->arch.rmap != dont->arch.rmap) { + vfree(free->arch.rmap); + free->arch.rmap = NULL; + } } int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + return -ENOMEM; + return 0; } @@ -326,8 +334,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvmppc_core_commit_memory_region(kvm, mem); } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { } diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index f3e0aabfc6b..56831dfa919 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -159,6 +159,7 @@ extern unsigned long thread_saved_pc(struct task_struct *t); extern void show_code(struct pt_regs *regs); extern void print_fn_code(unsigned char *code, unsigned long len); +extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]); unsigned long get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) \ diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index cc84a24c023..f00286bd2ef 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1501,6 +1501,33 @@ static struct insn *find_insn(unsigned char *code) return NULL; } +/** + * insn_to_mnemonic - decode an s390 instruction + * @instruction: instruction to decode + * @buf: buffer to fill with mnemonic + * + * Decode the instruction at @instruction and store the corresponding + * mnemonic into @buf. + * @buf is left unchanged if the instruction could not be decoded. + * Returns: + * %0 on success, %-ENOENT if the instruction was not found. + */ +int insn_to_mnemonic(unsigned char *instruction, char buf[8]) +{ |