diff options
Diffstat (limited to 'arch/x86/kernel/kvm.c')
| -rw-r--r-- | arch/x86/kernel/kvm.c | 676 |
1 files changed, 474 insertions, 202 deletions
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8dc44662394..3dd8e2c4d74 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -20,6 +20,7 @@ * Authors: Anthony Liguori <aliguori@us.ibm.com> */ +#include <linux/context_tracking.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/kvm_para.h> @@ -33,13 +34,17 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> +#include <linux/debugfs.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/tlbflush.h> - -#define MMU_QUEUE_SIZE 1024 +#include <asm/idle.h> +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/hypervisor.h> +#include <asm/kvm_guest.h> static int kvmapf = 1; @@ -51,19 +56,28 @@ static int parse_no_kvmapf(char *arg) early_param("no-kvmapf", parse_no_kvmapf); -struct kvm_para_state { - u8 mmu_queue[MMU_QUEUE_SIZE]; - int mmu_queue_len; -}; +static int steal_acc = 1; +static int parse_no_stealacc(char *arg) +{ + steal_acc = 0; + return 0; +} -static DEFINE_PER_CPU(struct kvm_para_state, para_state); -static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +early_param("no-steal-acc", parse_no_stealacc); -static struct kvm_para_state *kvm_para_state(void) +static int kvmclock_vsyscall = 1; +static int parse_no_kvmclock_vsyscall(char *arg) { - return &per_cpu(para_state, raw_smp_processor_id()); + kvmclock_vsyscall = 0; + return 0; } +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static int has_steal_clock = 0; + /* * No need for any "IO delay" on KVM */ @@ -80,7 +94,6 @@ struct kvm_task_sleep_node { u32 token; int cpu; bool halted; - struct mm_struct *mm; }; static struct kvm_task_sleep_head { @@ -109,11 +122,8 @@ void kvm_async_pf_task_wait(u32 token) struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; DEFINE_WAIT(wait); - int cpu, idle; - cpu = get_cpu(); - idle = idle_cpu(cpu); - put_cpu(); + rcu_irq_enter(); spin_lock(&b->lock); e = _find_apf_task(b, token); @@ -122,14 +132,14 @@ void kvm_async_pf_task_wait(u32 token) hlist_del(&e->link); kfree(e); spin_unlock(&b->lock); + + rcu_irq_exit(); return; } n.token = token; n.cpu = smp_processor_id(); - n.mm = current->active_mm; - n.halted = idle || preempt_count() > 1; - atomic_inc(&n.mm->mm_count); + n.halted = is_idle_task(current) || preempt_count() > 1; init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); @@ -148,13 +158,16 @@ void kvm_async_pf_task_wait(u32 token) /* * We cannot reschedule. So halt. */ + rcu_irq_exit(); native_safe_halt(); + rcu_irq_enter(); local_irq_disable(); } } if (!n.halted) finish_wait(&n.wq, &wait); + rcu_irq_exit(); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); @@ -162,9 +175,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (!n->mm) - return; - mmdrop(n->mm); if (n->halted) smp_send_reschedule(n->cpu); else if (waitqueue_active(&n->wq)) @@ -208,7 +218,7 @@ again: * async PF was not yet handled. * Add dummy entry for the token. */ - n = kmalloc(sizeof(*n), GFP_ATOMIC); + n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { /* * Allocation failed! Busy wait while other cpu @@ -220,7 +230,6 @@ again: } n->token = token; n->cpu = smp_processor_id(); - n->mm = NULL; init_waitqueue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else @@ -242,212 +251,85 @@ u32 kvm_read_and_reset_pf_reason(void) return reason; } EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); +NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); -dotraplinkage void __kprobes +dotraplinkage void do_async_page_fault(struct pt_regs *regs, unsigned long error_code) { + enum ctx_state prev_state; + switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code); + trace_do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ + prev_state = exception_enter(); + exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); + exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: + rcu_irq_enter(); + exit_idle(); kvm_async_pf_task_wake((u32)read_cr2()); + rcu_irq_exit(); break; } } +NOKPROBE_SYMBOL(do_async_page_fault); -static void kvm_mmu_op(void *buffer, unsigned len) -{ - int r; - unsigned long a1, a2; - - do { - a1 = __pa(buffer); - a2 = 0; /* on i386 __pa() always returns <4G */ - r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); - buffer += r; - len -= r; - } while (len); -} - -static void mmu_queue_flush(struct kvm_para_state *state) -{ - if (state->mmu_queue_len) { - kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); - state->mmu_queue_len = 0; - } -} - -static void kvm_deferred_mmu_op(void *buffer, int len) -{ - struct kvm_para_state *state = kvm_para_state(); - - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { - kvm_mmu_op(buffer, len); - return; - } - if (state->mmu_queue_len + len > sizeof state->mmu_queue) - mmu_queue_flush(state); - memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); - state->mmu_queue_len += len; -} - -static void kvm_mmu_write(void *dest, u64 val) -{ - __u64 pte_phys; - struct kvm_mmu_op_write_pte wpte; - -#ifdef CONFIG_HIGHPTE - struct page *page; - unsigned long dst = (unsigned long) dest; - - page = kmap_atomic_to_page(dest); - pte_phys = page_to_pfn(page); - pte_phys <<= PAGE_SHIFT; - pte_phys += (dst & ~(PAGE_MASK)); -#else - pte_phys = (unsigned long)__pa(dest); -#endif - wpte.header.op = KVM_MMU_OP_WRITE_PTE; - wpte.pte_val = val; - wpte.pte_phys = pte_phys; - - kvm_deferred_mmu_op(&wpte, sizeof wpte); -} - -/* - * We only need to hook operations that are MMU writes. We hook these so that - * we can use lazy MMU mode to batch these operations. We could probably - * improve the performance of the host code if we used some of the information - * here to simplify processing of batched writes. - */ -static void kvm_set_pte(pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) -{ - kvm_mmu_write(pmdp, pmd_val(pmd)); -} - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE -static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) +static void __init paravirt_ops_setup(void) { - kvm_mmu_write(ptep, pte_val(pte)); -} + pv_info.name = "KVM"; + pv_info.paravirt_enabled = 1; -static void kvm_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - kvm_mmu_write(ptep, 0); -} + if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) + pv_cpu_ops.io_delay = kvm_io_delay; -static void kvm_pmd_clear(pmd_t *pmdp) -{ - kvm_mmu_write(pmdp, 0); -} +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; #endif - -static void kvm_set_pud(pud_t *pudp, pud_t pud) -{ - kvm_mmu_write(pudp, pud_val(pud)); -} - -#if PAGETABLE_LEVELS == 4 -static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) -{ - kvm_mmu_write(pgdp, pgd_val(pgd)); } -#endif -#endif /* PAGETABLE_LEVELS >= 3 */ -static void kvm_flush_tlb(void) +static void kvm_register_steal_time(void) { - struct kvm_mmu_op_flush_tlb ftlb = { - .header.op = KVM_MMU_OP_FLUSH_TLB, - }; + int cpu = smp_processor_id(); + struct kvm_steal_time *st = &per_cpu(steal_time, cpu); - kvm_deferred_mmu_op(&ftlb, sizeof ftlb); -} + if (!has_steal_clock) + return; -static void kvm_release_pt(unsigned long pfn) -{ - struct kvm_mmu_op_release_pt rpt = { - .header.op = KVM_MMU_OP_RELEASE_PT, - .pt_phys = (u64)pfn << PAGE_SHIFT, - }; + memset(st, 0, sizeof(*st)); - kvm_mmu_op(&rpt, sizeof rpt); + wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + pr_info("kvm-stealtime: cpu %d, msr %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); } -static void kvm_enter_lazy_mmu(void) -{ - paravirt_enter_lazy_mmu(); -} +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; -static void kvm_leave_lazy_mmu(void) +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) { - struct kvm_para_state *state = kvm_para_state(); - - mmu_queue_flush(state); - paravirt_leave_lazy_mmu(); -} - -static void __init paravirt_ops_setup(void) -{ - pv_info.name = "KVM"; - pv_info.paravirt_enabled = 1; - - if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) - pv_cpu_ops.io_delay = kvm_io_delay; - - if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { - pv_mmu_ops.set_pte = kvm_set_pte; - pv_mmu_ops.set_pte_at = kvm_set_pte_at; - pv_mmu_ops.set_pmd = kvm_set_pmd; -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; - pv_mmu_ops.pte_clear = kvm_pte_clear; - pv_mmu_ops.pmd_clear = kvm_pmd_clear; -#endif - pv_mmu_ops.set_pud = kvm_set_pud; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = kvm_set_pgd; -#endif -#endif - pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; - pv_mmu_ops.release_pte = kvm_release_pt; - pv_mmu_ops.release_pmd = kvm_release_pt; - pv_mmu_ops.release_pud = kvm_release_pt; - - pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; - } -#ifdef CONFIG_X86_IO_APIC - no_timer_check = 1; -#endif + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + return; + apic_write(APIC_EOI, APIC_EOI_ACK); } -void __cpuinit kvm_guest_cpu_init(void) +void kvm_guest_cpu_init(void) { if (!kvm_para_available()) return; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = __pa(&__get_cpu_var(apf_reason)); + u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; @@ -457,9 +339,22 @@ void __cpuinit kvm_guest_cpu_init(void) printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __get_cpu_var(kvm_apic_eoi) = 0; + pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) + | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + + if (has_steal_clock) + kvm_register_steal_time(); } -static void kvm_pv_disable_apf(void *unused) +static void kvm_pv_disable_apf(void) { if (!__get_cpu_var(apf_reason).enabled) return; @@ -471,11 +366,24 @@ static void kvm_pv_disable_apf(void *unused) smp_processor_id()); } +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. + */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); + kvm_disable_steal_time(); +} + static int kvm_pv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused) { if (code == SYS_RESTART) - on_each_cpu(kvm_pv_disable_apf, NULL, 1); + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); return NOTIFY_DONE; } @@ -483,14 +391,37 @@ static struct notifier_block kvm_pv_reboot_nb = { .notifier_call = kvm_pv_reboot_notify, }; +static u64 kvm_steal_clock(int cpu) +{ + u64 steal; + struct kvm_steal_time *src; + int version; + + src = &per_cpu(steal_time, cpu); + do { + version = src->version; + rmb(); + steal = src->steal; + rmb(); + } while ((version & 1) || (version != src->version)); + + return steal; +} + +void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { -#ifdef CONFIG_KVM_CLOCK - WARN_ON(kvm_register_clock("primary cpu clock")); -#endif kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); } static void kvm_guest_cpu_online(void *dummy) @@ -500,12 +431,15 @@ static void kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { - kvm_pv_disable_apf(NULL); + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); apf_task_wake_all(); } -static int __cpuinit kvm_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int kvm_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { int cpu = (unsigned long)hcpu; switch (action) { @@ -524,14 +458,14 @@ static int __cpuinit kvm_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata kvm_cpu_notifier = { +static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_notify, }; #endif static void __init kvm_apf_trap_init(void) { - set_intr_gate(14, &async_page_fault); + set_intr_gate(14, async_page_fault); } void __init kvm_guest_init(void) @@ -548,6 +482,17 @@ void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init; + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + has_steal_clock = 1; + pv_time_ops.steal_clock = kvm_steal_clock; + } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_set_eoi_write(kvm_guest_apic_eoi_write); + + if (kvmclock_vsyscall) + kvm_setup_vsyscall_timeinfo(); + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); @@ -555,3 +500,330 @@ void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif } + +static noinline uint32_t __kvm_cpuid_base(void) +{ + if (boot_cpu_data.cpuid_level < 0) + return 0; /* So we don't blow up on old processors */ + + if (cpu_has_hypervisor) + return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); + + return 0; +} + +static inline uint32_t kvm_cpuid_base(void) +{ + static int kvm_cpuid_base = -1; + + if (kvm_cpuid_base == -1) + kvm_cpuid_base = __kvm_cpuid_base(); + + return kvm_cpuid_base; +} + +bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; +} +EXPORT_SYMBOL_GPL(kvm_para_available); + +unsigned int kvm_arch_para_features(void) +{ + return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); +} + +static uint32_t __init kvm_detect(void) +{ + return kvm_cpuid_base(); +} + +const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = "KVM", + .detect = kvm_detect, + .x2apic_available = kvm_para_available, +}; +EXPORT_SYMBOL_GPL(x86_hyper_kvm); + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +enum kvm_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; + +#ifdef CONFIG_KVM_DEBUG_FS +#define HISTO_BUCKETS 30 + +static struct kvm_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + u8 ret; + u8 old; + + old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + } +} + +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} + + +static inline u64 spin_time_start(void) +{ + return sched_clock(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index; + + index = ilog2(delta); + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta; + + delta = sched_clock() - start; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} + +static struct dentry *d_spin_debug; +static struct dentry *d_kvm_debug; + +struct dentry *kvm_init_debugfs(void) +{ + d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); + if (!d_kvm_debug) + printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); + + return d_kvm_debug; +} + +static int __init kvm_spinlock_debugfs(void) +{ + struct dentry *d_kvm; + + d_kvm = kvm_init_debugfs(); + if (d_kvm == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW]); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); + + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW]); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); + + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(kvm_spinlock_debugfs); +#else /* !CONFIG_KVM_DEBUG_FS */ +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_KVM_DEBUG_FS */ + +struct kvm_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; +}; + +/* cpus 'waiting' on a spinlock to become available */ +static cpumask_t waiting_cpus; + +/* Track spinlock on which a cpu is waiting */ +static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); + +__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +{ + struct kvm_lock_waiting *w; + int cpu; + u64 start; + unsigned long flags; + + if (in_nmi()) + return; + + w = &__get_cpu_var(klock_waiting); + cpu = smp_processor_id(); + start = spin_time_start(); + + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + + /* + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; + + add_stats(TAKEN_SLOW, 1); + + /* + * This uses set_bit, which is atomic but we should not rely on its + * reordering gurantees. So barrier is needed after this call. + */ + cpumask_set_cpu(cpu, &waiting_cpus); + + barrier(); + + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); + + /* + * check again make sure it didn't become free while + * we weren't looking. + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; + local_irq_restore(flags); + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); + +/* Kick vcpu waiting on @lock->head to reach value @ticket */ +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +{ + int cpu; + + add_stats(RELEASED_SLOW, 1); + for_each_cpu(cpu, &waiting_cpus) { + const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == ticket) { + add_stats(RELEASED_SLOW_KICKED, 1); + kvm_kick_cpu(cpu); + break; + } + } +} + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + if (!kvm_para_available()) + return; + /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return; + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); + pv_lock_ops.unlock_kick = kvm_unlock_kick; +} + +static __init int kvm_spinlock_init_jump(void) +{ + if (!kvm_para_available()) + return 0; + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return 0; + + static_key_slow_inc(¶virt_ticketlocks_enabled); + printk(KERN_INFO "KVM setup paravirtual spinlock\n"); + + return 0; +} +early_initcall(kvm_spinlock_init_jump); + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ |
