Diffstat (limited to 'arch/x86/kernel/kvmclock.c')
 arch/x86/kernel/kvmclock.c | 267 +++++++++++++++++++++++++++++-----------
 1 file changed, 192 insertions(+), 75 deletions(-)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08a30986d47..d9156ceecdf 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,15 +18,19 @@
 #include <linux/clocksource.h>
 #include <linux/kvm_para.h>
-#include <asm/arch_hooks.h>
+#include <asm/pvclock.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <linux/percpu.h>
-#include <asm/reboot.h>
+#include <linux/hardirq.h>
+#include <linux/memblock.h>
 
-#define KVM_SCALE 22
+#include <asm/x86_init.h>
+#include <asm/reboot.h>
 
 static int kvmclock = 1;
+static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 
 static int parse_no_kvmclock(char *arg)
 {
@@ -36,101 +40,146 @@ static int parse_no_kvmclock(char *arg)
 early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
-#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
-
-static inline u64 kvm_get_delta(u64 last_tsc)
-{
-	int cpu = smp_processor_id();
-	u64 delta = native_read_tsc() - last_tsc;
-	return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
-}
+static struct pvclock_vsyscall_time_info *hv_clock;
+static struct pvclock_wall_clock wall_clock;
 
-static struct kvm_wall_clock wall_clock;
-static cycle_t kvm_clock_read(void);
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
  * have elapsed since the hypervisor wrote the data. So we try to account for
  * that with system time
  */
-static unsigned long kvm_get_wallclock(void)
+static void kvm_get_wallclock(struct timespec *now)
 {
-	u32 wc_sec, wc_nsec;
-	u64 delta;
-	struct timespec ts;
-	int version, nsec;
+	struct pvclock_vcpu_time_info *vcpu_time;
 	int low, high;
+	int cpu;
 
-	low = (int)__pa(&wall_clock);
-	high = ((u64)__pa(&wall_clock) >> 32);
+	low = (int)__pa_symbol(&wall_clock);
+	high = ((u64)__pa_symbol(&wall_clock) >> 32);
 
-	delta = kvm_clock_read();
+	native_write_msr(msr_kvm_wall_clock, low, high);
 
-	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
-	do {
-		version = wall_clock.wc_version;
-		rmb();
-		wc_sec = wall_clock.wc_sec;
-		wc_nsec = wall_clock.wc_nsec;
-		rmb();
-	} while ((wall_clock.wc_version != version) || (version & 1));
+	preempt_disable();
+	cpu = smp_processor_id();
 
-	delta = kvm_clock_read() - delta;
-	delta += wc_nsec;
-	nsec = do_div(delta, NSEC_PER_SEC);
-	set_normalized_timespec(&ts, wc_sec + delta, nsec);
-	/*
-	 * Of all mechanisms of time adjustment I've tested, this one
-	 * was the champion!
-	 */
-	return ts.tv_sec + 1;
+	vcpu_time = &hv_clock[cpu].pvti;
+	pvclock_read_wallclock(&wall_clock, vcpu_time, now);
+
+	preempt_enable();
 }
 
-static int kvm_set_wallclock(unsigned long now)
+static int kvm_set_wallclock(const struct timespec *now)
 {
-	return 0;
+	return -1;
+}
+
+static cycle_t kvm_clock_read(void)
+{
+	struct pvclock_vcpu_time_info *src;
+	cycle_t ret;
+	int cpu;
+
+	preempt_disable_notrace();
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+	ret = pvclock_clocksource_read(src);
+	preempt_enable_notrace();
+	return ret;
+}
+
+static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+{
+	return kvm_clock_read();
 }
 
 /*
- * This is our read_clock function. The host puts an tsc timestamp each time
- * it updates a new time. Without the tsc adjustment, we can have a situation
- * in which a vcpu starts to run earlier (smaller system_time), but probes
- * time later (compared to another vcpu), leading to backwards time
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristics is subject to fail, because ultimately, a large
+ * poll of guests can be running and trouble each other. So we preset
+ * lpj here
  */
-static cycle_t kvm_clock_read(void)
+static unsigned long kvm_get_tsc_khz(void)
 {
-	u64 last_tsc, now;
+	struct pvclock_vcpu_time_info *src;
 	int cpu;
+	unsigned long tsc_khz;
 
 	preempt_disable();
 	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+	tsc_khz = pvclock_tsc_khz(src);
+	preempt_enable();
+	return tsc_khz;
+}
 
-	last_tsc = get_clock(cpu, tsc_timestamp);
-	now = get_clock(cpu, system_time);
+static void kvm_get_preset_lpj(void)
+{
+	unsigned long khz;
+	u64 lpj;
 
-	now += kvm_get_delta(last_tsc);
-	preempt_enable();
+	khz = kvm_get_tsc_khz();
 
-	return now;
+	lpj = ((u64)khz * 1000);
+	do_div(lpj, HZ);
+	preset_lpj = lpj;
 }
+
+bool kvm_check_and_clear_guest_paused(void)
+{
+	bool ret = false;
+	struct pvclock_vcpu_time_info *src;
+	int cpu = smp_processor_id();
+
+	if (!hv_clock)
+		return ret;
+
+	src = &hv_clock[cpu].pvti;
+	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
+		src->flags &= ~PVCLOCK_GUEST_STOPPED;
+		pvclock_touch_watchdogs();
+		ret = true;
+	}
+
+	return ret;
+}
+
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
-	.read = kvm_clock_read,
+	.read = kvm_clock_get_cycles,
 	.rating = 400,
 	.mask = CLOCKSOURCE_MASK(64),
-	.mult = 1 << KVM_SCALE,
-	.shift = KVM_SCALE,
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-static int kvm_register_clock(void)
+int kvm_register_clock(char *txt)
 {
 	int cpu = smp_processor_id();
-	int low, high;
-	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
-	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+	int low, high, ret;
+	struct pvclock_vcpu_time_info *src;
+
+	if (!hv_clock)
+		return 0;
+
+	src = &hv_clock[cpu].pvti;
+	low = (int)slow_virt_to_phys(src) | 1;
+	high = ((u64)slow_virt_to_phys(src) >> 32);
+	ret = native_write_msr_safe(msr_kvm_system_time, low, high);
+	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
+	       cpu, high, low, txt);
+
+	return ret;
+}
 
-	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+	kvm_register_clock("primary cpu clock, resume");
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -140,9 +189,7 @@ static void kvm_setup_secondary_clock(void)
 	 * Now that the first cpu already had this clocksource initialized,
 	 * we shouldn't fail.
 	 */
-	WARN_ON(kvm_register_clock());
-	/* ok, done with our trickery, call native */
-	setup_secondary_APIC_clock();
+	WARN_ON(kvm_register_clock("secondary cpu clock"));
 }
 #endif
 
@@ -157,35 +204,105 @@ static void kvm_setup_secondary_clock(void)
 #ifdef CONFIG_KEXEC
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_crash_shutdown(regs);
 }
 #endif
 
 static void kvm_shutdown(void)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_shutdown();
 }
 
 void __init kvmclock_init(void)
 {
+	unsigned long mem;
+	int size;
+
+	size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+
 	if (!kvm_para_available())
 		return;
 
-	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
-		if (kvm_register_clock())
-			return;
-		pv_time_ops.get_wallclock = kvm_get_wallclock;
-		pv_time_ops.set_wallclock = kvm_set_wallclock;
-		pv_time_ops.sched_clock = kvm_clock_read;
+	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+	} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
+		return;
+
+	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+	       msr_kvm_system_time, msr_kvm_wall_clock);
+
+	mem = memblock_alloc(size, PAGE_SIZE);
+	if (!mem)
+		return;
+	hv_clock = __va(mem);
+	memset(hv_clock, 0, size);
+
+	if (kvm_register_clock("primary cpu clock")) {
+		hv_clock = NULL;
+		memblock_free(mem, size);
+		return;
+	}
+	pv_time_ops.sched_clock = kvm_clock_read;
+	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+	x86_platform.get_wallclock = kvm_get_wallclock;
+	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+	x86_cpuinit.early_percpu_clock_init =
+		kvm_setup_secondary_clock;
 #endif
-		machine_ops.shutdown = kvm_shutdown;
+	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
+	machine_ops.shutdown = kvm_shutdown;
 #ifdef CONFIG_KEXEC
-		machine_ops.crash_shutdown = kvm_crash_shutdown;
+	machine_ops.crash_shutdown = kvm_crash_shutdown;
 #endif
-		clocksource_register(&kvm_clock);
+	kvm_get_preset_lpj();
+	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
+	pv_info.paravirt_enabled = 1;
+	pv_info.name = "KVM";
+
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+}
+
+int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+	int cpu;
+	int ret;
+	u8 flags;
+	struct pvclock_vcpu_time_info *vcpu_time;
+	unsigned int size;
+
+	if (!hv_clock)
+		return 0;
+
+	size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
+	flags = pvclock_read_flags(vcpu_time);
+
+	if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
+		preempt_enable();
+		return 1;
 	}
+
+	if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
+		preempt_enable();
+		return ret;
+	}
+
+	preempt_enable();
+
+	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+	return 0;
 }
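
The core of the new read path is pvclock_clocksource_read(), which snapshots the per-cpu pvclock_vcpu_time_info that the host keeps refreshed. The protocol is a seqcount-style version field (odd while the host is mid-update) plus an extrapolation of the stored system_time by a scaled TSC delta. Below is a minimal userspace-style C sketch of that protocol, assuming the field layout documented for the KVM pvclock ABI; rdtsc(), the plain compiler barriers, and the __uint128_t multiply are stand-ins for the kernel's primitives, not the kernel implementation itself.

	#include <stdint.h>

	/* Field layout per the KVM pvclock ABI. */
	struct pvclock_vcpu_time_info {
		uint32_t version;		/* odd while the host is updating */
		uint32_t pad0;
		uint64_t tsc_timestamp;		/* host TSC at the last update */
		uint64_t system_time;		/* guest time (ns) at the last update */
		uint32_t tsc_to_system_mul;	/* 32.32 fixed-point TSC-to-ns factor */
		int8_t   tsc_shift;		/* binary pre-scale for the TSC delta */
		uint8_t  flags;			/* e.g. PVCLOCK_TSC_STABLE_BIT */
		uint8_t  pad[2];
	};

	static inline uint64_t rdtsc(void)
	{
		uint32_t lo, hi;
		__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
		return ((uint64_t)hi << 32) | lo;
	}

	/* Retry until a consistent snapshot is seen: version even and unchanged. */
	static uint64_t pvclock_read_sketch(volatile struct pvclock_vcpu_time_info *p)
	{
		uint32_t version;
		uint64_t delta, ns;

		do {
			version = p->version;
			__asm__ __volatile__("" ::: "memory");	/* barrier stand-in */
			delta = rdtsc() - p->tsc_timestamp;
			if (p->tsc_shift >= 0)
				delta <<= p->tsc_shift;
			else
				delta >>= -p->tsc_shift;
			/* the kernel does this 64x32->96 bit multiply without
			 * __uint128_t; this is the lazy portable spelling */
			ns = p->system_time +
			     (uint64_t)(((__uint128_t)delta * p->tsc_to_system_mul) >> 32);
			__asm__ __volatile__("" ::: "memory");
		} while ((p->version & 1) || p->version != version);

		return ns;	/* ns of guest system time since the clock was armed */
	}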
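
Registration (kvm_register_clock() above) is a single MSR write: the guest-physical address of this cpu's time structure, with bit 0 doubling as the enable bit, split into the two 32-bit halves that wrmsr takes. A hedged sketch of just that encoding, where pvti_gpa and wrmsr() are illustrative stand-ins for slow_virt_to_phys() and native_write_msr_safe():

	uint64_t val = pvti_gpa | 1;	/* bit 0 = enable */
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW,
	      (uint32_t)val,		/* low half, as in kvm_register_clock() */
	      (uint32_t)(val >> 32));	/* high half */

Writing 0 back, as kvm_shutdown() and kvm_crash_shutdown() do, disarms the updates before handing the machine over.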
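
For a concrete feel of kvm_get_preset_lpj(): if the host reports a 2.5 GHz TSC, kvm_get_tsc_khz() returns 2500000, and with HZ = 1000 the computation is lpj = (2500000 * 1000) / 1000 = 2,500,000 loops per jiffy. Presetting loops_per_jiffy this way lets the guest skip delay-loop calibration, which the comment above explains could otherwise run under host load and come out too low.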
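
kvm_check_and_clear_guest_paused() exists for lockup detectors: when the host pauses and later resumes a VM, guest clocks jump forward and a watchdog would otherwise report a phantom stall. A hedged sketch of a consumer, roughly in the spirit of the in-tree soft-lockup watchdog:

	/* inside a periodic lockup check */
	if (kvm_check_and_clear_guest_paused()) {
		/* PVCLOCK_GUEST_STOPPED was set: the stall was host-induced,
		 * so reset the watchdog timestamps instead of reporting a
		 * soft lockup.
		 */
		touch_softlockup_watchdog();
		return;
	}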
