Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--   arch/x86/kvm/x86.c   527
1 file changed, 341 insertions, 186 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5ca72a5cdb..ef432f891d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -94,6 +94,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);  static bool ignore_msrs = 0;  module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); +  bool kvm_has_tsc_control;  EXPORT_SYMBOL_GPL(kvm_has_tsc_control);  u32  kvm_max_guest_tsc_khz; @@ -103,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);  static u32 tsc_tolerance_ppm = 250;  module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +static bool backwards_tsc_observed = false; +  #define KVM_NR_SHARED_MSRS 16  struct kvm_shared_msrs_global { @@ -254,14 +259,30 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)  }  EXPORT_SYMBOL_GPL(kvm_get_apic_base); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) -{ -	/* TODO: reserve bits check */ -	kvm_lapic_set_base(vcpu, data); +int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ +	u64 old_state = vcpu->arch.apic_base & +		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); +	u64 new_state = msr_info->data & +		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); +	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | +		0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE); + +	if (!msr_info->host_initiated && +	    ((msr_info->data & reserved_bits) != 0 || +	     new_state == X2APIC_ENABLE || +	     (new_state == MSR_IA32_APICBASE_ENABLE && +	      old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) || +	     (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) && +	      old_state == 0))) +		return 1; + +	kvm_lapic_set_base(vcpu, msr_info->data); +	return 0;  }  EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage void kvm_spurious_fault(void) +asmlinkage __visible void kvm_spurious_fault(void)  {  	/* Fault while not rebooting.  We want the trace. */  	BUG(); @@ -576,20 +597,35 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)  int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)  { -	u64 xcr0; +	u64 xcr0 = xcr; +	u64 old_xcr0 = vcpu->arch.xcr0; +	u64 valid_bits;  	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */  	if (index != XCR_XFEATURE_ENABLED_MASK)  		return 1; -	xcr0 = xcr;  	if (!(xcr0 & XSTATE_FP))  		return 1;  	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))  		return 1; -	if (xcr0 & ~host_xcr0) + +	/* +	 * Do not allow the guest to set bits that we do not support +	 * saving.  However, xcr0 bit 0 is always set, even if the +	 * emulated CPU does not support XSAVE (see fx_init). 
+	 */ +	valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; +	if (xcr0 & ~valid_bits)  		return 1; + +	if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) +		return 1; +  	kvm_put_guest_xcr0(vcpu);  	vcpu->arch.xcr0 = xcr0; + +	if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) +		kvm_update_cpuid(vcpu);  	return 0;  } @@ -618,6 +654,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)  	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))  		return 1; +	if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) +		return 1; +  	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))  		return 1; @@ -646,6 +685,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)  	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))  		kvm_mmu_reset_context(vcpu); +	if ((cr4 ^ old_cr4) & X86_CR4_SMAP) +		update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); +  	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)  		kvm_update_cpuid(vcpu); @@ -662,29 +704,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)  	}  	if (is_long_mode(vcpu)) { -		if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { -			if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) -				return 1; -		} else -			if (cr3 & CR3_L_MODE_RESERVED_BITS) -				return 1; -	} else { -		if (is_pae(vcpu)) { -			if (cr3 & CR3_PAE_RESERVED_BITS) -				return 1; -			if (is_paging(vcpu) && -			    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) -				return 1; -		} -		/* -		 * We don't check reserved bits in nonpae mode, because -		 * this isn't enforced, and VMware depends on this. -		 */ -	} +		if (cr3 & CR3_L_MODE_RESERVED_BITS) +			return 1; +	} else if (is_pae(vcpu) && is_paging(vcpu) && +		   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) +		return 1;  	vcpu->arch.cr3 = cr3;  	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -	vcpu->arch.mmu.new_cr3(vcpu); +	kvm_mmu_new_cr3(vcpu);  	return 0;  }  EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -710,6 +738,12 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)  }  EXPORT_SYMBOL_GPL(kvm_get_cr8); +static void kvm_update_dr6(struct kvm_vcpu *vcpu) +{ +	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) +		kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); +} +  static void kvm_update_dr7(struct kvm_vcpu *vcpu)  {  	unsigned long dr7; @@ -719,7 +753,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)  	else  		dr7 = vcpu->arch.dr7;  	kvm_x86_ops->set_dr7(vcpu, dr7); -	vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); +	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; +	if (dr7 & DR7_BP_EN_MASK) +		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;  }  static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) @@ -738,6 +774,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)  		if (val & 0xffffffff00000000ULL)  			return -1; /* #GP */  		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; +		kvm_update_dr6(vcpu);  		break;  	case 5:  		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) @@ -779,7 +816,10 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)  			return 1;  		/* fall through */  	case 6: -		*val = vcpu->arch.dr6; +		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) +			*val = vcpu->arch.dr6; +		else +			*val = kvm_x86_ops->get_dr6(vcpu);  		break;  	case 5:  		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) @@ -827,11 +867,12 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);   * kvm-specific. Those are put in the beginning of the list.   
*/ -#define KVM_SAVE_MSRS_BEGIN	10 +#define KVM_SAVE_MSRS_BEGIN	12  static u32 msrs_to_save[] = {  	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,  	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, +	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,  	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,  	MSR_KVM_PV_EOI_EN,  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, @@ -840,7 +881,7 @@ static u32 msrs_to_save[] = {  	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,  #endif  	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, -	MSR_IA32_FEATURE_CONTROL +	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS  };  static unsigned num_msrs_to_save; @@ -1070,7 +1111,6 @@ static inline u64 get_kernel_ns(void)  {  	struct timespec ts; -	WARN_ON(preemptible());  	ktime_get_ts(&ts);  	monotonic_to_bootbased(&ts);  	return timespec_to_ns(&ts); @@ -1266,8 +1306,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)  	kvm->arch.last_tsc_write = data;  	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; -	/* Reset of TSC must disable overshoot protection below */ -	vcpu->arch.hv_clock.tsc_timestamp = 0;  	vcpu->arch.last_guest_tsc = data;  	/* Keep track of which generation this VCPU has synchronized to */ @@ -1436,7 +1474,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)  					&ka->master_kernel_ns,  					&ka->master_cycle_now); -	ka->use_master_clock = host_tsc_clocksource & vcpus_matched; +	ka->use_master_clock = host_tsc_clocksource && vcpus_matched +				&& !backwards_tsc_observed;  	if (ka->use_master_clock)  		atomic_set(&kvm_guest_has_master_clock, 1); @@ -1475,7 +1514,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)  	unsigned long flags, this_tsc_khz;  	struct kvm_vcpu_arch *vcpu = &v->arch;  	struct kvm_arch *ka = &v->kvm->arch; -	s64 kernel_ns, max_kernel_ns; +	s64 kernel_ns;  	u64 tsc_timestamp, host_tsc;  	struct pvclock_vcpu_time_info guest_hv_clock;  	u8 pvclock_flags; @@ -1534,37 +1573,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)  	if (!vcpu->pv_time_enabled)  		return 0; -	/* -	 * Time as measured by the TSC may go backwards when resetting the base -	 * tsc_timestamp.  The reason for this is that the TSC resolution is -	 * higher than the resolution of the other clock scales.  Thus, many -	 * possible measurments of the TSC correspond to one measurement of any -	 * other clock, and so a spread of values is possible.  This is not a -	 * problem for the computation of the nanosecond clock; with TSC rates -	 * around 1GHZ, there can only be a few cycles which correspond to one -	 * nanosecond value, and any path through this code will inevitably -	 * take longer than that.  However, with the kernel_ns value itself, -	 * the precision may be much lower, down to HZ granularity.  If the -	 * first sampling of TSC against kernel_ns ends in the low part of the -	 * range, and the second in the high end of the range, we can get: -	 * -	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new -	 * -	 * As the sampling errors potentially range in the thousands of cycles, -	 * it is possible such a time value has already been observed by the -	 * guest.  To protect against this, we must compute the system time as -	 * observed by the guest and ensure the new system time is greater. 
-	 */ -	max_kernel_ns = 0; -	if (vcpu->hv_clock.tsc_timestamp) { -		max_kernel_ns = vcpu->last_guest_tsc - -				vcpu->hv_clock.tsc_timestamp; -		max_kernel_ns = pvclock_scale_delta(max_kernel_ns, -				    vcpu->hv_clock.tsc_to_system_mul, -				    vcpu->hv_clock.tsc_shift); -		max_kernel_ns += vcpu->last_kernel_ns; -	} -  	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {  		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,  				   &vcpu->hv_clock.tsc_shift, @@ -1572,18 +1580,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)  		vcpu->hw_tsc_khz = this_tsc_khz;  	} -	/* with a master <monotonic time, tsc value> tuple, -	 * pvclock clock reads always increase at the (scaled) rate -	 * of guest TSC - no need to deal with sampling errors. -	 */ -	if (!use_master_clock) { -		if (max_kernel_ns > kernel_ns) -			kernel_ns = max_kernel_ns; -	}  	/* With all the info we got, fill in the values */  	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;  	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; -	vcpu->last_kernel_ns = kernel_ns;  	vcpu->last_guest_tsc = tsc_timestamp;  	/* @@ -1625,14 +1624,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)   * the others.   *   * So in those cases, request a kvmclock update for all vcpus. - * The worst case for a remote vcpu to update its kvmclock - * is then bounded by maximum nohz sleep latency. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates.   */ -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) + +static void kvmclock_update_fn(struct work_struct *work)  {  	int i; -	struct kvm *kvm = v->kvm; +	struct delayed_work *dwork = to_delayed_work(work); +	struct kvm_arch *ka = container_of(dwork, struct kvm_arch, +					   kvmclock_update_work); +	struct kvm *kvm = container_of(ka, struct kvm, arch);  	struct kvm_vcpu *vcpu;  	kvm_for_each_vcpu(i, vcpu, kvm) { @@ -1641,6 +1647,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)  	}  } +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ +	struct kvm *kvm = v->kvm; + +	set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); +	schedule_delayed_work(&kvm->arch.kvmclock_update_work, +					KVMCLOCK_UPDATE_DELAY); +} + +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ +	struct delayed_work *dwork = to_delayed_work(work); +	struct kvm_arch *ka = container_of(dwork, struct kvm_arch, +					   kvmclock_sync_work); +	struct kvm *kvm = container_of(ka, struct kvm, arch); + +	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); +	schedule_delayed_work(&kvm->arch.kvmclock_sync_work, +					KVMCLOCK_SYNC_PERIOD); +} +  static bool msr_mtrr_valid(unsigned msr)  {  	switch (msr) { @@ -1817,6 +1846,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr)  	switch (msr) {  	case HV_X64_MSR_GUEST_OS_ID:  	case HV_X64_MSR_HYPERCALL: +	case HV_X64_MSR_REFERENCE_TSC: +	case HV_X64_MSR_TIME_REF_COUNT:  		r = true;  		break;  	} @@ -1856,6 +1887,21 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)  		if (__copy_to_user((void __user *)addr, instructions, 4))  			return 1;  		kvm->arch.hv_hypercall = data; +		mark_page_dirty(kvm, gfn); +		break; +	} +	case HV_X64_MSR_REFERENCE_TSC: { +		u64 gfn; +		HV_REFERENCE_TSC_PAGE tsc_ref; +		memset(&tsc_ref, 0, sizeof(tsc_ref)); +		
kvm->arch.hv_tsc_page = data; +		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) +			break; +		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; +		if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, +			&tsc_ref, sizeof(tsc_ref))) +			return 1; +		mark_page_dirty(kvm, gfn);  		break;  	}  	default: @@ -1870,19 +1916,25 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)  {  	switch (msr) {  	case HV_X64_MSR_APIC_ASSIST_PAGE: { +		u64 gfn;  		unsigned long addr;  		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {  			vcpu->arch.hv_vapic = data; +			if (kvm_lapic_enable_pv_eoi(vcpu, 0)) +				return 1;  			break;  		} -		addr = gfn_to_hva(vcpu->kvm, data >> -				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); +		gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; +		addr = gfn_to_hva(vcpu->kvm, gfn);  		if (kvm_is_error_hva(addr))  			return 1;  		if (__clear_user((void __user *)addr, PAGE_SIZE))  			return 1;  		vcpu->arch.hv_vapic = data; +		mark_page_dirty(vcpu->kvm, gfn); +		if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) +			return 1;  		break;  	}  	case HV_X64_MSR_EOI: @@ -2008,8 +2060,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  	case 0x200 ... 0x2ff:  		return set_msr_mtrr(vcpu, msr, data);  	case MSR_IA32_APICBASE: -		kvm_set_apic_base(vcpu, data); -		break; +		return kvm_set_apic_base(vcpu, msr_info);  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:  		return kvm_x2apic_msr_write(vcpu, msr, data);  	case MSR_IA32_TSCDEADLINE: @@ -2282,6 +2333,14 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  	case HV_X64_MSR_HYPERCALL:  		data = kvm->arch.hv_hypercall;  		break; +	case HV_X64_MSR_TIME_REF_COUNT: { +		data = +		     div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); +		break; +	} +	case HV_X64_MSR_REFERENCE_TSC: +		data = kvm->arch.hv_tsc_page; +		break;  	default:  		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);  		return 1; @@ -2299,9 +2358,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  	case HV_X64_MSR_VP_INDEX: {  		int r;  		struct kvm_vcpu *v; -		kvm_for_each_vcpu(r, v, vcpu->kvm) -			if (v == vcpu) +		kvm_for_each_vcpu(r, v, vcpu->kvm) { +			if (v == vcpu) {  				data = r; +				break; +			} +		}  		break;  	}  	case HV_X64_MSR_EOI: @@ -2564,6 +2626,7 @@ int kvm_dev_ioctl_check_extension(long ext)  	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:  	case KVM_CAP_SET_TSS_ADDR:  	case KVM_CAP_EXT_CPUID: +	case KVM_CAP_EXT_EMUL_CPUID:  	case KVM_CAP_CLOCKSOURCE:  	case KVM_CAP_PIT:  	case KVM_CAP_NOP_IO_DELAY: @@ -2574,6 +2637,7 @@ int kvm_dev_ioctl_check_extension(long ext)  	case KVM_CAP_IRQ_INJECT_STATUS:  	case KVM_CAP_IRQFD:  	case KVM_CAP_IOEVENTFD: +	case KVM_CAP_IOEVENTFD_NO_LENGTH:  	case KVM_CAP_PIT2:  	case KVM_CAP_PIT_STATE2:  	case KVM_CAP_SET_IDENTITY_MAP_ADDR: @@ -2591,6 +2655,8 @@ int kvm_dev_ioctl_check_extension(long ext)  	case KVM_CAP_GET_TSC_KHZ:  	case KVM_CAP_KVMCLOCK_CTRL:  	case KVM_CAP_READONLY_MEM: +	case KVM_CAP_HYPERV_TIME: +	case KVM_CAP_IOAPIC_POLARITY_IGNORED:  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT  	case KVM_CAP_ASSIGN_DEV_IRQ:  	case KVM_CAP_PCI_2_3: @@ -2673,15 +2739,17 @@ long kvm_arch_dev_ioctl(struct file *filp,  		r = 0;  		break;  	} -	case KVM_GET_SUPPORTED_CPUID: { +	case KVM_GET_SUPPORTED_CPUID: +	case KVM_GET_EMULATED_CPUID: {  		struct kvm_cpuid2 __user *cpuid_arg = argp;  		struct kvm_cpuid2 cpuid;  		r = -EFAULT;  		if (copy_from_user(&cpuid, cpuid_arg, sizeof 
cpuid))  			goto out; -		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, -						      cpuid_arg->entries); + +		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, +					    ioctl);  		if (r)  			goto out; @@ -2715,8 +2783,7 @@ static void wbinvd_ipi(void *garbage)  static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)  { -	return vcpu->kvm->arch.iommu_domain && -		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); +	return kvm_arch_has_noncoherent_dma(vcpu->kvm);  }  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -2961,8 +3028,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,  static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,  					     struct kvm_debugregs *dbgregs)  { +	unsigned long val; +  	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); -	dbgregs->dr6 = vcpu->arch.dr6; +	_kvm_get_dr(vcpu, 6, &val); +	dbgregs->dr6 = val;  	dbgregs->dr7 = vcpu->arch.dr7;  	dbgregs->flags = 0;  	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); @@ -2976,7 +3046,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,  	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));  	vcpu->arch.dr6 = dbgregs->dr6; +	kvm_update_dr6(vcpu);  	vcpu->arch.dr7 = dbgregs->dr7; +	kvm_update_dr7(vcpu);  	return 0;  } @@ -2984,11 +3056,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,  static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,  					 struct kvm_xsave *guest_xsave)  { -	if (cpu_has_xsave) +	if (cpu_has_xsave) {  		memcpy(guest_xsave->region,  			&vcpu->arch.guest_fpu.state->xsave, -			xstate_size); -	else { +			vcpu->arch.guest_xstate_size); +		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &= +			vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE; +	} else {  		memcpy(guest_xsave->region,  			&vcpu->arch.guest_fpu.state->fxsave,  			sizeof(struct i387_fxsave_struct)); @@ -3003,10 +3077,17 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,  	u64 xstate_bv =  		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; -	if (cpu_has_xsave) +	if (cpu_has_xsave) { +		/* +		 * Here we allow setting states that are not present in +		 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility +		 * with old userspace. 
+		 */ +		if (xstate_bv & ~kvm_supported_xcr0()) +			return -EINVAL;  		memcpy(&vcpu->arch.guest_fpu.state->xsave, -			guest_xsave->region, xstate_size); -	else { +			guest_xsave->region, vcpu->arch.guest_xstate_size); +	} else {  		if (xstate_bv & ~XSTATE_FPSSE)  			return -EINVAL;  		memcpy(&vcpu->arch.guest_fpu.state->fxsave, @@ -3042,9 +3123,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,  	for (i = 0; i < guest_xcrs->nr_xcrs; i++)  		/* Only support XCR0 currently */ -		if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { +		if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {  			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, -				guest_xcrs->xcrs[0].value); +				guest_xcrs->xcrs[i].value);  			break;  		}  	if (r) @@ -3192,8 +3273,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,  		r = -EFAULT;  		if (copy_from_user(&va, argp, sizeof va))  			goto out; -		r = 0; -		kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); +		r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);  		break;  	}  	case KVM_X86_SETUP_MCE: { @@ -3560,11 +3640,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)  		offset = i * BITS_PER_LONG;  		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);  	} -	if (is_dirty) -		kvm_flush_remote_tlbs(kvm);  	spin_unlock(&kvm->mmu_lock); +	/* See the comments in kvm_mmu_slot_remove_write_access(). */ +	lockdep_assert_held(&kvm->slots_lock); + +	/* +	 * All the TLBs can be flushed out of mmu lock, see the comments in +	 * kvm_mmu_slot_remove_write_access(). +	 */ +	if (is_dirty) +		kvm_flush_remote_tlbs(kvm); +  	r = -EFAULT;  	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))  		goto out; @@ -3856,6 +3944,23 @@ static void kvm_init_msr_list(void)  	for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {  		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)  			continue; + +		/* +		 * Even MSRs that are valid in the host may not be exposed +		 * to the guests in some cases.  We could work around this +		 * in VMX with the generic MSR save/load machinery, but it +		 * is not really worthwhile since it will really only +		 * happen with nested virtualization. +		 */ +		switch (msrs_to_save[i]) { +		case MSR_IA32_BNDCFGS: +			if (!kvm_x86_ops->mpx_supported()) +				continue; +			break; +		default: +			break; +		} +  		if (j < i)  			msrs_to_save[j] = msrs_to_save[i];  		j++; @@ -4066,7 +4171,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,  		| (write ? 
PFERR_WRITE_MASK : 0);  	if (vcpu_match_mmio_gva(vcpu, gva) -	    && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { +	    && !permission_fault(vcpu, vcpu->arch.walk_mmu, +				 vcpu->arch.access, access)) {  		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |  					(gva & (PAGE_SIZE - 1));  		trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -4352,6 +4458,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,  	if (!exchanged)  		return X86EMUL_CMPXCHG_FAILED; +	mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);  	kvm_mmu_pte_write(vcpu, gpa, new, bytes);  	return X86EMUL_CONTINUE; @@ -4381,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,  			       unsigned short port, void *val,  			       unsigned int count, bool in)  { -	trace_kvm_pio(!in, port, size, count); -  	vcpu->arch.pio.port = port;  	vcpu->arch.pio.in = in;  	vcpu->arch.pio.count  = count; @@ -4417,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,  	if (ret) {  data_avail:  		memcpy(val, vcpu->arch.pio_data, size * count); +		trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);  		vcpu->arch.pio.count = 0;  		return 1;  	} @@ -4431,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);  	memcpy(vcpu->arch.pio_data, val, size * count); +	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);  	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);  } @@ -4542,11 +4649,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)  	return res;  } -static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) -{ -	kvm_set_rflags(emul_to_vcpu(ctxt), val); -} -  static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)  {  	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4731,7 +4833,6 @@ static const struct x86_emulate_ops emulate_ops = {  	.set_idt	     = emulator_set_idt,  	.get_cr              = emulator_get_cr,  	.set_cr              = emulator_set_cr, -	.set_rflags          = emulator_set_rflags,  	.cpl                 = emulator_get_cpl,  	.get_dr              = emulator_get_dr,  	.set_dr              = emulator_set_dr, @@ -4775,8 +4876,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)  static void init_decode_cache(struct x86_emulate_ctxt *ctxt)  { -	memset(&ctxt->twobyte, 0, -	       (void *)&ctxt->_regs - (void *)&ctxt->twobyte); +	memset(&ctxt->opcode_len, 0, +	       (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);  	ctxt->fetch.start = 0;  	ctxt->fetch.end = 0; @@ -4797,7 +4898,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)  	ctxt->eip = kvm_rip_read(vcpu);  	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :  		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 : -		     cs_l				? X86EMUL_MODE_PROT64 : +		     (cs_l && is_long_mode(vcpu))	? X86EMUL_MODE_PROT64 :  		     cs_db				? 
X86EMUL_MODE_PROT32 :  							  X86EMUL_MODE_PROT16;  	ctxt->guest_mode = is_guest_mode(vcpu); @@ -5094,8 +5195,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,  		ctxt->have_exception = false;  		ctxt->perm_ok = false; -		ctxt->only_vendor_specific_insn -			= emulation_type & EMULTYPE_TRAP_UD; +		ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;  		r = x86_decode_insn(ctxt, insn, insn_len); @@ -5263,7 +5363,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va  	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); -	raw_spin_lock(&kvm_lock); +	spin_lock(&kvm_lock);  	list_for_each_entry(kvm, &vm_list, vm_list) {  		kvm_for_each_vcpu(i, vcpu, kvm) {  			if (vcpu->cpu != freq->cpu) @@ -5273,7 +5373,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va  				send_ipi = 1;  		}  	} -	raw_spin_unlock(&kvm_lock); +	spin_unlock(&kvm_lock);  	if (freq->old < freq->new && send_ipi) {  		/* @@ -5324,7 +5424,8 @@ static void kvm_timer_init(void)  	int cpu;  	max_tsc_khz = tsc_khz; -	register_hotcpu_notifier(&kvmclock_cpu_notifier_block); + +	cpu_notifier_register_begin();  	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {  #ifdef CONFIG_CPU_FREQ  		struct cpufreq_policy policy; @@ -5341,6 +5442,10 @@ static void kvm_timer_init(void)  	pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);  	for_each_online_cpu(cpu)  		smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + +	__register_hotcpu_notifier(&kvmclock_cpu_notifier_block); +	cpu_notifier_register_done(); +  }  static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -5426,12 +5531,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)  	struct kvm_vcpu *vcpu;  	int i; -	raw_spin_lock(&kvm_lock); +	spin_lock(&kvm_lock);  	list_for_each_entry(kvm, &vm_list, vm_list)  		kvm_for_each_vcpu(i, vcpu, kvm)  			set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);  	atomic_set(&kvm_guest_has_master_clock, 0); -	raw_spin_unlock(&kvm_lock); +	spin_unlock(&kvm_lock);  }  static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); @@ -5496,9 +5601,10 @@ int kvm_arch_init(void *opaque)  		goto out_free_percpu;  	kvm_set_mmio_spte_mask(); -	kvm_init_msr_list();  	kvm_x86_ops = ops; +	kvm_init_msr_list(); +  	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,  			PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -5718,36 +5824,6 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)  			!kvm_event_needs_reinjection(vcpu);  } -static int vapic_enter(struct kvm_vcpu *vcpu) -{ -	struct kvm_lapic *apic = vcpu->arch.apic; -	struct page *page; - -	if (!apic || !apic->vapic_addr) -		return 0; - -	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); -	if (is_error_page(page)) -		return -EFAULT; - -	vcpu->arch.apic->vapic_page = page; -	return 0; -} - -static void vapic_exit(struct kvm_vcpu *vcpu) -{ -	struct kvm_lapic *apic = vcpu->arch.apic; -	int idx; - -	if (!apic || !apic->vapic_addr) -		return; - -	idx = srcu_read_lock(&vcpu->kvm->srcu); -	kvm_release_page_dirty(apic->vapic_page); -	mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); -	srcu_read_unlock(&vcpu->kvm->srcu, idx); -} -  static void update_cr8_intercept(struct kvm_vcpu *vcpu)  {  	int max_irr, tpr; @@ -5771,8 +5847,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)  	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);  } -static void inject_pending_event(struct kvm_vcpu *vcpu) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)  { +	int r; +  	/* try to reinject 
previous events if any */  	if (vcpu->arch.exception.pending) {  		trace_kvm_inj_exception(vcpu->arch.exception.nr, @@ -5782,17 +5860,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)  					  vcpu->arch.exception.has_error_code,  					  vcpu->arch.exception.error_code,  					  vcpu->arch.exception.reinject); -		return; +		return 0;  	}  	if (vcpu->arch.nmi_injected) {  		kvm_x86_ops->set_nmi(vcpu); -		return; +		return 0;  	}  	if (vcpu->arch.interrupt.pending) {  		kvm_x86_ops->set_irq(vcpu); -		return; +		return 0; +	} + +	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { +		r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); +		if (r != 0) +			return r;  	}  	/* try to inject new event if pending */ @@ -5803,12 +5887,25 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)  			kvm_x86_ops->set_nmi(vcpu);  		}  	} else if (kvm_cpu_has_injectable_intr(vcpu)) { +		/* +		 * Because interrupts can be injected asynchronously, we are +		 * calling check_nested_events again here to avoid a race condition. +		 * See https://lkml.org/lkml/2014/7/2/60 for discussion about this +		 * proposal and current concerns.  Perhaps we should be setting +		 * KVM_REQ_EVENT only on certain events and not unconditionally? +		 */ +		if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { +			r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); +			if (r != 0) +				return r; +		}  		if (kvm_x86_ops->interrupt_allowed(vcpu)) {  			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),  					    false);  			kvm_x86_ops->set_irq(vcpu);  		}  	} +	return 0;  }  static void process_nmi(struct kvm_vcpu *vcpu) @@ -5844,6 +5941,11 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)  	kvm_apic_update_tmr(vcpu, tmr);  } +/* + * Returns 1 to let __vcpu_run() continue the guest execution loop without + * exiting to the userspace.  Otherwise, the value will be returned to the + * userspace. + */  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  {  	int r; @@ -5908,15 +6010,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  			goto out;  		} -		inject_pending_event(vcpu); - +		if (inject_pending_event(vcpu, req_int_win) != 0) +			req_immediate_exit = true;  		/* enable NMI/IRQ window open exits if needed */ -		if (vcpu->arch.nmi_pending) -			req_immediate_exit = -				kvm_x86_ops->enable_nmi_window(vcpu) != 0; +		else if (vcpu->arch.nmi_pending) +			kvm_x86_ops->enable_nmi_window(vcpu);  		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) -			req_immediate_exit = -				kvm_x86_ops->enable_irq_window(vcpu) != 0; +			kvm_x86_ops->enable_irq_window(vcpu);  		if (kvm_lapic_enabled(vcpu)) {  			/* @@ -5945,10 +6045,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  	vcpu->mode = IN_GUEST_MODE; +	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); +  	/* We should set ->mode before check ->requests,  	 * see the comment in make_all_cpus_request.  	 
*/ -	smp_mb(); +	smp_mb__after_srcu_read_unlock();  	local_irq_disable(); @@ -5958,12 +6060,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  		smp_wmb();  		local_irq_enable();  		preempt_enable(); +		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);  		r = 1;  		goto cancel_injection;  	} -	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); -  	if (req_immediate_exit)  		smp_send_reschedule(vcpu->cpu); @@ -5975,12 +6076,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  		set_debugreg(vcpu->arch.eff_db[1], 1);  		set_debugreg(vcpu->arch.eff_db[2], 2);  		set_debugreg(vcpu->arch.eff_db[3], 3); +		set_debugreg(vcpu->arch.dr6, 6);  	}  	trace_kvm_entry(vcpu->vcpu_id);  	kvm_x86_ops->run(vcpu);  	/* +	 * Do this here before restoring debug registers on the host.  And +	 * since we do this before handling the vmexit, a DR access vmexit +	 * can (a) read the correct value of the debug registers, (b) set +	 * KVM_DEBUGREG_WONT_EXIT again. +	 */ +	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { +		int i; + +		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); +		kvm_x86_ops->sync_dirty_debug_regs(vcpu); +		for (i = 0; i < KVM_NR_DB_REGS; i++) +			vcpu->arch.eff_db[i] = vcpu->arch.db[i]; +	} + +	/*  	 * If the guest has used debug registers, at least dr7  	 * will be disabled while returning to the host.  	 * If we don't have active breakpoints in the host, we don't @@ -6047,11 +6164,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)  	struct kvm *kvm = vcpu->kvm;  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); -	r = vapic_enter(vcpu); -	if (r) { -		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); -		return r; -	}  	r = 1;  	while (r > 0) { @@ -6103,15 +6215,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)  		}  		if (need_resched()) {  			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); -			kvm_resched(vcpu); +			cond_resched();  			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);  		}  	}  	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); -	vapic_exit(vcpu); -  	return r;  } @@ -6176,7 +6286,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)  		frag->len -= len;  	} -	if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { +	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {  		vcpu->mmio_needed = 0;  		/* FIXME: return into emulator if single-stepping.  
*/ @@ -6417,6 +6527,7 @@ EXPORT_SYMBOL_GPL(kvm_task_switch);  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  				  struct kvm_sregs *sregs)  { +	struct msr_data apic_base_msr;  	int mmu_reset_needed = 0;  	int pending_vec, max_bits, idx;  	struct desc_ptr dt; @@ -6440,7 +6551,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;  	kvm_x86_ops->set_efer(vcpu, sregs->efer); -	kvm_set_apic_base(vcpu, sregs->apic_base); +	apic_base_msr.data = sregs->apic_base; +	apic_base_msr.host_initiated = true; +	kvm_set_apic_base(vcpu, &apic_base_msr);  	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;  	kvm_x86_ops->set_cr0(vcpu, sregs->cr0); @@ -6688,7 +6801,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)  	if (r)  		return r;  	kvm_vcpu_reset(vcpu); -	r = kvm_mmu_setup(vcpu); +	kvm_mmu_setup(vcpu);  	vcpu_put(vcpu);  	return r; @@ -6698,6 +6811,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)  {  	int r;  	struct msr_data msr; +	struct kvm *kvm = vcpu->kvm;  	r = vcpu_load(vcpu);  	if (r) @@ -6708,6 +6822,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)  	kvm_write_tsc(vcpu, &msr);  	vcpu_put(vcpu); +	schedule_delayed_work(&kvm->arch.kvmclock_sync_work, +					KVMCLOCK_SYNC_PERIOD); +  	return r;  } @@ -6733,6 +6850,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)  	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));  	vcpu->arch.dr6 = DR6_FIXED_1; +	kvm_update_dr6(vcpu);  	vcpu->arch.dr7 = DR7_FIXED_1;  	kvm_update_dr7(vcpu); @@ -6835,6 +6953,7 @@ int kvm_arch_hardware_enable(void *garbage)  	 */  	if (backwards_tsc) {  		u64 delta_cyc = max_tsc - local_tsc; +		backwards_tsc_observed = true;  		list_for_each_entry(kvm, &vm_list, vm_list) {  			kvm_for_each_vcpu(i, vcpu, kvm) {  				vcpu->arch.tsc_offset_adjustment += delta_cyc; @@ -6940,6 +7059,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)  	vcpu->arch.ia32_tsc_adjust_msr = 0x0;  	vcpu->arch.pv_time_enabled = false; + +	vcpu->arch.guest_supported_xcr0 = 0; +	vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; +  	kvm_async_pf_hash_reset(vcpu);  	kvm_pmu_init(vcpu); @@ -6981,6 +7104,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);  	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);  	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); +	atomic_set(&kvm->arch.noncoherent_dma_count, 0);  	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */  	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); @@ -6994,6 +7118,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)  	pvclock_update_vm_gtod_copy(kvm); +	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); +	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); +  	return 0;  } @@ -7031,6 +7158,8 @@ static void kvm_free_vcpus(struct kvm *kvm)  void kvm_arch_sync_events(struct kvm *kvm)  { +	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); +	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);  	kvm_free_all_assigned_devices(kvm);  	kvm_free_pit(kvm);  } @@ -7065,7 +7194,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)  	kfree(rcu_dereference_check(kvm->arch.apic_map, 1));  } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,  			   struct kvm_memory_slot *dont)  {  	int i; @@ -7086,7 +7215,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,  	}  } -int 
kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, +			    unsigned long npages)  {  	int i; @@ -7208,8 +7338,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,  		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);  	/*  	 * Write protect all pages for dirty logging. -	 * Existing largepage mappings are destroyed here and new ones will -	 * not be created until the end of the logging. +	 * +	 * All the sptes including the large sptes which point to this +	 * slot are set to readonly. We can not create any new large +	 * spte on this slot until the end of the logging. +	 * +	 * See the comments in fast_page_fault().  	 */  	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))  		kvm_mmu_slot_remove_write_access(kvm, mem->slot); @@ -7228,6 +7362,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,  int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)  { +	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) +		kvm_x86_ops->check_nested_events(vcpu, false); +  	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&  		!vcpu->arch.apf.halted)  		|| !list_empty_careful(&vcpu->async_pf.done) @@ -7283,7 +7420,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)  	int r;  	if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || -	      is_error_page(work->page)) +	      work->wakeup_all)  		return;  	r = kvm_mmu_reload(vcpu); @@ -7393,7 +7530,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,  	struct x86_exception fault;  	trace_kvm_async_pf_ready(work->arch.token, work->gva); -	if (is_error_page(work->page)) +	if (work->wakeup_all)  		work->arch.token = ~0; /* broadcast wakeup */  	else  		kvm_del_async_pf_gfn(vcpu, work->arch.gfn); @@ -7420,6 +7557,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)  			kvm_x86_ops->interrupt_allowed(vcpu);  } +void kvm_arch_register_noncoherent_dma(struct kvm *kvm) +{ +	atomic_inc(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); + +void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) +{ +	atomic_dec(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); + +bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) +{ +	return atomic_read(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);  | 
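
The hunks around kvm_gen_kvmclock_update() above replace immediate per-vCPU kvmclock updates with a 100 ms rate-limited delayed work plus a self-rearming 300-second sync work (KVMCLOCK_UPDATE_DELAY / KVMCLOCK_SYNC_PERIOD). The standalone module below is only a minimal sketch of that delayed-work pattern, not the KVM code itself; every demo_* identifier is hypothetical and exists purely for illustration.

/* Sketch of the rate-limited update + periodic self-rearming sync pattern. */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

#define DEMO_UPDATE_DELAY msecs_to_jiffies(100)   /* mirrors KVMCLOCK_UPDATE_DELAY */
#define DEMO_SYNC_PERIOD  (300 * HZ)              /* mirrors KVMCLOCK_SYNC_PERIOD  */

static struct delayed_work demo_update_work;
static struct delayed_work demo_sync_work;

static void demo_update_fn(struct work_struct *work)
{
	/* In KVM this is where each vCPU gets KVM_REQ_CLOCK_UPDATE set. */
	pr_info("demo: coalesced clock update ran\n");
}

static void demo_sync_fn(struct work_struct *work)
{
	/* Kick an immediate update, then re-arm the periodic sync. */
	schedule_delayed_work(&demo_update_work, 0);
	schedule_delayed_work(&demo_sync_work, DEMO_SYNC_PERIOD);
}

/*
 * Callers may invoke this as often as they like; at most one update runs
 * per DEMO_UPDATE_DELAY, because schedule_delayed_work() does nothing
 * while the work is still pending.
 */
static void demo_request_update(void)
{
	schedule_delayed_work(&demo_update_work, DEMO_UPDATE_DELAY);
}

static int __init demo_init(void)
{
	INIT_DELAYED_WORK(&demo_update_work, demo_update_fn);
	INIT_DELAYED_WORK(&demo_sync_work, demo_sync_fn);
	schedule_delayed_work(&demo_sync_work, DEMO_SYNC_PERIOD);
	demo_request_update();
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_sync_work);
	cancel_delayed_work_sync(&demo_update_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The rate limiting falls out of schedule_delayed_work() itself: queuing is a no-op while the work is already pending, so a burst of demo_request_update() calls within one window collapses into a single run, which is exactly how the patch bounds the cost of kvmclock updates on guests with many vCPUs.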
