From 67253af52e9133fb4cfbf7a2448a2d3524d1fa6c Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Fri, 25 Apr 2008 10:20:22 +0800
Subject: KVM: Add kvm_x86_ops get_tdp_level()

The function get_tdp_level() provided the number of tdp level for EPT and
NPT rather than the NPT specific macro.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/asm-x86/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 9d963cd6533..897a1be24cf 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -422,6 +422,7 @@ struct kvm_x86_ops {
 				       struct kvm_run *run);
 
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+	int (*get_tdp_level)(void);
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
-- 
cgit v1.2.3-70-g09d2


From 7b52345e2c4c7333bf7eba8034ffc4683fa63c91 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Fri, 25 Apr 2008 21:13:50 +0800
Subject: KVM: MMU: Add EPT support

Enable kvm_set_spte() to generate EPT entries.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 43 +++++++++++++++++++++++++++++++++----------
 arch/x86/kvm/x86.c         |  3 +++
 include/asm-x86/kvm_host.h |  3 +++
 3 files changed, 39 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 20fb3c852db..c28a36b4cbb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -152,6 +152,12 @@ static struct kmem_cache *mmu_page_header_cache;
 
 static u64 __read_mostly shadow_trap_nonpresent_pte;
 static u64 __read_mostly shadow_notrap_nonpresent_pte;
+static u64 __read_mostly shadow_base_present_pte;
+static u64 __read_mostly shadow_nx_mask;
+static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
+static u64 __read_mostly shadow_user_mask;
+static u64 __read_mostly shadow_accessed_mask;
+static u64 __read_mostly shadow_dirty_mask;
 
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
@@ -160,6 +166,23 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 
+void kvm_mmu_set_base_ptes(u64 base_pte)
+{
+	shadow_base_present_pte = base_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
+
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+		u64 dirty_mask, u64 nx_mask, u64 x_mask)
+{
+	shadow_user_mask = user_mask;
+	shadow_accessed_mask = accessed_mask;
+	shadow_dirty_mask = dirty_mask;
+	shadow_nx_mask = nx_mask;
+	shadow_x_mask = x_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.cr0 & X86_CR0_WP;
@@ -198,7 +221,7 @@ static int is_writeble_pte(unsigned long pte)
 
 static int is_dirty_pte(unsigned long pte)
 {
-	return pte & PT_DIRTY_MASK;
+	return pte & shadow_dirty_mask;
 }
 
 static int is_rmap_pte(u64 pte)
@@ -513,7 +536,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 		return;
 	sp = page_header(__pa(spte));
 	pfn = spte_to_pfn(*spte);
-	if (*spte & PT_ACCESSED_MASK)
+	if (*spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
 	if (is_writeble_pte(*spte))
 		kvm_release_pfn_dirty(pfn);
@@ -1039,17 +1062,17 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	 * whether the guest actually used the pte (in order to detect
 	 * demand paging).
 	 */
-	spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+	spte = shadow_base_present_pte | shadow_dirty_mask;
 	if (!speculative)
 		pte_access |= PT_ACCESSED_MASK;
 	if (!dirty)
 		pte_access &= ~ACC_WRITE_MASK;
-	if (!(pte_access & ACC_EXEC_MASK))
-		spte |= PT64_NX_MASK;
-
-	spte |= PT_PRESENT_MASK;
+	if (pte_access & ACC_EXEC_MASK)
+		spte |= shadow_x_mask;
+	else
+		spte |= shadow_nx_mask;
 	if (pte_access & ACC_USER_MASK)
-		spte |= PT_USER_MASK;
+		spte |= shadow_user_mask;
 	if (largepage)
 		spte |= PT_PAGE_SIZE_MASK;
 
@@ -1155,7 +1178,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			}
 
 			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
-				| PT_WRITABLE_MASK | PT_USER_MASK;
+				| PT_WRITABLE_MASK | shadow_user_mask;
 		}
 		table_addr = table[index] & PT64_BASE_ADDR_MASK;
 	}
@@ -1599,7 +1622,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 {
 	u64 *spte = vcpu->arch.last_pte_updated;
 
-	return !!(spte && (*spte & PT_ACCESSED_MASK));
+	return !!(spte && (*spte & shadow_accessed_mask));
 }
 
 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ce556372a4..0735efbfa71 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2417,6 +2417,9 @@ int kvm_arch_init(void *opaque)
 
 	kvm_x86_ops = ops;
 	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
+	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
+			PT_DIRTY_MASK, PT64_NX_MASK, 0);
 	return 0;
 
 out:
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 897a1be24cf..d1dedda958f 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -434,6 +434,9 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_mmu_set_base_ptes(u64 base_pte);
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-- 
cgit v1.2.3-70-g09d2


From b7ebfb0509692cd923e31650f81ed4d79c9a3e59 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Fri, 25 Apr 2008 21:44:52 +0800
Subject: KVM: VMX: Prepare an identity page table for EPT in real mode

[aliguory: plug leak]

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c         | 79 ++++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx.h         |  3 ++
 arch/x86/kvm/x86.c         |  2 ++
 include/asm-x86/kvm_host.h |  3 ++
 4 files changed, 84 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 98e4f2b036d..de5f6150f2f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -87,7 +87,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
-static int init_rmode_tss(struct kvm *kvm);
+static int init_rmode(struct kvm *kvm);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1304,7 +1304,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
 	kvm_mmu_reset_context(vcpu);
-	init_rmode_tss(vcpu->kvm);
+	init_rmode(vcpu->kvm);
 }
 
 #ifdef CONFIG_X86_64
@@ -1578,6 +1578,41 @@ out:
 	return ret;
 }
 
+static int init_rmode_identity_map(struct kvm *kvm)
+{
+	int i, r, ret;
+	pfn_t identity_map_pfn;
+	u32 tmp;
+
+	if (!vm_need_ept())
+		return 1;
+	if (unlikely(!kvm->arch.ept_identity_pagetable)) {
+		printk(KERN_ERR "EPT: identity-mapping pagetable "
+			"haven't been allocated!\n");
+		return 0;
+	}
+	if (likely(kvm->arch.ept_identity_pagetable_done))
+		return 1;
+	ret = 0;
+	identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
+	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
+	if (r < 0)
+		goto out;
+	/* Set up identity-mapping pagetable for EPT in real mode */
+	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+		r = kvm_write_guest_page(kvm, identity_map_pfn,
+				&tmp, i * sizeof(tmp), sizeof(tmp));
+		if (r < 0)
+			goto out;
+	}
+	kvm->arch.ept_identity_pagetable_done = true;
+	ret = 1;
+out:
+	return ret;
+}
+
 static void seg_setup(int seg)
 {
 	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1612,6 +1647,31 @@ out:
 	return r;
 }
 
+static int alloc_identity_pagetable(struct kvm *kvm)
+{
+	struct kvm_userspace_memory_region kvm_userspace_mem;
+	int r = 0;
+
+	down_write(&kvm->slots_lock);
+	if (kvm->arch.ept_identity_pagetable)
+		goto out;
+	kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+	kvm_userspace_mem.flags = 0;
+	kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+	kvm_userspace_mem.memory_size = PAGE_SIZE;
+	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+	if (r)
+		goto out;
+
+	down_read(&current->mm->mmap_sem);
+	kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
+			VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+out:
+	up_write(&kvm->slots_lock);
+	return r;
+}
+
 static void allocate_vpid(struct vcpu_vmx *vmx)
 {
 	int vpid;
@@ -1775,6 +1835,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	return 0;
 }
 
+static int init_rmode(struct kvm *kvm)
+{
+	if (!init_rmode_tss(kvm))
+		return 0;
+	if (!init_rmode_identity_map(kvm))
+		return 0;
+	return 1;
+}
+
 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1782,7 +1851,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	int ret;
 
 	down_read(&vcpu->kvm->slots_lock);
-	if (!init_rmode_tss(vmx->vcpu.kvm)) {
+	if (!init_rmode(vmx->vcpu.kvm)) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -2759,6 +2828,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		if (alloc_apic_access_page(kvm) != 0)
 			goto free_vmcs;
 
+	if (vm_need_ept())
+		if (alloc_identity_pagetable(kvm) != 0)
+			goto free_vmcs;
+
 	return &vmx->vcpu;
 
 free_vmcs:
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 093b085daf6..f97eccc754e 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -340,6 +340,7 @@ enum vmcs_field {
 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
 
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
 
 #define VMX_NR_VPIDS				(1 << 16)
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT		1
@@ -353,4 +354,6 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 #define VMX_EPT_DEFAULT_GAW			3
 
+#define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0735efbfa71..1842a86f7c3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3909,6 +3909,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_free_physmem(kvm);
 	if (kvm->arch.apic_access_page)
 		put_page(kvm->arch.apic_access_page);
+	if (kvm->arch.ept_identity_pagetable)
+		put_page(kvm->arch.ept_identity_pagetable);
 	kfree(kvm);
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d1dedda958f..e24afaa64a4 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -314,6 +314,9 @@ struct kvm_arch{
 	struct page *apic_access_page;
 
 	gpa_t wall_clock;
+
+	struct page *ept_identity_pagetable;
+	bool ept_identity_pagetable_done;
 };
 
 struct kvm_vm_stat {
-- 
cgit v1.2.3-70-g09d2


From 1439442c7b257b47a83aea4daed8fbf4a32cdff9 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Mon, 28 Apr 2008 12:24:45 +0800
Subject: KVM: VMX: Enable EPT feature for KVM

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         |   5 +-
 arch/x86/kvm/vmx.c         | 229 +++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx.h         |   9 ++
 include/asm-x86/kvm_host.h |   1 +
 4 files changed, 234 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3dbedf16973..d9344be3644 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1177,8 +1177,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 				return -ENOMEM;
 			}
 
-			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
-				| PT_WRITABLE_MASK | shadow_user_mask;
+			table[index] = __pa(new_table->spt)
+				| PT_PRESENT_MASK | PT_WRITABLE_MASK
+				| shadow_user_mask | shadow_x_mask;
 		}
 		table_addr = table[index] & PT64_BASE_ADDR_MASK;
 	}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index de5f6150f2f..bfe4db11989 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,7 +42,7 @@ module_param(enable_vpid, bool, 0);
 static int flexpriority_enabled = 1;
 module_param(flexpriority_enabled, bool, 0);
 
-static int enable_ept;
+static int enable_ept = 1;
 module_param(enable_ept, bool, 0);
 
 struct vmcs {
@@ -284,6 +284,18 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 		  : : "a"(&operand), "c"(ext) : "cc", "memory");
 }
 
+static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+{
+	struct {
+		u64 eptp, gpa;
+	} operand = {eptp, gpa};
+
+	asm volatile (ASM_VMX_INVEPT
+			/* CF==1 or ZF==1 --> rc = -1 */
+			"; ja 1f ; ud2 ; 1:\n"
+			: : "a" (&operand), "c" (ext) : "cc", "memory");
+}
+
 static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -335,6 +347,33 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
 	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 }
 
+static inline void ept_sync_global(void)
+{
+	if (cpu_has_vmx_invept_global())
+		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+	if (vm_need_ept()) {
+		if (cpu_has_vmx_invept_context())
+			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+		else
+			ept_sync_global();
+	}
+}
+
+static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
+{
+	if (vm_need_ept()) {
+		if (cpu_has_vmx_invept_individual_addr())
+			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
+					eptp, gpa);
+		else
+			ept_sync_context(eptp);
+	}
+}
+
 static unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
@@ -422,6 +461,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		eb |= 1u << 1;
 	if (vcpu->arch.rmode.active)
 		eb = ~0;
+	if (vm_need_ept())
+		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -1352,8 +1393,64 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
 }
 
+static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+{
+	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+		if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
+			printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
+			return;
+		}
+		vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
+		vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
+		vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
+		vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
+	}
+}
+
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+
+static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
+					unsigned long cr0,
+					struct kvm_vcpu *vcpu)
+{
+	if (!(cr0 & X86_CR0_PG)) {
+		/* From paging/starting to nonpaging */
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+			     vmcs_config.cpu_based_exec_ctrl |
+			     (CPU_BASED_CR3_LOAD_EXITING |
+			      CPU_BASED_CR3_STORE_EXITING));
+		vcpu->arch.cr0 = cr0;
+		vmx_set_cr4(vcpu, vcpu->arch.cr4);
+		*hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
+		*hw_cr0 &= ~X86_CR0_WP;
+	} else if (!is_paging(vcpu)) {
+		/* From nonpaging to paging */
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+			     vmcs_config.cpu_based_exec_ctrl &
+			     ~(CPU_BASED_CR3_LOAD_EXITING |
+			       CPU_BASED_CR3_STORE_EXITING));
+		vcpu->arch.cr0 = cr0;
+		vmx_set_cr4(vcpu, vcpu->arch.cr4);
+		if (!(vcpu->arch.cr0 & X86_CR0_WP))
+			*hw_cr0 &= ~X86_CR0_WP;
+	}
+}
+
+static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
+					struct kvm_vcpu *vcpu)
+{
+	if (!is_paging(vcpu)) {
+		*hw_cr4 &= ~X86_CR4_PAE;
+		*hw_cr4 |= X86_CR4_PSE;
+	} else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
+		*hw_cr4 &= ~X86_CR4_PAE;
+}
+
 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
+	unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
+				KVM_VM_CR0_ALWAYS_ON;
+
 	vmx_fpu_deactivate(vcpu);
 
 	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
@@ -1371,29 +1468,61 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	}
 #endif
 
+	if (vm_need_ept())
+		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+
 	vmcs_writel(CR0_READ_SHADOW, cr0);
-	vmcs_writel(GUEST_CR0,
-		    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
+	vmcs_writel(GUEST_CR0, hw_cr0);
 	vcpu->arch.cr0 = cr0;
 
 	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
 		vmx_fpu_activate(vcpu);
 }
 
+static u64 construct_eptp(unsigned long root_hpa)
+{
+	u64 eptp;
+
+	/* TODO write the value reading from MSR */
+	eptp = VMX_EPT_DEFAULT_MT |
+		VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+	eptp |= (root_hpa & PAGE_MASK);
+
+	return eptp;
+}
+
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	unsigned long guest_cr3;
+	u64 eptp;
+
+	guest_cr3 = cr3;
+	if (vm_need_ept()) {
+		eptp = construct_eptp(cr3);
+		vmcs_write64(EPT_POINTER, eptp);
+		ept_sync_context(eptp);
+		ept_load_pdptrs(vcpu);
+		guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+			VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+	}
+
 	vmx_flush_tlb(vcpu);
-	vmcs_writel(GUEST_CR3, cr3);
+	vmcs_writel(GUEST_CR3, guest_cr3);
 	if (vcpu->arch.cr0 & X86_CR0_PE)
 		vmx_fpu_deactivate(vcpu);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	vmcs_writel(CR4_READ_SHADOW, cr4);
-	vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
-		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
+	unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
+		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+
 	vcpu->arch.cr4 = cr4;
+	if (vm_need_ept())
+		ept_update_paging_mode_cr4(&hw_cr4, vcpu);
+
+	vmcs_writel(CR4_READ_SHADOW, cr4);
+	vmcs_writel(GUEST_CR4, hw_cr4);
 }
 
 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2116,6 +2245,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
+		/* EPT won't cause page fault directly */
+		if (vm_need_ept())
+			BUG();
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
 			    (u32)((u64)cr2 >> 32), handler);
@@ -2445,6 +2577,64 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return kvm_task_switch(vcpu, tss_selector, reason);
 }
 
+static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u64 exit_qualification;
+	enum emulation_result er;
+	gpa_t gpa;
+	unsigned long hva;
+	int gla_validity;
+	int r;
+
+	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+
+	if (exit_qualification & (1 << 6)) {
+		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
+		return -ENOTSUPP;
+	}
+
+	gla_validity = (exit_qualification >> 7) & 0x3;
+	if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
+		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+			(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+		printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+			(long unsigned int)exit_qualification);
+		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+		kvm_run->hw.hardware_exit_reason = 0;
+		return -ENOTSUPP;
+	}
+
+	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (!kvm_is_error_hva(hva)) {
+		r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+		if (r < 0) {
+			printk(KERN_ERR "EPT: Not enough memory!\n");
+			return -ENOMEM;
+		}
+		return 1;
+	} else {
+		/* must be MMIO */
+		er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+		if (er == EMULATE_FAIL) {
+			printk(KERN_ERR
+			 "EPT: Fail to handle EPT violation vmexit!er is %d\n",
+			 er);
+			printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+			 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+			 (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+			printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+				(long unsigned int)exit_qualification);
+			return -ENOTSUPP;
+		} else if (er == EMULATE_DO_MMIO)
+			return 0;
+	}
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2468,6 +2658,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
+	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -2486,6 +2677,13 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
 		    (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
 
+	/* Access CR3 don't cause VMExit in paging mode, so we need
+	 * to sync with guest real CR3. */
+	if (vm_need_ept() && is_paging(vcpu)) {
+		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+		ept_load_pdptrs(vcpu);
+	}
+
 	if (unlikely(vmx->fail)) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2494,7 +2692,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	}
 
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
-				exit_reason != EXIT_REASON_EXCEPTION_NMI)
+			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
+			exit_reason != EXIT_REASON_EPT_VIOLATION))
 		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
 		       "exit reason is 0x%x\n", __func__, exit_reason);
 	if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2796,6 +2995,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		return ERR_PTR(-ENOMEM);
 
 	allocate_vpid(vmx);
+	if (id == 0 && vm_need_ept()) {
+		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
+			VMX_EPT_WRITABLE_MASK |
+			VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+		kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
+				VMX_EPT_FAKE_DIRTY_MASK, 0ull,
+				VMX_EPT_EXECUTABLE_MASK);
+		kvm_enable_tdp();
+	}
 
 	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
 	if (err)
@@ -2975,9 +3183,14 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
 	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
 
+	if (cpu_has_vmx_ept())
+		bypass_guest_pf = 0;
+
 	if (bypass_guest_pf)
 		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
 
+	ept_sync_global();
+
 	return 0;
 
 out2:
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index f97eccc754e..79d94c610df 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -353,6 +353,15 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 #define VMX_EPT_DEFAULT_GAW			3
+#define VMX_EPT_MAX_GAW				0x4
+#define VMX_EPT_MT_EPTE_SHIFT			3
+#define VMX_EPT_GAW_EPTP_SHIFT			3
+#define VMX_EPT_DEFAULT_MT			0x6ull
+#define VMX_EPT_READABLE_MASK			0x1ull
+#define VMX_EPT_WRITABLE_MASK			0x2ull
+#define VMX_EPT_EXECUTABLE_MASK			0x4ull
+#define VMX_EPT_FAKE_ACCESSED_MASK		(1ull << 62)
+#define VMX_EPT_FAKE_DIRTY_MASK			(1ull << 63)
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index e24afaa64a4..4baa9c9e1f4 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -651,6 +651,7 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
 #define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
 #define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
 #define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT		  ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
 #define ASM_VMX_INVVPID		  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
 
 #define MSR_IA32_TIME_STAMP_COUNTER		0x010
-- 
cgit v1.2.3-70-g09d2


From 45c5eb67da5a668abe79c23a7e64dbc87a600f90 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Apr 2008 17:55:49 -0500
Subject: KVM: ppc: Handle guest idle by emulating MSR[WE] writes

This reduces host CPU usage when the guest is idle. However, the guest must
set MSR[WE] in its idle loop, which Linux did not do until 2.6.26.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/kvm/booke_guest.c |  1 +
 arch/powerpc/kvm/powerpc.c     | 20 +++++++++++++++++---
 include/asm-powerpc/kvm_host.h |  1 +
 include/asm-powerpc/kvm_ppc.h  |  5 +++++
 4 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
index 6d9884a6884..b3db6f4576a 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -49,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "inst_emu",   VCPU_STAT(emulated_inst_exits) },
 	{ "dec",        VCPU_STAT(dec_exits) },
 	{ "ext_intr",   VCPU_STAT(ext_intr_exits) },
+	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ NULL }
 };
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index bad40bd2d3a..777e0f34e0e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -36,13 +36,12 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
-	/* XXX implement me */
-	return 0;
+	return !!(v->arch.pending_exceptions);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return 1;
+	return !(v->arch.msr & MSR_WE);
 }
 
 
@@ -214,6 +213,11 @@ static void kvmppc_decrementer_func(unsigned long data)
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 
 	kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER);
+
+	if (waitqueue_active(&vcpu->wq)) {
+		wake_up_interruptible(&vcpu->wq);
+		vcpu->stat.halt_wakeup++;
+	}
 }
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
@@ -339,6 +343,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	int r;
 	sigset_t sigsaved;
 
+	vcpu_load(vcpu);
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -363,12 +369,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
+	vcpu_put(vcpu);
+
 	return r;
 }
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
 	kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
+
+	if (waitqueue_active(&vcpu->wq)) {
+		wake_up_interruptible(&vcpu->wq);
+		vcpu->stat.halt_wakeup++;
+	}
+
 	return 0;
 }
 
diff --git a/include/asm-powerpc/kvm_host.h b/include/asm-powerpc/kvm_host.h
index 04ffbb8e0a3..81a69d71101 100644
--- a/include/asm-powerpc/kvm_host.h
+++ b/include/asm-powerpc/kvm_host.h
@@ -59,6 +59,7 @@ struct kvm_vcpu_stat {
 	u32 emulated_inst_exits;
 	u32 dec_exits;
 	u32 ext_intr_exits;
+	u32 halt_wakeup;
 };
 
 struct tlbe {
diff --git a/include/asm-powerpc/kvm_ppc.h b/include/asm-powerpc/kvm_ppc.h
index 7ac820308a7..b35a7e3ef97 100644
--- a/include/asm-powerpc/kvm_ppc.h
+++ b/include/asm-powerpc/kvm_ppc.h
@@ -77,12 +77,17 @@ static inline void kvmppc_clear_exception(struct kvm_vcpu *vcpu, int exception)
 	clear_bit(priority, &vcpu->arch.pending_exceptions);
 }
 
+/* Helper function for "full" MSR writes. No need to call this if only EE is
+ * changing. */
 static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 {
 	if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
 		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
 
 	vcpu->arch.msr = new_msr;
+
+	if (vcpu->arch.msr & MSR_WE)
+		kvm_vcpu_block(vcpu);
 }
 
 #endif /* __POWERPC_KVM_PPC_H__ */
-- 
cgit v1.2.3-70-g09d2


From bc1a34f1bf354fabc03e3f465620c80e510d0e8f Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <andrea@qumranet.com>
Date: Thu, 1 May 2008 18:43:33 +0200
Subject: KVM: avoid fx_init() schedule in atomic

This make sure not to schedule in atomic during fx_init. I also
changed the name of fpu_init to fx_finit to avoid duplicating the name
with fpu_init that is already used in the kernel, this makes grep
simpler if nothing else.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c         | 11 ++++++++++-
 include/asm-x86/kvm_host.h |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bc224bba1e8..21338bdb28f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3703,10 +3703,19 @@ void fx_init(struct kvm_vcpu *vcpu)
 {
 	unsigned after_mxcsr_mask;
 
+	/*
+	 * Touch the fpu the first time in non atomic context as if
+	 * this is the first fpu instruction the exception handler
+	 * will fire before the instruction returns and it'll have to
+	 * allocate ram with GFP_KERNEL.
+	 */
+	if (!used_math())
+		fx_save(&vcpu->arch.host_fx_image);
+
 	/* Initialize guest FPU by resetting ours and saving into guest's */
 	preempt_disable();
 	fx_save(&vcpu->arch.host_fx_image);
-	fpu_init();
+	fx_finit();
 	fx_save(&vcpu->arch.guest_fx_image);
 	fx_restore(&vcpu->arch.host_fx_image);
 	preempt_enable();
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 4baa9c9e1f4..1d8cd01fa51 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -627,7 +627,7 @@ static inline void fx_restore(struct i387_fxsave_struct *image)
 	asm("fxrstor (%0)":: "r" (image));
 }
 
-static inline void fpu_init(void)
+static inline void fx_finit(void)
 {
 	asm("finit");
 }
-- 
cgit v1.2.3-70-g09d2