101 files changed, 4926 insertions, 1289 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f6ec3a92e62..a4df5535996 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1194,12 +1194,15 @@ struct kvm_ppc_pvinfo {
 This ioctl fetches PV specific information that need to be passed to the guest
 using the device tree or other means from vm context.
 
-For now the only implemented piece of information distributed here is an array
-of 4 instructions that make up a hypercall.
+The hcall array defines 4 instructions that make up a hypercall.
 
 If any additional field gets added to this structure later on, a bit for that
 additional piece of information will be set in the flags bitmap.
 
+The flags bitmap is defined as:
+
+   /* the host supports the ePAPR idle hcall
+   #define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
 
 4.48 KVM_ASSIGN_PCI_DEVICE
 
@@ -1731,7 +1734,46 @@ registers, find a list below:
   Arch  |       Register        | Width (bits)
         |                       |
   PPC   | KVM_REG_PPC_HIOR      | 64
-
+  PPC   | KVM_REG_PPC_IAC1      | 64
+  PPC   | KVM_REG_PPC_IAC2      | 64
+  PPC   | KVM_REG_PPC_IAC3      | 64
+  PPC   | KVM_REG_PPC_IAC4      | 64
+  PPC   | KVM_REG_PPC_DAC1      | 64
+  PPC   | KVM_REG_PPC_DAC2      | 64
+  PPC   | KVM_REG_PPC_DABR      | 64
+  PPC   | KVM_REG_PPC_DSCR      | 64
+  PPC   | KVM_REG_PPC_PURR      | 64
+  PPC   | KVM_REG_PPC_SPURR     | 64
+  PPC   | KVM_REG_PPC_DAR       | 64
+  PPC   | KVM_REG_PPC_DSISR     | 32
+  PPC   | KVM_REG_PPC_AMR       | 64
+  PPC   | KVM_REG_PPC_UAMOR     | 64
+  PPC   | KVM_REG_PPC_MMCR0     | 64
+  PPC   | KVM_REG_PPC_MMCR1     | 64
+  PPC   | KVM_REG_PPC_MMCRA     | 64
+  PPC   | KVM_REG_PPC_PMC1      | 32
+  PPC   | KVM_REG_PPC_PMC2      | 32
+  PPC   | KVM_REG_PPC_PMC3      | 32
+  PPC   | KVM_REG_PPC_PMC4      | 32
+  PPC   | KVM_REG_PPC_PMC5      | 32
+  PPC   | KVM_REG_PPC_PMC6      | 32
+  PPC   | KVM_REG_PPC_PMC7      | 32
+  PPC   | KVM_REG_PPC_PMC8      | 32
+  PPC   | KVM_REG_PPC_FPR0      | 64
+          ...
+  PPC   | KVM_REG_PPC_FPR31     | 64
+  PPC   | KVM_REG_PPC_VR0       | 128
+          ...
+  PPC   | KVM_REG_PPC_VR31      | 128
+  PPC   | KVM_REG_PPC_VSR0      | 128
+          ...
+  PPC   | KVM_REG_PPC_VSR31     | 128
+  PPC   | KVM_REG_PPC_FPSCR     | 64
+  PPC   | KVM_REG_PPC_VSCR      | 32
+  PPC   | KVM_REG_PPC_VPA_ADDR  | 64
+  PPC   | KVM_REG_PPC_VPA_SLB   | 128
+  PPC   | KVM_REG_PPC_VPA_DTL   | 128
+  PPC   | KVM_REG_PPC_EPCR	| 32
 
 4.69 KVM_GET_ONE_REG
 
@@ -1747,7 +1789,7 @@ kvm_one_reg struct passed in. On success, the register value can be found
 at the memory location pointed to by "addr".
 
 The list of registers accessible using this interface is identical to the
-list in 4.64.
+list in 4.68.
 
 
 4.70 KVM_KVMCLOCK_CTRL
@@ -1997,6 +2039,93 @@ return the hash table order in the parameter.  (If the guest is using
 the virtualized real-mode area (VRMA) facility, the kernel will
 re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
 
+4.77 KVM_S390_INTERRUPT
+
+Capability: basic
+Architectures: s390
+Type: vm ioctl, vcpu ioctl
+Parameters: struct kvm_s390_interrupt (in)
+Returns: 0 on success, -1 on error
+
+Allows to inject an interrupt to the guest. Interrupts can be floating
+(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type.
+
+Interrupt parameters are passed via kvm_s390_interrupt:
+
+struct kvm_s390_interrupt {
+	__u32 type;
+	__u32 parm;
+	__u64 parm64;
+};
+
+type can be one of the following:
+
+KVM_S390_SIGP_STOP (vcpu) - sigp restart
+KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm
+KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm
+KVM_S390_RESTART (vcpu) - restart
+KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt
+			   parameters in parm and parm64
+KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
+KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm
+KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
+
+Note that the vcpu ioctl is asynchronous to vcpu execution.
+
+4.78 KVM_PPC_GET_HTAB_FD
+
+Capability: KVM_CAP_PPC_HTAB_FD
+Architectures: powerpc
+Type: vm ioctl
+Parameters: Pointer to struct kvm_get_htab_fd (in)
+Returns: file descriptor number (>= 0) on success, -1 on error
+
+This returns a file descriptor that can be used either to read out the
+entries in the guest's hashed page table (HPT), or to write entries to
+initialize the HPT.  The returned fd can only be written to if the
+KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
+can only be read if that bit is clear.  The argument struct looks like
+this:
+
+/* For KVM_PPC_GET_HTAB_FD */
+struct kvm_get_htab_fd {
+	__u64	flags;
+	__u64	start_index;
+	__u64	reserved[2];
+};
+
+/* Values for kvm_get_htab_fd.flags */
+#define KVM_GET_HTAB_BOLTED_ONLY	((__u64)0x1)
+#define KVM_GET_HTAB_WRITE		((__u64)0x2)
+
+The `start_index' field gives the index in the HPT of the entry at
+which to start reading.  It is ignored when writing.
+
+Reads on the fd will initially supply information about all
+"interesting" HPT entries.  Interesting entries are those with the
+bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise
+all entries.  When the end of the HPT is reached, the read() will
+return.  If read() is called again on the fd, it will start again from
+the beginning of the HPT, but will only return HPT entries that have
+changed since they were last read.
+
+Data read or written is structured as a header (8 bytes) followed by a
+series of valid HPT entries (16 bytes) each.  The header indicates how
+many valid HPT entries there are and how many invalid entries follow
+the valid entries.  The invalid entries are not represented explicitly
+in the stream.  The header format is:
+
+struct kvm_get_htab_header {
+	__u32	index;
+	__u16	n_valid;
+	__u16	n_invalid;
+};
+
+Writes to the fd create HPT entries starting at the index given in the
+header; first `n_valid' valid entries with contents from the data
+written, then `n_invalid' invalid entries, invalidating any previously
+valid entries found.
+
 
 5. The kvm_run structure
 ------------------------
@@ -2109,7 +2238,8 @@ executed a memory-mapped I/O instruction which could not be satisfied
 by kvm.  The 'data' member contains the written data if 'is_write' is
 true, and should be filled by application code otherwise.
 
-NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding
+NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR
+      and KVM_EXIT_PAPR the corresponding
 operations are complete (and guest state is consistent) only after userspace
 has re-entered the kernel with KVM_RUN.  The kernel side will first finish
 incomplete operations and then check for pending signals.  Userspace
diff --git a/MAINTAINERS b/MAINTAINERS
index b0b880da6e5..42f07ea5bbc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4314,10 +4314,10 @@ F:	include/linux/kvm*
 F:	virt/kvm/
 
 KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
-M:	Joerg Roedel <joerg.roedel@amd.com>
+M:	Joerg Roedel <joro@8bytes.org>
 L:	kvm@vger.kernel.org
 W:	http://kvm.qumranet.com
-S:	Supported
+S:	Maintained
 F:	arch/x86/include/asm/svm.h
 F:	arch/x86/kvm/svm.c
 
@@ -4325,6 +4325,7 @@ KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC
 M:	Alexander Graf <agraf@suse.de>
 L:	kvm-ppc@vger.kernel.org
 W:	http://kvm.qumranet.com
+T:	git git://github.com/agraf/linux-2.6.git
 S:	Supported
 F:	arch/powerpc/include/asm/kvm*
 F:	arch/powerpc/kvm/
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0a88cb5d316..bd1c5155503 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1330,6 +1330,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 	return -EINVAL;
@@ -1362,11 +1367,9 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 	int j;
-	unsigned long base_gfn;
 
 	slots = kvm_memslots(kvm);
 	kvm_for_each_memslot(memslot, slots) {
-		base_gfn = memslot->base_gfn;
 		for (j = 0; j < memslot->npages; j++) {
 			if (memslot->rmap[j])
 				put_page((struct page *)memslot->rmap[j]);
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 2d62b484b3f..650757c300d 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -1,5 +1,4 @@
 
-
 generic-y += clkdev.h
 generic-y += rwsem.h
 generic-y += trace_clock.h
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index bf2c06c3387..d3d634274d2 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -50,64 +50,13 @@
 #ifndef _EPAPR_HCALLS_H
 #define _EPAPR_HCALLS_H
 
+#include <uapi/asm/epapr_hcalls.h>
+
+#ifndef __ASSEMBLY__
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <asm/byteorder.h>
 
-#define EV_BYTE_CHANNEL_SEND		1
-#define EV_BYTE_CHANNEL_RECEIVE		2
-#define EV_BYTE_CHANNEL_POLL		3
-#define EV_INT_SET_CONFIG		4
-#define EV_INT_GET_CONFIG		5
-#define EV_INT_SET_MASK			6
-#define EV_INT_GET_MASK			7
-#define EV_INT_IACK			9
-#define EV_INT_EOI			10
-#define EV_INT_SEND_IPI			11
-#define EV_INT_SET_TASK_PRIORITY	12
-#define EV_INT_GET_TASK_PRIORITY	13
-#define EV_DOORBELL_SEND		14
-#define EV_MSGSND			15
-#define EV_IDLE				16
-
-/* vendor ID: epapr */
-#define EV_LOCAL_VENDOR_ID		0	/* for private use */
-#define EV_EPAPR_VENDOR_ID		1
-#define EV_FSL_VENDOR_ID		2	/* Freescale Semiconductor */
-#define EV_IBM_VENDOR_ID		3	/* IBM */
-#define EV_GHS_VENDOR_ID		4	/* Green Hills Software */
-#define EV_ENEA_VENDOR_ID		5	/* Enea */
-#define EV_WR_VENDOR_ID			6	/* Wind River Systems */
-#define EV_AMCC_VENDOR_ID		7	/* Applied Micro Circuits */
-#define EV_KVM_VENDOR_ID		42	/* KVM */
-
-/* The max number of bytes that a byte channel can send or receive per call */
-#define EV_BYTE_CHANNEL_MAX_BYTES	16
-
-
-#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
-#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
-
-/* epapr error codes */
-#define EV_EPERM		1	/* Operation not permitted */
-#define EV_ENOENT		2	/*  Entry Not Found */
-#define EV_EIO			3	/* I/O error occured */
-#define EV_EAGAIN		4	/* The operation had insufficient
-					 * resources to complete and should be
-					 * retried
-					 */
-#define EV_ENOMEM		5	/* There was insufficient memory to
-					 * complete the operation */
-#define EV_EFAULT		6	/* Bad guest address */
-#define EV_ENODEV		7	/* No such device */
-#define EV_EINVAL		8	/* An argument supplied to the hcall
-					   was out of range or invalid */
-#define EV_INTERNAL		9	/* An internal error occured */
-#define EV_CONFIG		10	/* A configuration error was detected */
-#define EV_INVALID_STATE	11	/* The object is in an invalid state */
-#define EV_UNIMPLEMENTED	12	/* Unimplemented hypercall */
-#define EV_BUFFER_OVERFLOW	13	/* Caller-supplied buffer too small */
-
 /*
  * Hypercall register clobber list
  *
@@ -193,7 +142,7 @@ static inline unsigned int ev_int_set_config(unsigned int interrupt,
 	r5  = priority;
 	r6  = destination;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
 		: : EV_HCALL_CLOBBERS4
 	);
@@ -222,7 +171,7 @@ static inline unsigned int ev_int_get_config(unsigned int interrupt,
 	r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG);
 	r3 = interrupt;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6)
 		: : EV_HCALL_CLOBBERS4
 	);
@@ -252,7 +201,7 @@ static inline unsigned int ev_int_set_mask(unsigned int interrupt,
 	r3 = interrupt;
 	r4 = mask;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "+r" (r4)
 		: : EV_HCALL_CLOBBERS2
 	);
@@ -277,7 +226,7 @@ static inline unsigned int ev_int_get_mask(unsigned int interrupt,
 	r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK);
 	r3 = interrupt;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "=r" (r4)
 		: : EV_HCALL_CLOBBERS2
 	);
@@ -305,7 +254,7 @@ static inline unsigned int ev_int_eoi(unsigned int interrupt)
 	r11 = EV_HCALL_TOKEN(EV_INT_EOI);
 	r3 = interrupt;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -344,7 +293,7 @@ static inline unsigned int ev_byte_channel_send(unsigned int handle,
 	r7 = be32_to_cpu(p[2]);
 	r8 = be32_to_cpu(p[3]);
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3),
 		  "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8)
 		: : EV_HCALL_CLOBBERS6
@@ -383,7 +332,7 @@ static inline unsigned int ev_byte_channel_receive(unsigned int handle,
 	r3 = handle;
 	r4 = *count;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "+r" (r4),
 		  "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8)
 		: : EV_HCALL_CLOBBERS6
@@ -421,7 +370,7 @@ static inline unsigned int ev_byte_channel_poll(unsigned int handle,
 	r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL);
 	r3 = handle;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5)
 		: : EV_HCALL_CLOBBERS3
 	);
@@ -454,7 +403,7 @@ static inline unsigned int ev_int_iack(unsigned int handle,
 	r11 = EV_HCALL_TOKEN(EV_INT_IACK);
 	r3 = handle;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "=r" (r4)
 		: : EV_HCALL_CLOBBERS2
 	);
@@ -478,7 +427,7 @@ static inline unsigned int ev_doorbell_send(unsigned int handle)
 	r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND);
 	r3 = handle;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -498,12 +447,12 @@ static inline unsigned int ev_idle(void)
 
 	r11 = EV_HCALL_TOKEN(EV_IDLE);
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "=r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
 
 	return r3;
 }
-
-#endif
+#endif /* !__ASSEMBLY__ */
+#endif /* _EPAPR_HCALLS_H */
diff --git a/arch/powerpc/include/asm/fsl_hcalls.h b/arch/powerpc/include/asm/fsl_hcalls.h
index 922d9b5fe3d..3abb58394da 100644
--- a/arch/powerpc/include/asm/fsl_hcalls.h
+++ b/arch/powerpc/include/asm/fsl_hcalls.h
@@ -96,7 +96,7 @@ static inline unsigned int fh_send_nmi(unsigned int vcpu_mask)
 	r11 = FH_HCALL_TOKEN(FH_SEND_NMI);
 	r3 = vcpu_mask;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -151,7 +151,7 @@ static inline unsigned int fh_partition_get_dtprop(int handle,
 	r9 = (uint32_t)propvalue_addr;
 	r10 = *propvalue_len;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11),
 		  "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7),
 		  "+r" (r8), "+r" (r9), "+r" (r10)
@@ -205,7 +205,7 @@ static inline unsigned int fh_partition_set_dtprop(int handle,
 	r9 = (uint32_t)propvalue_addr;
 	r10 = propvalue_len;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11),
 		  "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7),
 		  "+r" (r8), "+r" (r9), "+r" (r10)
@@ -229,7 +229,7 @@ static inline unsigned int fh_partition_restart(unsigned int partition)
 	r11 = FH_HCALL_TOKEN(FH_PARTITION_RESTART);
 	r3 = partition;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -262,7 +262,7 @@ static inline unsigned int fh_partition_get_status(unsigned int partition,
 	r11 = FH_HCALL_TOKEN(FH_PARTITION_GET_STATUS);
 	r3 = partition;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "=r" (r4)
 		: : EV_HCALL_CLOBBERS2
 	);
@@ -295,7 +295,7 @@ static inline unsigned int fh_partition_start(unsigned int partition,
 	r4 = entry_point;
 	r5 = load;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5)
 		: : EV_HCALL_CLOBBERS3
 	);
@@ -317,7 +317,7 @@ static inline unsigned int fh_partition_stop(unsigned int partition)
 	r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP);
 	r3 = partition;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -376,7 +376,7 @@ static inline unsigned int fh_partition_memcpy(unsigned int source,
 #endif
 	r7 = count;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11),
 		  "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7)
 		: : EV_HCALL_CLOBBERS5
@@ -399,7 +399,7 @@ static inline unsigned int fh_dma_enable(unsigned int liodn)
 	r11 = FH_HCALL_TOKEN(FH_DMA_ENABLE);
 	r3 = liodn;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -421,7 +421,7 @@ static inline unsigned int fh_dma_disable(unsigned int liodn)
 	r11 = FH_HCALL_TOKEN(FH_DMA_DISABLE);
 	r3 = liodn;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -447,7 +447,7 @@ static inline unsigned int fh_vmpic_get_msir(unsigned int interrupt,
 	r11 = FH_HCALL_TOKEN(FH_VMPIC_GET_MSIR);
 	r3 = interrupt;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "=r" (r4)
 		: : EV_HCALL_CLOBBERS2
 	);
@@ -469,7 +469,7 @@ static inline unsigned int fh_system_reset(void)
 
 	r11 = FH_HCALL_TOKEN(FH_SYSTEM_RESET);
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "=r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -506,7 +506,7 @@ static inline unsigned int fh_err_get_info(int queue, uint32_t *bufsize,
 	r6 = addr_lo;
 	r7 = peek;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6),
 		  "+r" (r7)
 		: : EV_HCALL_CLOBBERS5
@@ -542,7 +542,7 @@ static inline unsigned int fh_get_core_state(unsigned int handle,
 	r3 = handle;
 	r4 = vcpu;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "+r" (r4)
 		: : EV_HCALL_CLOBBERS2
 	);
@@ -572,7 +572,7 @@ static inline unsigned int fh_enter_nap(unsigned int handle, unsigned int vcpu)
 	r3 = handle;
 	r4 = vcpu;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "+r" (r4)
 		: : EV_HCALL_CLOBBERS2
 	);
@@ -597,7 +597,7 @@ static inline unsigned int fh_exit_nap(unsigned int handle, unsigned int vcpu)
 	r3 = handle;
 	r4 = vcpu;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3), "+r" (r4)
 		: : EV_HCALL_CLOBBERS2
 	);
@@ -618,7 +618,7 @@ static inline unsigned int fh_claim_device(unsigned int handle)
 	r11 = FH_HCALL_TOKEN(FH_CLAIM_DEVICE);
 	r3 = handle;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
@@ -645,7 +645,7 @@ static inline unsigned int fh_partition_stop_dma(unsigned int handle)
 	r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP_DMA);
 	r3 = handle;
 
-	__asm__ __volatile__ ("sc 1"
+	asm volatile("bl	epapr_hypercall_start"
 		: "+r" (r11), "+r" (r3)
 		: : EV_HCALL_CLOBBERS1
 	);
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 76fdcfef088..aabcdba8f6b 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -118,6 +118,7 @@
 
 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
+#define RESUME_FLAG_ARCH1	(1<<2)
 
 #define RESUME_GUEST            0
 #define RESUME_GUEST_NV         RESUME_FLAG_NV
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 7aefdb3e1ce..5a56e1c5f85 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -81,6 +81,8 @@ struct kvmppc_vcpu_book3s {
 	u64 sdr1;
 	u64 hior;
 	u64 msr_mask;
+	u64 purr_offset;
+	u64 spurr_offset;
 #ifdef CONFIG_PPC_BOOK3S_32
 	u32 vsid_pool[VSID_POOL_SIZE];
 	u32 vsid_next;
@@ -157,10 +159,14 @@ extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
 extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
 extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			long pte_index, unsigned long pteh, unsigned long ptel);
-extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
-			long pte_index, unsigned long pteh, unsigned long ptel);
+extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
+			long pte_index, unsigned long pteh, unsigned long ptel,
+			pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
+extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+			unsigned long pte_index, unsigned long avpn,
+			unsigned long *hpret);
 extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
-			struct kvm_memory_slot *memslot);
+			struct kvm_memory_slot *memslot, unsigned long *map);
 
 extern void kvmppc_entry_trampoline(void);
 extern void kvmppc_hv_entry_trampoline(void);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 0dd1d86d3e3..38bec1dc992 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -50,6 +50,15 @@ extern int kvm_hpt_order;		/* order of preallocated HPTs */
 #define HPTE_V_HVLOCK	0x40UL
 #define HPTE_V_ABSENT	0x20UL
 
+/*
+ * We use this bit in the guest_rpte field of the revmap entry
+ * to indicate a modified HPTE.
+ */
+#define HPTE_GR_MODIFIED	(1ul << 62)
+
+/* These bits are reserved in the guest view of the HPTE */
+#define HPTE_GR_RESERVED	HPTE_GR_MODIFIED
+
 static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 {
 	unsigned long tmp, old;
@@ -60,7 +69,7 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 		     "	ori	%0,%0,%4\n"
 		     "  stdcx.	%0,0,%2\n"
 		     "	beq+	2f\n"
-		     "	li	%1,%3\n"
+		     "	mr	%1,%3\n"
 		     "2:	isync"
 		     : "=&r" (tmp), "=&r" (old)
 		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
@@ -237,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
 	return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
 }
 
+/*
+ * This works for 4k, 64k and 16M pages on POWER7,
+ * and 4k and 16M pages on PPC970.
+ */
+static inline unsigned long slb_pgsize_encoding(unsigned long psize)
+{
+	unsigned long senc = 0;
+
+	if (psize > 0x1000) {
+		senc = SLB_VSID_L;
+		if (psize == 0x10000)
+			senc |= SLB_VSID_LP_01;
+	}
+	return senc;
+}
+
+static inline int is_vrma_hpte(unsigned long hpte_v)
+{
+	return (hpte_v & ~0xffffffUL) ==
+		(HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
index 30a600fa1b6..3a79f532571 100644
--- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h
+++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
@@ -17,6 +17,7 @@
  * there are no exceptions for which we fall through directly to
  * the normal host handler.
  *
+ * 32-bit host
  * Expected inputs (normal exceptions):
  *   SCRATCH0 = saved r10
  *   r10 = thread struct
@@ -33,14 +34,38 @@
  *   *(r8 + GPR9) = saved r9
  *   *(r8 + GPR10) = saved r10 (r10 not yet clobbered)
  *   *(r8 + GPR11) = saved r11
+ *
+ * 64-bit host
+ * Expected inputs (GEN/GDBELL/DBG/MC exception types):
+ *  r10 = saved CR
+ *  r13 = PACA_POINTER
+ *  *(r13 + PACA_EX##type + EX_R10) = saved r10
+ *  *(r13 + PACA_EX##type + EX_R11) = saved r11
+ *  SPRN_SPRG_##type##_SCRATCH = saved r13
+ *
+  * Expected inputs (CRIT exception type):
+ *  r10 = saved CR
+ *  r13 = PACA_POINTER
+ *  *(r13 + PACA_EX##type + EX_R10) = saved r10
+ *  *(r13 + PACA_EX##type + EX_R11) = saved r11
+ *  *(r13 + PACA_EX##type + EX_R13) = saved r13
+ *
+ * Expected inputs (TLB exception type):
+ *  r10 = saved CR
+ *  r13 = PACA_POINTER
+ *  *(r13 + PACA_EX##type + EX_TLB_R10) = saved r10
+ *  *(r13 + PACA_EX##type + EX_TLB_R11) = saved r11
+ *  SPRN_SPRG_GEN_SCRATCH = saved r13
+ *
+ * Only the bolted version of TLB miss exception handlers is supported now.
  */
 .macro DO_KVM intno srr1
 #ifdef CONFIG_KVM_BOOKE_HV
 BEGIN_FTR_SECTION
 	mtocrf	0x80, r11	/* check MSR[GS] without clobbering reg */
-	bf	3, kvmppc_resume_\intno\()_\srr1
+	bf	3, 1975f
 	b	kvmppc_handler_\intno\()_\srr1
-kvmppc_resume_\intno\()_\srr1:
+1975:
 END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
 #endif
 .endm
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 28e8f5e5c63..ca9bf459db6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,7 +46,7 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#if !defined(CONFIG_KVM_440)
 #include <linux/mmu_notifier.h>
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
@@ -204,7 +204,7 @@ struct revmap_entry {
 };
 
 /*
- * We use the top bit of each memslot->rmap entry as a lock bit,
+ * We use the top bit of each memslot->arch.rmap entry as a lock bit,
  * and bit 32 as a present flag.  The bottom 32 bits are the
  * index in the guest HPT of a HPTE that points to the page.
  */
@@ -215,14 +215,17 @@ struct revmap_entry {
 #define KVMPPC_RMAP_PRESENT	0x100000000ul
 #define KVMPPC_RMAP_INDEX	0xfffffffful
 
-/* Low-order bits in kvm->arch.slot_phys[][] */
+/* Low-order bits in memslot->arch.slot_phys[] */
 #define KVMPPC_PAGE_ORDER_MASK	0x1f
 #define KVMPPC_PAGE_NO_CACHE	HPTE_R_I	/* 0x20 */
 #define KVMPPC_PAGE_WRITETHRU	HPTE_R_W	/* 0x40 */
 #define KVMPPC_GOT_PAGE		0x80
 
 struct kvm_arch_memory_slot {
+#ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long *rmap;
+	unsigned long *slot_phys;
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
 struct kvm_arch {
@@ -243,12 +246,12 @@ struct kvm_arch {
 	int using_mmu_notifiers;
 	u32 hpt_order;
 	atomic_t vcpus_running;
+	u32 online_vcores;
 	unsigned long hpt_npte;
 	unsigned long hpt_mask;
+	atomic_t hpte_mod_interest;
 	spinlock_t slot_phys_lock;
-	unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
-	int slot_npages[KVM_MEM_SLOTS_NUM];
-	unsigned short last_vcpu[NR_CPUS];
+	cpumask_t need_tlb_flush;
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 	struct kvmppc_linear_info *hpt_li;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
@@ -273,6 +276,7 @@ struct kvmppc_vcore {
 	int nap_count;
 	int napping_threads;
 	u16 pcpu;
+	u16 last_cpu;
 	u8 vcore_state;
 	u8 in_guest;
 	struct list_head runnable_threads;
@@ -288,9 +292,10 @@ struct kvmppc_vcore {
 
 /* Values for vcore_state */
 #define VCORE_INACTIVE	0
-#define VCORE_RUNNING	1
-#define VCORE_EXITING	2
-#define VCORE_SLEEPING	3
+#define VCORE_SLEEPING	1
+#define VCORE_STARTING	2
+#define VCORE_RUNNING	3
+#define VCORE_EXITING	4
 
 /*
  * Struct used to manage memory for a virtual processor area
@@ -346,6 +351,27 @@ struct kvmppc_slb {
 	bool class	: 1;
 };
 
+# ifdef CONFIG_PPC_FSL_BOOK3E
+#define KVMPPC_BOOKE_IAC_NUM	2
+#define KVMPPC_BOOKE_DAC_NUM	2
+# else
+#define KVMPPC_BOOKE_IAC_NUM	4
+#define KVMPPC_BOOKE_DAC_NUM	2
+# endif
+#define KVMPPC_BOOKE_MAX_IAC	4
+#define KVMPPC_BOOKE_MAX_DAC	2
+
+struct kvmppc_booke_debug_reg {
+	u32 dbcr0;
+	u32 dbcr1;
+	u32 dbcr2;
+#ifdef CONFIG_KVM_E500MC
+	u32 dbcr4;
+#endif
+	u64 iac[KVMPPC_BOOKE_MAX_IAC];
+	u64 dac[KVMPPC_BOOKE_MAX_DAC];
+};
+
 struct kvm_vcpu_arch {
 	ulong host_stack;
 	u32 host_pid;
@@ -380,13 +406,18 @@ struct kvm_vcpu_arch {
 	u32 host_mas4;
 	u32 host_mas6;
 	u32 shadow_epcr;
-	u32 epcr;
 	u32 shadow_msrp;
 	u32 eplc;
 	u32 epsc;
 	u32 oldpir;
 #endif
 
+#if defined(CONFIG_BOOKE)
+#if defined(CONFIG_KVM_BOOKE_HV) || defined(CONFIG_64BIT)
+	u32 epcr;
+#endif
+#endif
+
 #ifdef CONFIG_PPC_BOOK3S
 	/* For Gekko paired singles */
 	u32 qpr[32];
@@ -440,8 +471,6 @@ struct kvm_vcpu_arch {
 
 	u32 ccr0;
 	u32 ccr1;
-	u32 dbcr0;
-	u32 dbcr1;
 	u32 dbsr;
 
 	u64 mmcr[3];
@@ -471,9 +500,12 @@ struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	spinlock_t wdt_lock;
+	struct timer_list wdt_timer;
 	u32 tlbcfg[4];
 	u32 mmucfg;
 	u32 epr;
+	struct kvmppc_booke_debug_reg dbg_reg;
 #endif
 	gpa_t paddr_accessed;
 	gva_t vaddr_accessed;
@@ -486,6 +518,7 @@ struct kvm_vcpu_arch {
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
+	u8 watchdog_enabled;
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
@@ -497,7 +530,6 @@ struct kvm_vcpu_arch {
 	u64 dec_jiffies;
 	u64 dec_expires;
 	unsigned long pending_exceptions;
-	u16 last_cpu;
 	u8 ceded;
 	u8 prodded;
 	u32 last_inst;
@@ -534,13 +566,17 @@ struct kvm_vcpu_arch {
 	unsigned long dtl_index;
 	u64 stolen_logged;
 	struct kvmppc_vpa slb_shadow;
+
+	spinlock_t tbacct_lock;
+	u64 busy_stolen;
+	u64 busy_preempt;
 #endif
 };
 
 /* Values for vcpu->arch.state */
-#define KVMPPC_VCPU_STOPPED		0
-#define KVMPPC_VCPU_BUSY_IN_HOST	1
-#define KVMPPC_VCPU_RUNNABLE		2
+#define KVMPPC_VCPU_NOTREADY		0
+#define KVMPPC_VCPU_RUNNABLE		1
+#define KVMPPC_VCPU_BUSY_IN_HOST	2
 
 /* Values for vcpu->arch.io_gpr */
 #define KVM_MMIO_REG_MASK	0x001f
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 9365860fb7f..2b119654b4c 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -21,7 +21,6 @@
 
 #include <uapi/asm/kvm_para.h>
 
-
 #ifdef CONFIG_KVM_GUEST
 
 #include <linux/of.h>
@@ -55,7 +54,7 @@ static unsigned long kvm_hypercall(unsigned long *in,
 				   unsigned long *out,
 				   unsigned long nr)
 {
-	return HC_EV_UNIMPLEMENTED;
+	return EV_UNIMPLEMENTED;
 }
 
 #endif
@@ -66,7 +65,7 @@ static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
 	unsigned long out[8];
 	unsigned long r;
 
-	r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	r = kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 	*r2 = out[0];
 
 	return r;
@@ -77,7 +76,7 @@ static inline long kvm_hypercall0(unsigned int nr)
 	unsigned long in[8];
 	unsigned long out[8];
 
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
@@ -86,7 +85,7 @@ static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
 	unsigned long out[8];
 
 	in[0] = p1;
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
@@ -97,7 +96,7 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
 
 	in[0] = p1;
 	in[1] = p2;
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
@@ -109,7 +108,7 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
 	in[0] = p1;
 	in[1] = p2;
 	in[2] = p3;
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
@@ -123,7 +122,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 	in[1] = p2;
 	in[2] = p3;
 	in[3] = p4;
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e006f0bdea9..572aa753061 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/kvm_types.h>
 #include <linux/kvm_host.h>
+#include <linux/bug.h>
 #ifdef CONFIG_PPC_BOOK3S
 #include <asm/kvm_book3s.h>
 #else
@@ -68,6 +69,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
+extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
 
@@ -104,6 +107,7 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                          struct kvm_interrupt *irq);
+extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
@@ -111,6 +115,7 @@ extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn,
 				     ulong val);
 extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn,
 				     ulong *val);
+extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_booke_init(void);
 extern void kvmppc_booke_exit(void);
@@ -139,16 +144,28 @@ extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
 extern void kvm_release_hpt(struct kvmppc_linear_info *li);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
+extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+				     struct kvm_memory_slot *dont);
+extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+				      unsigned long npages);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_memory_slot *memslot,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem);
+				struct kvm_userspace_memory_region *mem,
+				struct kvm_memory_slot old);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
 				      struct kvm_ppc_smmu_info *info);
+extern void kvmppc_core_flush_memslot(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot);
 
 extern int kvmppc_bookehv_init(void);
 extern void kvmppc_bookehv_exit(void);
 
+extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
+
+extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
@@ -182,6 +199,41 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
 	return r;
 }
 
+union kvmppc_one_reg {
+	u32	wval;
+	u64	dval;
+	vector128 vval;
+	u64	vsxval[2];
+	struct {
+		u64	addr;
+		u64	length;
+	}	vpaval;
+};
+
+#define one_reg_size(id)	\
+	(1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
+
+#define get_reg_val(id, reg)	({		\
+	union kvmppc_one_reg __u;		\
+	switch (one_reg_size(id)) {		\
+	case 4: __u.wval = (reg); break;	\
+	case 8: __u.dval = (reg); break;	\
+	default: BUG();				\
+	}					\
+	__u;					\
+})
+
+
+#define set_reg_val(id, val)	({		\
+	u64 __v;				\
+	switch (one_reg_size(id)) {		\
+	case 4: __v = (val).wval; break;	\
+	case 8: __v = (val).dval; break;	\
+	default: BUG();				\
+	}					\
+	__v;					\
+})
+
 void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
@@ -190,6 +242,8 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
 int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
@@ -230,5 +284,36 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
 	}
 }
 
+/* Please call after prepare_to_enter. This function puts the lazy ee state
+   back to normal mode, without actually enabling interrupts. */
+static inline void kvmppc_lazy_ee_enable(void)
+{
+#ifdef CONFIG_PPC64
+	/* Only need to enable IRQs by hard enabling them after this */
+	local_paca->irq_happened = 0;
+	local_paca->soft_enabled = 1;
+#endif
+}
+
+static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
+{
+	ulong ea;
+	ulong msr_64bit = 0;
+
+	ea = kvmppc_get_gpr(vcpu, rb);
+	if (ra)
+		ea += kvmppc_get_gpr(vcpu, ra);
+
+#if defined(CONFIG_PPC_BOOK3E_64)
+	msr_64bit = MSR_CM;
+#elif defined(CONFIG_PPC_BOOK3S_64)
+	msr_64bit = MSR_SF;
+#endif
+
+	if (!(vcpu->arch.shared->msr & msr_64bit))
+		ea = (uint32_t)ea;
+
+	return ea;
+}
 
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index eeabcdbc30f..99d43e0c1e4 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -59,7 +59,7 @@
 #define MAS1_TSIZE_SHIFT	7
 #define MAS1_TSIZE(x)		(((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK)
 
-#define MAS2_EPN		0xFFFFF000
+#define MAS2_EPN		(~0xFFFUL)
 #define MAS2_X0			0x00000040
 #define MAS2_X1			0x00000020
 #define MAS2_W			0x00000010
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 9673f73eb8d..2fdb47a19ef 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -121,6 +121,16 @@ extern char initial_stab[];
 #define PP_RXRX 3	/* Supervisor read,       User read */
 #define PP_RXXX	(HPTE_R_PP0 | 2)	/* Supervisor read, user none */
 
+/* Fields for tlbiel instruction in architecture 2.06 */
+#define TLBIEL_INVAL_SEL_MASK	0xc00	/* invalidation selector */
+#define  TLBIEL_INVAL_PAGE	0x000	/* invalidate a single page */
+#define  TLBIEL_INVAL_SET_LPID	0x800	/* invalidate a set for current LPID */
+#define  TLBIEL_INVAL_SET	0xc00	/* invalidate a set for all LPIDs */
+#define TLBIEL_INVAL_SET_MASK	0xfff000	/* set number to inval. */
+#define TLBIEL_INVAL_SET_SHIFT	12
+
+#define POWER7_TLB_SETS		128	/* # sets in POWER7 TLB */
+
 #ifndef __ASSEMBLY__
 
 struct hash_pte {
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d24c1416396..97d37278ea2 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -518,6 +518,7 @@
 #define	  SRR1_WS_DEEPER	0x00020000 /* Some resources not maintained */
 #define	  SRR1_WS_DEEP		0x00010000 /* All resources maintained */
 #define   SRR1_PROGFPE		0x00100000 /* Floating Point Enabled */
+#define   SRR1_PROGILL		0x00080000 /* Illegal instruction */
 #define   SRR1_PROGPRIV		0x00040000 /* Privileged instruction */
 #define   SRR1_PROGTRAP		0x00020000 /* Trap */
 #define   SRR1_PROGADDR		0x00010000 /* SRR0 contains subsequent addr */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4982c..e07e6af5e1f 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@
 #define TCR_FIE		0x00800000	/* FIT Interrupt Enable */
 #define TCR_ARE		0x00400000	/* Auto Reload Enable */
 
+#ifdef CONFIG_E500
+#define TCR_GET_WP(tcr)  ((((tcr) & 0xC0000000) >> 30) | \
+			      (((tcr) & 0x1E0000) >> 15))
+#else
+#define TCR_GET_WP(tcr)  (((tcr) & 0xC0000000) >> 30)
+#endif
+
 /* Bit definitions for the TSR. */
 #define TSR_ENW		0x80000000	/* Enable Next Watchdog */
 #define TSR_WIS		0x40000000	/* WDT Interrupt Status */
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index e807e9d8e3f..5a4e437c238 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -67,6 +67,14 @@ void generic_mach_cpu_die(void);
 void generic_set_cpu_dead(unsigned int cpu);
 void generic_set_cpu_up(unsigned int cpu);
 int generic_check_cpu_restart(unsigned int cpu);
+
+extern void inhibit_secondary_onlining(void);
+extern void uninhibit_secondary_onlining(void);
+
+#else /* HOTPLUG_CPU */
+static inline void inhibit_secondary_onlining(void) {}
+static inline void uninhibit_secondary_onlining(void) {}
+
 #endif
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index a33c3c03bb2..f7bca637074 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -7,6 +7,7 @@ header-y += bootx.h
 header-y += byteorder.h
 header-y += cputable.h
 header-y += elf.h
+header-y += epapr_hcalls.h
 header-y += errno.h
 header-y += fcntl.h
 header-y += ioctl.h
diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
new file mode 100644
index 00000000000..7f9c74b4670
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
@@ -0,0 +1,98 @@
+/*
+ * ePAPR hcall interface
+ *
+ * Copyright 2008-2011 Freescale Semiconductor, Inc.
+ *
+ * Author: Timur Tabi <timur@freescale.com>
+ *
+ * This file is provided under a dual BSD/GPL license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _UAPI_ASM_POWERPC_EPAPR_HCALLS_H
+#define _UAPI_ASM_POWERPC_EPAPR_HCALLS_H
+
+#define EV_BYTE_CHANNEL_SEND		1
+#define EV_BYTE_CHANNEL_RECEIVE		2
+#define EV_BYTE_CHANNEL_POLL		3
+#define EV_INT_SET_CONFIG		4
+#define EV_INT_GET_CONFIG		5
+#define EV_INT_SET_MASK			6
+#define EV_INT_GET_MASK			7
+#define EV_INT_IACK			9
+#define EV_INT_EOI			10
+#define EV_INT_SEND_IPI			11
+#define EV_INT_SET_TASK_PRIORITY	12
+#define EV_INT_GET_TASK_PRIORITY	13
+#define EV_DOORBELL_SEND		14
+#define EV_MSGSND			15
+#define EV_IDLE				16
+
+/* vendor ID: epapr */
+#define EV_LOCAL_VENDOR_ID		0	/* for private use */
+#define EV_EPAPR_VENDOR_ID		1
+#define EV_FSL_VENDOR_ID		2	/* Freescale Semiconductor */
+#define EV_IBM_VENDOR_ID		3	/* IBM */
+#define EV_GHS_VENDOR_ID		4	/* Green Hills Software */
+#define EV_ENEA_VENDOR_ID		5	/* Enea */
+#define EV_WR_VENDOR_ID			6	/* Wind River Systems */
+#define EV_AMCC_VENDOR_ID		7	/* Applied Micro Circuits */
+#define EV_KVM_VENDOR_ID		42	/* KVM */
+
+/* The max number of bytes that a byte channel can send or receive per call */
+#define EV_BYTE_CHANNEL_MAX_BYTES	16
+
+
+#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
+#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
+
+/* epapr return codes */
+#define EV_SUCCESS		0
+#define EV_EPERM		1	/* Operation not permitted */
+#define EV_ENOENT		2	/*  Entry Not Found */
+#define EV_EIO			3	/* I/O error occured */
+#define EV_EAGAIN		4	/* The operation had insufficient
+					 * resources to complete and should be
+					 * retried
+					 */
+#define EV_ENOMEM		5	/* There was insufficient memory to
+					 * complete the operation */
+#define EV_EFAULT		6	/* Bad guest address */
+#define EV_ENODEV		7	/* No such device */
+#define EV_EINVAL		8	/* An argument supplied to the hcall
+					   was out of range or invalid */
+#define EV_INTERNAL		9	/* An internal error occured */
+#define EV_CONFIG		10	/* A configuration error was detected */
+#define EV_INVALID_STATE	11	/* The object is in an invalid state */
+#define EV_UNIMPLEMENTED	12	/* Unimplemented hypercall */
+#define EV_BUFFER_OVERFLOW	13	/* Caller-supplied buffer too small */
+
+#endif /* _UAPI_ASM_POWERPC_EPAPR_HCALLS_H */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 1bea4d8ea6f..2fba8a66fb1 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -221,6 +221,12 @@ struct kvm_sregs {
 
 			__u32 dbsr;	/* KVM_SREGS_E_UPDATE_DBSR */
 			__u32 dbcr[3];
+			/*
+			 * iac/dac registers are 64bit wide, while this API
+			 * interface provides only lower 32 bits on 64 bit
+			 * processors. ONE_REG interface is added for 64bit
+			 * iac/dac registers.
+			 */
 			__u32 iac[4];
 			__u32 dac[2];
 			__u32 dvc[2];
@@ -325,6 +331,86 @@ struct kvm_book3e_206_tlb_params {
 	__u32 reserved[8];
 };
 
+/* For KVM_PPC_GET_HTAB_FD */
+struct kvm_get_htab_fd {
+	__u64	flags;
+	__u64	start_index;
+	__u64	reserved[2];
+};
+
+/* Values for kvm_get_htab_fd.flags */
+#define KVM_GET_HTAB_BOLTED_ONLY	((__u64)0x1)
+#define KVM_GET_HTAB_WRITE		((__u64)0x2)
+
+/*
+ * Data read on the file descriptor is formatted as a series of
+ * records, each consisting of a header followed by a series of
+ * `n_valid' HPTEs (16 bytes each), which are all valid.  Following
+ * those valid HPTEs there are `n_invalid' invalid HPTEs, which
+ * are not represented explicitly in the stream.  The same format
+ * is used for writing.
+ */
+struct kvm_get_htab_header {
+	__u32	index;
+	__u16	n_valid;
+	__u16	n_invalid;
+};
+
 #define KVM_REG_PPC_HIOR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
+#define KVM_REG_PPC_IAC1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
+#define KVM_REG_PPC_IAC2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
+#define KVM_REG_PPC_IAC3	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x4)
+#define KVM_REG_PPC_IAC4	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5)
+#define KVM_REG_PPC_DAC1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6)
+#define KVM_REG_PPC_DAC2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7)
+#define KVM_REG_PPC_DABR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8)
+#define KVM_REG_PPC_DSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9)
+#define KVM_REG_PPC_PURR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa)
+#define KVM_REG_PPC_SPURR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb)
+#define KVM_REG_PPC_DAR		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc)
+#define KVM_REG_PPC_DSISR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd)
+#define KVM_REG_PPC_AMR		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe)
+#define KVM_REG_PPC_UAMOR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf)
+
+#define KVM_REG_PPC_MMCR0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
+#define KVM_REG_PPC_MMCR1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
+#define KVM_REG_PPC_MMCRA	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
+
+#define KVM_REG_PPC_PMC1	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18)
+#define KVM_REG_PPC_PMC2	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19)
+#define KVM_REG_PPC_PMC3	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a)
+#define KVM_REG_PPC_PMC4	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b)
+#define KVM_REG_PPC_PMC5	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c)
+#define KVM_REG_PPC_PMC6	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d)
+#define KVM_REG_PPC_PMC7	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e)
+#define KVM_REG_PPC_PMC8	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f)
+
+/* 32 floating-point registers */
+#define KVM_REG_PPC_FPR0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x20)
+#define KVM_REG_PPC_FPR(n)	(KVM_REG_PPC_FPR0 + (n))
+#define KVM_REG_PPC_FPR31	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3f)
+
+/* 32 VMX/Altivec vector registers */
+#define KVM_REG_PPC_VR0		(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x40)
+#define KVM_REG_PPC_VR(n)	(KVM_REG_PPC_VR0 + (n))
+#define KVM_REG_PPC_VR31	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x5f)
+
+/* 32 double-width FP registers for VSX */
+/* High-order halves overlap with FP regs */
+#define KVM_REG_PPC_VSR0	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x60)
+#define KVM_REG_PPC_VSR(n)	(KVM_REG_PPC_VSR0 + (n))
+#define KVM_REG_PPC_VSR31	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x7f)
+
+/* FP and vector status/control registers */
+#define KVM_REG_PPC_FPSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80)
+#define KVM_REG_PPC_VSCR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81)
+
+/* Virtual processor areas */
+/* For SLB & DTL, address in high (first) half, length in low half */
+#define KVM_REG_PPC_VPA_ADDR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x82)
+#define KVM_REG_PPC_VPA_SLB	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x83)
+#define KVM_REG_PPC_VPA_DTL	(KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
+
+#define KVM_REG_PPC_EPCR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
 
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h
index 5e04383a1db..ed0e0254b47 100644
--- a/arch/powerpc/include/uapi/asm/kvm_para.h
+++ b/arch/powerpc/include/uapi/asm/kvm_para.h
@@ -75,9 +75,10 @@ struct kvm_vcpu_arch_shared {
 };
 
 #define KVM_SC_MAGIC_R0		0x4b564d21 /* "KVM!" */
-#define HC_VENDOR_KVM		(42 << 16)
-#define HC_EV_SUCCESS		0
-#define HC_EV_UNIMPLEMENTED	12
+
+#define KVM_HCALL_TOKEN(num)     _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num)
+
+#include <uapi/asm/epapr_hcalls.h>
 
 #define KVM_FEATURE_MAGIC_PAGE	1
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7523539cfe9..4e23ba2f3ca 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -441,8 +441,7 @@ int main(void)
 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
 	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
-	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
-	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
 	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
@@ -470,7 +469,6 @@ int main(void)
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
 	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
-	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
index 697b390ebfd..62c0dc23782 100644
--- a/arch/powerpc/kernel/epapr_hcalls.S
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -8,13 +8,41 @@
  */
 
 #include <linux/threads.h>
+#include <asm/epapr_hcalls.h>
 #include <asm/reg.h>
 #include <asm/page.h>
 #include <asm/cputable.h>
 #include <asm/thread_info.h>
 #include <asm/ppc_asm.h>
+#include <asm/asm-compat.h>
 #include <asm/asm-offsets.h>
 
+/* epapr_ev_idle() was derived from e500_idle() */
+_GLOBAL(epapr_ev_idle)
+	CURRENT_THREAD_INFO(r3, r1)
+	PPC_LL	r4, TI_LOCAL_FLAGS(r3)	/* set napping bit */
+	ori	r4, r4,_TLF_NAPPING	/* so when we take an exception */
+	PPC_STL	r4, TI_LOCAL_FLAGS(r3)	/* it will return to our caller */
+
+	wrteei	1
+
+idle_loop:
+	LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
+
+.global epapr_ev_idle_start
+epapr_ev_idle_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+
+	/*
+	 * Guard against spurious wakeups from a hypervisor --
+	 * only interrupt will cause us to return to LR due to
+	 * _TLF_NAPPING.
+	 */
+	b	idle_loop
+
 /* Hypercall entry point. Will be patched with device tree instructions. */
 .global epapr_hypercall_start
 epapr_hypercall_start:
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
index 028aeae370b..f3eab8594d9 100644
--- a/arch/powerpc/kernel/epapr_paravirt.c
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -21,6 +21,10 @@
 #include <asm/epapr_hcalls.h>
 #include <asm/cacheflush.h>
 #include <asm/code-patching.h>
+#include <asm/machdep.h>
+
+extern void epapr_ev_idle(void);
+extern u32 epapr_ev_idle_start[];
 
 bool epapr_paravirt_enabled;
 
@@ -41,8 +45,13 @@ static int __init epapr_paravirt_init(void)
 	if (len % 4 || len > (4 * 4))
 		return -ENODEV;
 
-	for (i = 0; i < (len / 4); i++)
+	for (i = 0; i < (len / 4); i++) {
 		patch_instruction(epapr_hypercall_start + i, insts[i]);
+		patch_instruction(epapr_ev_idle_start + i, insts[i]);
+	}
+
+	if (of_get_property(hyper_node, "has-idle", NULL))
+		ppc_md.power_save = epapr_ev_idle;
 
 	epapr_paravirt_enabled = true;
 
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 867db1de894..a61b133c4f9 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -419,7 +419,7 @@ static void kvm_map_magic_page(void *data)
 	in[0] = KVM_MAGIC_PAGE;
 	in[1] = KVM_MAGIC_PAGE;
 
-	kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE);
+	kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
 
 	*features = out[0];
 }
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 19e4288d848..78b8766fd79 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -43,6 +43,7 @@
 #include <asm/dcr.h>
 #include <asm/ftrace.h>
 #include <asm/switch_to.h>
+#include <asm/epapr_hcalls.h>
 
 #ifdef CONFIG_PPC32
 extern void transfer_to_handler(void);
@@ -191,3 +192,7 @@ EXPORT_SYMBOL(__arch_hweight64);
 #ifdef CONFIG_PPC_BOOK3S_64
 EXPORT_SYMBOL_GPL(mmu_psize_defs);
 #endif
+
+#ifdef CONFIG_EPAPR_PARAVIRT
+EXPORT_SYMBOL(epapr_hypercall_start);
+#endif
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2b952b5386f..e5b133ebd8a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -427,6 +427,45 @@ int generic_check_cpu_restart(unsigned int cpu)
 {
 	return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
 }
+
+static atomic_t secondary_inhibit_count;
+
+/*
+ * Don't allow secondary CPU threads to come online
+ */
+void inhibit_secondary_onlining(void)
+{
+	/*
+	 * This makes secondary_inhibit_count stable during cpu
+	 * online/offline operations.
+	 */
+	get_online_cpus();
+
+	atomic_inc(&secondary_inhibit_count);
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(inhibit_secondary_onlining);
+
+/*
+ * Allow secondary CPU threads to come online again
+ */
+void uninhibit_secondary_onlining(void)
+{
+	get_online_cpus();
+	atomic_dec(&secondary_inhibit_count);
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(uninhibit_secondary_onlining);
+
+static int secondaries_inhibited(void)
+{
+	return atomic_read(&secondary_inhibit_count);
+}
+
+#else /* HOTPLUG_CPU */
+
+#define secondaries_inhibited()		0
+
 #endif
 
 static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
@@ -445,6 +484,13 @@ int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int rc, c;
 
+	/*
+	 * Don't allow secondary threads to come online if inhibited
+	 */
+	if (threads_per_core > 1 && secondaries_inhibited() &&
+	    cpu % threads_per_core != 0)
+		return -EBUSY;
+
 	if (smp_ops == NULL ||
 	    (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu)))
 		return -EINVAL;
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 50e7dbc7356..3d7fd21c65f 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -83,6 +83,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 		vcpu_44x->shadow_refs[i].gtlb_index = -1;
 
 	vcpu->arch.cpu_type = KVM_CPU_440;
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index c8c61578fdf..35ec0a8547d 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -27,12 +27,70 @@
 #include "booke.h"
 #include "44x_tlb.h"
 
+#define XOP_MFDCRX  259
 #define XOP_MFDCR   323
+#define XOP_MTDCRX  387
 #define XOP_MTDCR   451
 #define XOP_TLBSX   914
 #define XOP_ICCCI   966
 #define XOP_TLBWE   978
 
+static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn)
+{
+	/* emulate some access in kernel */
+	switch (dcrn) {
+	case DCRN_CPR0_CONFIG_ADDR:
+		vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
+		return EMULATE_DONE;
+	default:
+		vcpu->run->dcr.dcrn = dcrn;
+		vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs);
+		vcpu->run->dcr.is_write = 1;
+		vcpu->arch.dcr_is_write = 1;
+		vcpu->arch.dcr_needed = 1;
+		kvmppc_account_exit(vcpu, DCR_EXITS);
+		return EMULATE_DO_DCR;
+	}
+}
+
+static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn)
+{
+	/* The guest may access CPR0 registers to determine the timebase
+	 * frequency, and it must know the real host frequency because it
+	 * can directly access the timebase registers.
+	 *
+	 * It would be possible to emulate those accesses in userspace,
+	 * but userspace can really only figure out the end frequency.
+	 * We could decompose that into the factors that compute it, but
+	 * that's tricky math, and it's easier to just report the real
+	 * CPR0 values.
+	 */
+	switch (dcrn) {
+	case DCRN_CPR0_CONFIG_ADDR:
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
+		break;
+	case DCRN_CPR0_CONFIG_DATA:
+		local_irq_disable();
+		mtdcr(DCRN_CPR0_CONFIG_ADDR,
+			  vcpu->arch.cpr0_cfgaddr);
+		kvmppc_set_gpr(vcpu, rt,
+			       mfdcr(DCRN_CPR0_CONFIG_DATA));
+		local_irq_enable();
+		break;
+	default:
+		vcpu->run->dcr.dcrn = dcrn;
+		vcpu->run->dcr.data =  0;
+		vcpu->run->dcr.is_write = 0;
+		vcpu->arch.dcr_is_write = 0;
+		vcpu->arch.io_gpr = rt;
+		vcpu->arch.dcr_needed = 1;
+		kvmppc_account_exit(vcpu, DCR_EXITS);
+		return EMULATE_DO_DCR;
+	}
+
+	return EMULATE_DONE;
+}
+
 int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
 {
@@ -50,55 +108,21 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		switch (get_xop(inst)) {
 
 		case XOP_MFDCR:
-			/* The guest may access CPR0 registers to determine the timebase
-			 * frequency, and it must know the real host frequency because it
-			 * can directly access the timebase registers.
-			 *
-			 * It would be possible to emulate those accesses in userspace,
-			 * but userspace can really only figure out the end frequency.
-			 * We could decompose that into the factors that compute it, but
-			 * that's tricky math, and it's easier to just report the real
-			 * CPR0 values.
-			 */
-			switch (dcrn) {
-			case DCRN_CPR0_CONFIG_ADDR:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
-				break;
-			case DCRN_CPR0_CONFIG_DATA:
-				local_irq_disable();
-				mtdcr(DCRN_CPR0_CONFIG_ADDR,
-					  vcpu->arch.cpr0_cfgaddr);
-				kvmppc_set_gpr(vcpu, rt,
-					       mfdcr(DCRN_CPR0_CONFIG_DATA));
-				local_irq_enable();
-				break;
-			default:
-				run->dcr.dcrn = dcrn;
-				run->dcr.data =  0;
-				run->dcr.is_write = 0;
-				vcpu->arch.io_gpr = rt;
-				vcpu->arch.dcr_needed = 1;
-				kvmppc_account_exit(vcpu, DCR_EXITS);
-				emulated = EMULATE_DO_DCR;
-			}
+			emulated = emulate_mfdcr(vcpu, rt, dcrn);
+			break;
 
+		case XOP_MFDCRX:
+			emulated = emulate_mfdcr(vcpu, rt,
+					kvmppc_get_gpr(vcpu, ra));
 			break;
 
 		case XOP_MTDCR:
-			/* emulate some access in kernel */
-			switch (dcrn) {
-			case DCRN_CPR0_CONFIG_ADDR:
-				vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
-				break;
-			default:
-				run->dcr.dcrn = dcrn;
-				run->dcr.data = kvmppc_get_gpr(vcpu, rs);
-				run->dcr.is_write = 1;
-				vcpu->arch.dcr_needed = 1;
-				kvmppc_account_exit(vcpu, DCR_EXITS);
-				emulated = EMULATE_DO_DCR;
-			}
+			emulated = emulate_mtdcr(vcpu, rs, dcrn);
+			break;
 
+		case XOP_MTDCRX:
+			emulated = emulate_mtdcr(vcpu, rs,
+					kvmppc_get_gpr(vcpu, ra));
 			break;
 
 		case XOP_TLBWE:
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index f4dacb9c57f..4730c953f43 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
 	bool
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
+	select HAVE_KVM_EVENTFD
 
 config KVM_BOOK3S_HANDLER
 	bool
@@ -36,6 +37,7 @@ config KVM_BOOK3S_64_HANDLER
 config KVM_BOOK3S_PR
 	bool
 	select KVM_MMIO
+	select MMU_NOTIFIER
 
 config KVM_BOOK3S_32
 	tristate "KVM support for PowerPC book3s_32 processors"
@@ -123,6 +125,7 @@ config KVM_E500V2
 	depends on EXPERIMENTAL && E500 && !PPC_E500MC
 	select KVM
 	select KVM_MMIO
+	select MMU_NOTIFIER
 	---help---
 	  Support running unmodified E500 guest kernels in virtual machines on
 	  E500v2 host processors.
@@ -138,6 +141,7 @@ config KVM_E500MC
 	select KVM
 	select KVM_MMIO
 	select KVM_BOOKE_HV
+	select MMU_NOTIFIER
 	---help---
 	  Support running unmodified E500MC/E5500 (32-bit) guest kernels in
 	  virtual machines on E500MC/E5500 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index c2a08636e6d..1e473d46322 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -6,7 +6,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 
-common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
+						eventfd.o)
 
 CFLAGS_44x_tlb.o  := -I.
 CFLAGS_e500_tlb.o := -I.
@@ -72,10 +73,12 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_rmhandlers.o \
 	book3s_hv_rm_mmu.o \
 	book3s_64_vio_hv.o \
+	book3s_hv_ras.o \
 	book3s_hv_builtin.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
+	../../../virt/kvm/eventfd.o \
 	powerpc.o \
 	emulate.o \
 	book3s.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 3f2a8360c85..a4b64528524 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
@@ -476,6 +485,122 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return -ENOTSUPP;
 }
 
+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r;
+	union kvmppc_one_reg val;
+	int size;
+	long int i;
+
+	size = one_reg_size(reg->id);
+	if (size > sizeof(val))
+		return -EINVAL;
+
+	r = kvmppc_get_one_reg(vcpu, reg->id, &val);
+
+	if (r == -EINVAL) {
+		r = 0;
+		switch (reg->id) {
+		case KVM_REG_PPC_DAR:
+			val = get_reg_val(reg->id, vcpu->arch.shared->dar);
+			break;
+		case KVM_REG_PPC_DSISR:
+			val = get_reg_val(reg->id, vcpu->arch.shared->dsisr);
+			break;
+		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+			i = reg->id - KVM_REG_PPC_FPR0;
+			val = get_reg_val(reg->id, vcpu->arch.fpr[i]);
+			break;
+		case KVM_REG_PPC_FPSCR:
+			val = get_reg_val(reg->id, vcpu->arch.fpscr);
+			break;
+#ifdef CONFIG_ALTIVEC
+		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0];
+			break;
+		case KVM_REG_PPC_VSCR:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
+			break;
+#endif /* CONFIG_ALTIVEC */
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+	if (r)
+		return r;
+
+	if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
+		r = -EFAULT;
+
+	return r;
+}
+
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r;
+	union kvmppc_one_reg val;
+	int size;
+	long int i;
+
+	size = one_reg_size(reg->id);
+	if (size > sizeof(val))
+		return -EINVAL;
+
+	if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
+		return -EFAULT;
+
+	r = kvmppc_set_one_reg(vcpu, reg->id, &val);
+
+	if (r == -EINVAL) {
+		r = 0;
+		switch (reg->id) {
+		case KVM_REG_PPC_DAR:
+			vcpu->arch.shared->dar = set_reg_val(reg->id, val);
+			break;
+		case KVM_REG_PPC_DSISR:
+			vcpu->arch.shared->dsisr = set_reg_val(reg->id, val);
+			break;
+		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+			i = reg->id - KVM_REG_PPC_FPR0;
+			vcpu->arch.fpr[i] = set_reg_val(reg->id, val);
+			break;
+		case KVM_REG_PPC_FPSCR:
+			vcpu->arch.fpscr = set_reg_val(reg->id, val);
+			break;
+#ifdef CONFIG_ALTIVEC
+		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
+			break;
+		case KVM_REG_PPC_VSCR:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
+			break;
+#endif /* CONFIG_ALTIVEC */
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	return r;
+}
+
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
                                   struct kvm_translation *tr)
 {
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index b0f625a3334..00e619bf608 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -155,7 +155,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 
 	/* Get host physical address for gpa */
 	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
-	if (is_error_pfn(hpaddr)) {
+	if (is_error_noslot_pfn(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
 				 orig_pte->eaddr);
 		r = -EINVAL;
@@ -254,6 +254,7 @@ next_pteg:
 
 	kvmppc_mmu_hpte_cache_map(vcpu, pte);
 
+	kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
 out:
 	return r;
 }
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 4d72f9ebc55..ead58e31729 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -93,7 +93,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 
 	/* Get host physical address for gpa */
 	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
-	if (is_error_pfn(hpaddr)) {
+	if (is_error_noslot_pfn(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
 		r = -EINVAL;
 		goto out;
@@ -171,6 +171,7 @@ map_again:
 
 		kvmppc_mmu_hpte_cache_map(vcpu, pte);
 	}
+	kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
 
 out:
 	return r;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d95d11322a1..8cc18abd6dd 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -24,6 +24,9 @@
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
+#include <linux/srcu.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -40,6 +43,11 @@
 /* Power architecture requires HPT is at least 256kB */
 #define PPC_MIN_HPT_ORDER	18
 
+static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+				long pte_index, unsigned long pteh,
+				unsigned long ptel, unsigned long *pte_idx_ret);
+static void kvmppc_rmap_reset(struct kvm *kvm);
+
 long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
 	unsigned long hpt;
@@ -137,10 +145,11 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 		/* Set the entire HPT to 0, i.e. invalid HPTEs */
 		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
 		/*
-		 * Set the whole last_vcpu array to an invalid vcpu number.
-		 * This ensures that each vcpu will flush its TLB on next entry.
+		 * Reset all the reverse-mapping chains for all memslots
 		 */
-		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+		kvmppc_rmap_reset(kvm);
+		/* Ensure that each vcpu will flush its TLB on next entry. */
+		cpumask_setall(&kvm->arch.need_tlb_flush);
 		*htab_orderp = order;
 		err = 0;
 	} else {
@@ -184,6 +193,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	unsigned long addr, hash;
 	unsigned long psize;
 	unsigned long hp0, hp1;
+	unsigned long idx_ret;
 	long ret;
 	struct kvm *kvm = vcpu->kvm;
 
@@ -215,7 +225,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 		hash = (hash << 3) + 7;
 		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
 		hp_r = hp1 | addr;
-		ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
+		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
+						 &idx_ret);
 		if (ret != H_SUCCESS) {
 			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
 			       addr, ret);
@@ -260,7 +271,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 
 /*
  * This is called to get a reference to a guest page if there isn't
- * one already in the kvm->arch.slot_phys[][] arrays.
+ * one already in the memslot->arch.slot_phys[] array.
  */
 static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
 				  struct kvm_memory_slot *memslot,
@@ -275,7 +286,7 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
 	struct vm_area_struct *vma;
 	unsigned long pfn, i, npages;
 
-	physp = kvm->arch.slot_phys[memslot->id];
+	physp = memslot->arch.slot_phys;
 	if (!physp)
 		return -EINVAL;
 	if (physp[gfn - memslot->base_gfn])
@@ -353,15 +364,10 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
 	return err;
 }
 
-/*
- * We come here on a H_ENTER call from the guest when we are not
- * using mmu notifiers and we don't have the requested page pinned
- * already.
- */
-long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
-			long pte_index, unsigned long pteh, unsigned long ptel)
+long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+				long pte_index, unsigned long pteh,
+				unsigned long ptel, unsigned long *pte_idx_ret)
 {
-	struct kvm *kvm = vcpu->kvm;
 	unsigned long psize, gpa, gfn;
 	struct kvm_memory_slot *memslot;
 	long ret;
@@ -389,8 +395,8 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
  do_insert:
 	/* Protect linux PTE lookup from page table destruction */
 	rcu_read_lock_sched();	/* this disables preemption too */
-	vcpu->arch.pgdir = current->mm->pgd;
-	ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
+	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
+				current->mm->pgd, false, pte_idx_ret);
 	rcu_read_unlock_sched();
 	if (ret == H_TOO_HARD) {
 		/* this can't happen */
@@ -401,6 +407,19 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 
 }
 
+/*
+ * We come here on a H_ENTER call from the guest when we are not
+ * using mmu notifiers and we don't have the requested page pinned
+ * already.
+ */
+long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+			     long pte_index, unsigned long pteh,
+			     unsigned long ptel)
+{
+	return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index,
+					  pteh, ptel, &vcpu->arch.gpr[4]);
+}
+
 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
 							 gva_t eaddr)
 {
@@ -570,7 +589,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hptep, hpte[3], r;
 	unsigned long mmu_seq, psize, pte_size;
-	unsigned long gfn, hva, pfn;
+	unsigned long gpa, gfn, hva, pfn;
 	struct kvm_memory_slot *memslot;
 	unsigned long *rmap;
 	struct revmap_entry *rev;
@@ -608,15 +627,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	/* Translate the logical address and get the page */
 	psize = hpte_page_size(hpte[0], r);
-	gfn = hpte_rpn(r, psize);
+	gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
+	gfn = gpa >> PAGE_SHIFT;
 	memslot = gfn_to_memslot(kvm, gfn);
 
 	/* No memslot means it's an emulated MMIO region */
-	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
-		unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
 					      dsisr & DSISR_ISSTORE);
-	}
 
 	if (!kvm->arch.using_mmu_notifiers)
 		return -EFAULT;		/* should never get here */
@@ -710,7 +728,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	/* Check if we might have been invalidated; let the guest retry if so */
 	ret = RESUME_GUEST;
-	if (mmu_notifier_retry(vcpu, mmu_seq)) {
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
 		unlock_rmap(rmap);
 		goto out_unlock;
 	}
@@ -756,6 +774,25 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	goto out_put;
 }
 
+static void kvmppc_rmap_reset(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm->memslots;
+	kvm_for_each_memslot(memslot, slots) {
+		/*
+		 * This assumes it is acceptable to lose reference and
+		 * change bits across a reset.
+		 */
+		memset(memslot->arch.rmap, 0,
+		       memslot->npages * sizeof(*memslot->arch.rmap));
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
 static int kvm_handle_hva_range(struct kvm *kvm,
 				unsigned long start,
 				unsigned long end,
@@ -850,7 +887,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		psize = hpte_page_size(hptep[0], ptel);
 		if ((hptep[0] & HPTE_V_VALID) &&
 		    hpte_rpn(ptel, psize) == gfn) {
-			hptep[0] |= HPTE_V_ABSENT;
+			if (kvm->arch.using_mmu_notifiers)
+				hptep[0] |= HPTE_V_ABSENT;
 			kvmppc_invalidate_hpte(kvm, hptep, i);
 			/* Harvest R and C */
 			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
@@ -877,6 +915,28 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
 	return 0;
 }
 
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+	unsigned long *rmapp;
+	unsigned long gfn;
+	unsigned long n;
+
+	rmapp = memslot->arch.rmap;
+	gfn = memslot->base_gfn;
+	for (n = memslot->npages; n; --n) {
+		/*
+		 * Testing the present bit without locking is OK because
+		 * the memslot has been marked invalid already, and hence
+		 * no new HPTEs referencing this page can be created,
+		 * thus the present bit can't go from 0 to 1.
+		 */
+		if (*rmapp & KVMPPC_RMAP_PRESENT)
+			kvm_unmap_rmapp(kvm, rmapp, gfn);
+		++rmapp;
+		++gfn;
+	}
+}
+
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long gfn)
 {
@@ -1030,16 +1090,16 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
 	return ret;
 }
 
-long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			     unsigned long *map)
 {
 	unsigned long i;
-	unsigned long *rmapp, *map;
+	unsigned long *rmapp;
 
 	preempt_disable();
 	rmapp = memslot->arch.rmap;
-	map = memslot->dirty_bitmap;
 	for (i = 0; i < memslot->npages; ++i) {
-		if (kvm_test_clear_dirty(kvm, rmapp))
+		if (kvm_test_clear_dirty(kvm, rmapp) && map)
 			__set_bit_le(i, map);
 		++rmapp;
 	}
@@ -1057,20 +1117,22 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 	unsigned long hva, psize, offset;
 	unsigned long pa;
 	unsigned long *physp;
+	int srcu_idx;
 
+	srcu_idx = srcu_read_lock(&kvm->srcu);
 	memslot = gfn_to_memslot(kvm, gfn);
 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
-		return NULL;
+		goto err;
 	if (!kvm->arch.using_mmu_notifiers) {
-		physp = kvm->arch.slot_phys[memslot->id];
+		physp = memslot->arch.slot_phys;
 		if (!physp)
-			return NULL;
+			goto err;
 		physp += gfn - memslot->base_gfn;
 		pa = *physp;
 		if (!pa) {
 			if (kvmppc_get_guest_page(kvm, gfn, memslot,
 						  PAGE_SIZE) < 0)
-				return NULL;
+				goto err;
 			pa = *physp;
 		}
 		page = pfn_to_page(pa >> PAGE_SHIFT);
@@ -1079,9 +1141,11 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 		hva = gfn_to_hva_memslot(memslot, gfn);
 		npages = get_user_pages_fast(hva, 1, 1, pages);
 		if (npages < 1)
-			return NULL;
+			goto err;
 		page = pages[0];
 	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
 	psize = PAGE_SIZE;
 	if (PageHuge(page)) {
 		page = compound_head(page);
@@ -1091,6 +1155,10 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 	if (nb_ret)
 		*nb_ret = psize - offset;
 	return page_address(page) + offset;
+
+ err:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return NULL;
 }
 
 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
@@ -1100,6 +1168,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
 	put_page(page);
 }
 
+/*
+ * Functions for reading and writing the hash table via reads and
+ * writes on a file descriptor.
+ *
+ * Reads return the guest view of the hash table, which has to be
+ * pieced together from the real hash table and the guest_rpte
+ * values in the revmap array.
+ *
+ * On writes, each HPTE written is considered in turn, and if it
+ * is valid, it is written to the HPT as if an H_ENTER with the
+ * exact flag set was done.  When the invalid count is non-zero
+ * in the header written to the stream, the kernel will make
+ * sure that that many HPTEs are invalid, and invalidate them
+ * if not.
+ */
+
+struct kvm_htab_ctx {
+	unsigned long	index;
+	unsigned long	flags;
+	struct kvm	*kvm;
+	int		first_pass;
+};
+
+#define HPTE_SIZE	(2 * sizeof(unsigned long))
+
+static long record_hpte(unsigned long flags, unsigned long *hptp,
+			unsigned long *hpte, struct revmap_entry *revp,
+			int want_valid, int first_pass)
+{
+	unsigned long v, r;
+	int ok = 1;
+	int valid, dirty;
+
+	/* Unmodified entries are uninteresting except on the first pass */
+	dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+	if (!first_pass && !dirty)
+		return 0;
+
+	valid = 0;
+	if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+		valid = 1;
+		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
+		    !(hptp[0] & HPTE_V_BOLTED))
+			valid = 0;
+	}
+	if (valid != want_valid)
+		return 0;
+
+	v = r = 0;
+	if (valid || dirty) {
+		/* lock the HPTE so it's stable and read it */
+		preempt_disable();
+		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+			cpu_relax();
+		v = hptp[0];
+		if (v & HPTE_V_ABSENT) {
+			v &= ~HPTE_V_ABSENT;
+			v |= HPTE_V_VALID;
+		}
+		/* re-evaluate valid and dirty from synchronized HPTE value */
+		valid = !!(v & HPTE_V_VALID);
+		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
+			valid = 0;
+		r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
+		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+		/* only clear modified if this is the right sort of entry */
+		if (valid == want_valid && dirty) {
+			r &= ~HPTE_GR_MODIFIED;
+			revp->guest_rpte = r;
+		}
+		asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
+		hptp[0] &= ~HPTE_V_HVLOCK;
+		preempt_enable();
+		if (!(valid == want_valid && (first_pass || dirty)))
+			ok = 0;
+	}
+	hpte[0] = v;
+	hpte[1] = r;
+	return ok;
+}
+
+static ssize_t kvm_htab_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct kvm_htab_ctx *ctx = file->private_data;
+	struct kvm *kvm = ctx->kvm;
+	struct kvm_get_htab_header hdr;
+	unsigned long *hptp;
+	struct revmap_entry *revp;
+	unsigned long i, nb, nw;
+	unsigned long __user *lbuf;
+	struct kvm_get_htab_header __user *hptr;
+	unsigned long flags;
+	int first_pass;
+	unsigned long hpte[2];
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	first_pass = ctx->first_pass;
+	flags = ctx->flags;
+
+	i = ctx->index;
+	hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+	revp = kvm->arch.revmap + i;
+	lbuf = (unsigned long __user *)buf;
+
+	nb = 0;
+	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
+		/* Initialize header */
+		hptr = (struct kvm_get_htab_header __user *)buf;
+		hdr.n_valid = 0;
+		hdr.n_invalid = 0;
+		nw = nb;
+		nb += sizeof(hdr);
+		lbuf = (unsigned long __user *)(buf + sizeof(hdr));
+
+		/* Skip uninteresting entries, i.e. clean on not-first pass */
+		if (!first_pass) {
+			while (i < kvm->arch.hpt_npte &&
+			       !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
+				++i;
+				hptp += 2;
+				++revp;
+			}
+		}
+		hdr.index = i;
+
+		/* Grab a series of valid entries */
+		while (i < kvm->arch.hpt_npte &&
+		       hdr.n_valid < 0xffff &&
+		       nb + HPTE_SIZE < count &&
+		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
+			/* valid entry, write it out */
+			++hdr.n_valid;
+			if (__put_user(hpte[0], lbuf) ||
+			    __put_user(hpte[1], lbuf + 1))
+				return -EFAULT;
+			nb += HPTE_SIZE;
+			lbuf += 2;
+			++i;
+			hptp += 2;
+			++revp;
+		}
+		/* Now skip invalid entries while we can */
+		while (i < kvm->arch.hpt_npte &&
+		       hdr.n_invalid < 0xffff &&
+		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
+			/* found an invalid entry */
+			++hdr.n_invalid;
+			++i;
+			hptp += 2;
+			++revp;
+		}
+
+		if (hdr.n_valid || hdr.n_invalid) {
+			/* write back the header */
+			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
+				return -EFAULT;
+			nw = nb;
+			buf = (char __user *)lbuf;
+		} else {
+			nb = nw;
+		}
+
+		/* Check if we've wrapped around the hash table */
+		if (i >= kvm->arch.hpt_npte) {
+			i = 0;
+			ctx->first_pass = 0;
+			break;
+		}
+	}
+
+	ctx->index = i;
+
+	return nb;
+}
+
+static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct kvm_htab_ctx *ctx = file->private_data;
+	struct kvm *kvm = ctx->kvm;
+	struct kvm_get_htab_header hdr;
+	unsigned long i, j;
+	unsigned long v, r;
+	unsigned long __user *lbuf;
+	unsigned long *hptp;
+	unsigned long tmp[2];
+	ssize_t nb;
+	long int err, ret;
+	int rma_setup;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	/* lock out vcpus from running while we're doing this */
+	mutex_lock(&kvm->lock);
+	rma_setup = kvm->arch.rma_setup_done;
+	if (rma_setup) {
+		kvm->arch.rma_setup_done = 0;	/* temporarily */
+		/* order rma_setup_done vs. vcpus_running */
+		smp_mb();
+		if (atomic_read(&kvm->arch.vcpus_running)) {
+			kvm->arch.rma_setup_done = 1;
+			mutex_unlock(&kvm->lock);
+			return -EBUSY;
+		}
+	}
+
+	err = 0;
+	for (nb = 0; nb + sizeof(hdr) <= count; ) {
+		err = -EFAULT;
+		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
+			break;
+
+		err = 0;
+		if (nb + hdr.n_valid * HPTE_SIZE > count)
+			break;
+
+		nb += sizeof(hdr);
+		buf += sizeof(hdr);
+
+		err = -EINVAL;
+		i = hdr.index;
+		if (i >= kvm->arch.hpt_npte ||
+		    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
+			break;
+
+		hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+		lbuf = (unsigned long __user *)buf;
+		for (j = 0; j < hdr.n_valid; ++j) {
+			err = -EFAULT;
+			if (__get_user(v, lbuf) || __get_user(r, lbuf + 1))
+				goto out;
+			err = -EINVAL;
+			if (!(v & HPTE_V_VALID))
+				goto out;
+			lbuf += 2;
+			nb += HPTE_SIZE;
+
+			if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
+				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+			err = -EIO;
+			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
+							 tmp);
+			if (ret != H_SUCCESS) {
+				pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
+				       "r=%lx\n", ret, i, v, r);
+				goto out;
+			}
+			if (!rma_setup && is_vrma_hpte(v)) {
+				unsigned long psize = hpte_page_size(v, r);
+				unsigned long senc = slb_pgsize_encoding(psize);
+				unsigned long lpcr;
+
+				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+					(VRMA_VSID << SLB_VSID_SHIFT_1T);
+				lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
+				lpcr |= senc << (LPCR_VRMASD_SH - 4);
+				kvm->arch.lpcr = lpcr;
+				rma_setup = 1;
+			}
+			++i;
+			hptp += 2;
+		}
+
+		for (j = 0; j < hdr.n_invalid; ++j) {
+			if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
+				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+			++i;
+			hptp += 2;
+		}
+		err = 0;
+	}
+
+ out:
+	/* Order HPTE updates vs. rma_setup_done */
+	smp_wmb();
+	kvm->arch.rma_setup_done = rma_setup;
+	mutex_unlock(&kvm->lock);
+
+	if (err)
+		return err;
+	return nb;
+}
+
+static int kvm_htab_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_htab_ctx *ctx = filp->private_data;
+
+	filp->private_data = NULL;
+	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
+		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
+	kvm_put_kvm(ctx->kvm);
+	kfree(ctx);
+	return 0;
+}
+
+static struct file_operations kvm_htab_fops = {
+	.read		= kvm_htab_read,
+	.write		= kvm_htab_write,
+	.llseek		= default_llseek,
+	.release	= kvm_htab_release,
+};
+
+int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
+{
+	int ret;
+	struct kvm_htab_ctx *ctx;
+	int rwflag;
+
+	/* reject flags we don't recognize */
+	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
+		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	kvm_get_kvm(kvm);
+	ctx->kvm = kvm;
+	ctx->index = ghf->start_index;
+	ctx->flags = ghf->flags;
+	ctx->first_pass = 1;
+
+	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
+	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
+	if (ret < 0) {
+		kvm_put_kvm(kvm);
+		return ret;
+	}
+
+	if (rwflag == O_RDONLY) {
+		mutex_lock(&kvm->slots_lock);
+		atomic_inc(&kvm->arch.hpte_mod_interest);
+		/* make sure kvmppc_do_h_enter etc. see the increment */
+		synchronize_srcu_expedited(&kvm->srcu);
+		mutex_unlock(&kvm->slots_lock);
+	}
+
+	return ret;
+}
+
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index b9a989dc76c..d31a716f7f2 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -22,6 +22,7 @@
 #include <asm/kvm_book3s.h>
 #include <asm/reg.h>
 #include <asm/switch_to.h>
+#include <asm/time.h>
 
 #define OP_19_XOP_RFID		18
 #define OP_19_XOP_RFI		50
@@ -395,6 +396,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		    (mfmsr() & MSR_HV))
 			vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
 		break;
+	case SPRN_PURR:
+		to_book3s(vcpu)->purr_offset = spr_val - get_tb();
+		break;
+	case SPRN_SPURR:
+		to_book3s(vcpu)->spurr_offset = spr_val - get_tb();
+		break;
 	case SPRN_GQR0:
 	case SPRN_GQR1:
 	case SPRN_GQR2:
@@ -412,6 +419,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 	case SPRN_CTRLF:
 	case SPRN_CTRLT:
 	case SPRN_L2CR:
+	case SPRN_DSCR:
 	case SPRN_MMCR0_GEKKO:
 	case SPRN_MMCR1_GEKKO:
 	case SPRN_PMC1_GEKKO:
@@ -483,9 +491,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 		*spr_val = to_book3s(vcpu)->hid[5];
 		break;
 	case SPRN_CFAR:
-	case SPRN_PURR:
+	case SPRN_DSCR:
 		*spr_val = 0;
 		break;
+	case SPRN_PURR:
+		*spr_val = get_tb() + to_book3s(vcpu)->purr_offset;
+		break;
+	case SPRN_SPURR:
+		*spr_val = get_tb() + to_book3s(vcpu)->purr_offset;
+		break;
 	case SPRN_GQR0:
 	case SPRN_GQR1:
 	case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index a150817d6d4..7057a02f090 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -28,8 +28,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
 #endif
-#ifdef CONFIG_VSX
-EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
-#endif
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 721d4603a23..71d0c90b62b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -30,6 +30,7 @@
 #include <linux/cpumask.h>
 #include <linux/spinlock.h>
 #include <linux/page-flags.h>
+#include <linux/srcu.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -46,6 +47,7 @@
 #include <asm/page.h>
 #include <asm/hvcall.h>
 #include <asm/switch_to.h>
+#include <asm/smp.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -55,25 +57,77 @@
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
 
+/* Used to indicate that a guest page fault needs to be handled */
+#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
+
+/* Used as a "null" value for timebase values */
+#define TB_NIL	(~(u64)0)
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+/*
+ * We use the vcpu_load/put functions to measure stolen time.
+ * Stolen time is counted as time when either the vcpu is able to
+ * run as part of a virtual core, but the task running the vcore
+ * is preempted or sleeping, or when the vcpu needs something done
+ * in the kernel by the task running the vcpu, but that task is
+ * preempted or sleeping.  Those two things have to be counted
+ * separately, since one of the vcpu tasks will take on the job
+ * of running the core, and the other vcpu tasks in the vcore will
+ * sleep waiting for it to do that, but that sleep shouldn't count
+ * as stolen time.
+ *
+ * Hence we accumulate stolen time when the vcpu can run as part of
+ * a vcore using vc->stolen_tb, and the stolen time when the vcpu
+ * needs its task to do other things in the kernel (for example,
+ * service a page fault) in busy_stolen.  We don't accumulate
+ * stolen time for a vcore when it is inactive, or for a vcpu
+ * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
+ * a misnomer; it means that the vcpu task is not executing in
+ * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
+ * the kernel.  We don't have any way of dividing up that time
+ * between time that the vcpu is genuinely stopped, time that
+ * the task is actively working on behalf of the vcpu, and time
+ * that the task is preempted, so we don't count any of it as
+ * stolen.
+ *
+ * Updates to busy_stolen are protected by arch.tbacct_lock;
+ * updates to vc->stolen_tb are protected by the arch.tbacct_lock
+ * of the vcpu that has taken responsibility for running the vcore
+ * (i.e. vc->runner).  The stolen times are measured in units of
+ * timebase ticks.  (Note that the != TB_NIL checks below are
+ * purely defensive; they should never fail.)
+ */
+
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
-	local_paca->kvm_hstate.kvm_vcpu = vcpu;
-	local_paca->kvm_hstate.kvm_vcore = vc;
-	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
+	spin_lock(&vcpu->arch.tbacct_lock);
+	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE &&
+	    vc->preempt_tb != TB_NIL) {
 		vc->stolen_tb += mftb() - vc->preempt_tb;
+		vc->preempt_tb = TB_NIL;
+	}
+	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
+	    vcpu->arch.busy_preempt != TB_NIL) {
+		vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
+		vcpu->arch.busy_preempt = TB_NIL;
+	}
+	spin_unlock(&vcpu->arch.tbacct_lock);
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+	spin_lock(&vcpu->arch.tbacct_lock);
 	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
 		vc->preempt_tb = mftb();
+	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
+		vcpu->arch.busy_preempt = mftb();
+	spin_unlock(&vcpu->arch.tbacct_lock);
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
@@ -142,6 +196,22 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
 	vpa->yield_count = 1;
 }
 
+static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
+		   unsigned long addr, unsigned long len)
+{
+	/* check address is cacheline aligned */
+	if (addr & (L1_CACHE_BYTES - 1))
+		return -EINVAL;
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	if (v->next_gpa != addr || v->len != len) {
+		v->next_gpa = addr;
+		v->len = addr ? len : 0;
+		v->update_pending = 1;
+	}
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+	return 0;
+}
+
 /* Length for a per-processor buffer is passed in at offset 4 in the buffer */
 struct reg_vpa {
 	u32 dummy;
@@ -317,10 +387,16 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
 
 static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
 {
+	if (!(vcpu->arch.vpa.update_pending ||
+	      vcpu->arch.slb_shadow.update_pending ||
+	      vcpu->arch.dtl.update_pending))
+		return;
+
 	spin_lock(&vcpu->arch.vpa_update_lock);
 	if (vcpu->arch.vpa.update_pending) {
 		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
-		init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
+		if (vcpu->arch.vpa.pinned_addr)
+			init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
 	}
 	if (vcpu->arch.dtl.update_pending) {
 		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
@@ -332,24 +408,61 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
 	spin_unlock(&vcpu->arch.vpa_update_lock);
 }
 
+/*
+ * Return the accumulated stolen time for the vcore up until `now'.
+ * The caller should hold the vcore lock.
+ */
+static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
+{
+	u64 p;
+
+	/*
+	 * If we are the task running the vcore, then since we hold
+	 * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb
+	 * can't be updated, so we don't need the tbacct_lock.
+	 * If the vcore is inactive, it can't become active (since we
+	 * hold the vcore lock), so the vcpu load/put functions won't
+	 * update stolen_tb/preempt_tb, and we don't need tbacct_lock.
+	 */
+	if (vc->vcore_state != VCORE_INACTIVE &&
+	    vc->runner->arch.run_task != current) {
+		spin_lock(&vc->runner->arch.tbacct_lock);
+		p = vc->stolen_tb;
+		if (vc->preempt_tb != TB_NIL)
+			p += now - vc->preempt_tb;
+		spin_unlock(&vc->runner->arch.tbacct_lock);
+	} else {
+		p = vc->stolen_tb;
+	}
+	return p;
+}
+
 static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
 				    struct kvmppc_vcore *vc)
 {
 	struct dtl_entry *dt;
 	struct lppaca *vpa;
-	unsigned long old_stolen;
+	unsigned long stolen;
+	unsigned long core_stolen;
+	u64 now;
 
 	dt = vcpu->arch.dtl_ptr;
 	vpa = vcpu->arch.vpa.pinned_addr;
-	old_stolen = vcpu->arch.stolen_logged;
-	vcpu->arch.stolen_logged = vc->stolen_tb;
+	now = mftb();
+	core_stolen = vcore_stolen_time(vc, now);
+	stolen = core_stolen - vcpu->arch.stolen_logged;
+	vcpu->arch.stolen_logged = core_stolen;
+	spin_lock(&vcpu->arch.tbacct_lock);
+	stolen += vcpu->arch.busy_stolen;
+	vcpu->arch.busy_stolen = 0;
+	spin_unlock(&vcpu->arch.tbacct_lock);
 	if (!dt || !vpa)
 		return;
 	memset(dt, 0, sizeof(struct dtl_entry));
 	dt->dispatch_reason = 7;
 	dt->processor_id = vc->pcpu + vcpu->arch.ptid;
-	dt->timebase = mftb();
-	dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen;
+	dt->timebase = now;
+	dt->enqueue_to_dispatch_time = stolen;
 	dt->srr0 = kvmppc_get_pc(vcpu);
 	dt->srr1 = vcpu->arch.shregs.msr;
 	++dt;
@@ -366,13 +479,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 	unsigned long req = kvmppc_get_gpr(vcpu, 3);
 	unsigned long target, ret = H_SUCCESS;
 	struct kvm_vcpu *tvcpu;
+	int idx;
 
 	switch (req) {
 	case H_ENTER:
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
 		ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
 					      kvmppc_get_gpr(vcpu, 5),
 					      kvmppc_get_gpr(vcpu, 6),
 					      kvmppc_get_gpr(vcpu, 7));
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
 		break;
 	case H_CEDE:
 		break;
@@ -429,6 +545,17 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case BOOK3S_INTERRUPT_PERFMON:
 		r = RESUME_GUEST;
 		break;
+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
+		/*
+		 * Deliver a machine check interrupt to the guest.
+		 * We have to do this, even if the host has handled the
+		 * machine check, because machine checks use SRR0/1 and
+		 * the interrupt might have trashed guest state in them.
+		 */
+		kvmppc_book3s_queue_irqprio(vcpu,
+					    BOOK3S_INTERRUPT_MACHINE_CHECK);
+		r = RESUME_GUEST;
+		break;
 	case BOOK3S_INTERRUPT_PROGRAM:
 	{
 		ulong flags;
@@ -470,12 +597,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * have been handled already.
 	 */
 	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
-		r = kvmppc_book3s_hv_page_fault(run, vcpu,
-				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+		r = RESUME_PAGE_FAULT;
 		break;
 	case BOOK3S_INTERRUPT_H_INST_STORAGE:
-		r = kvmppc_book3s_hv_page_fault(run, vcpu,
-				kvmppc_get_pc(vcpu), 0);
+		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+		vcpu->arch.fault_dsisr = 0;
+		r = RESUME_PAGE_FAULT;
 		break;
 	/*
 	 * This occurs if the guest executes an illegal instruction.
@@ -535,36 +662,174 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
 {
-	int r = -EINVAL;
+	int r = 0;
+	long int i;
 
-	switch (reg->id) {
+	switch (id) {
 	case KVM_REG_PPC_HIOR:
-		r = put_user(0, (u64 __user *)reg->addr);
+		*val = get_reg_val(id, 0);
+		break;
+	case KVM_REG_PPC_DABR:
+		*val = get_reg_val(id, vcpu->arch.dabr);
+		break;
+	case KVM_REG_PPC_DSCR:
+		*val = get_reg_val(id, vcpu->arch.dscr);
+		break;
+	case KVM_REG_PPC_PURR:
+		*val = get_reg_val(id, vcpu->arch.purr);
+		break;
+	case KVM_REG_PPC_SPURR:
+		*val = get_reg_val(id, vcpu->arch.spurr);
+		break;
+	case KVM_REG_PPC_AMR:
+		*val = get_reg_val(id, vcpu->arch.amr);
+		break;
+	case KVM_REG_PPC_UAMOR:
+		*val = get_reg_val(id, vcpu->arch.uamor);
+		break;
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
+		i = id - KVM_REG_PPC_MMCR0;
+		*val = get_reg_val(id, vcpu->arch.mmcr[i]);
+		break;
+	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+		i = id - KVM_REG_PPC_PMC1;
+		*val = get_reg_val(id, vcpu->arch.pmc[i]);
+		break;
+#ifdef CONFIG_VSX
+	case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+		if (cpu_has_feature(CPU_FTR_VSX)) {
+			/* VSX => FP reg i is stored in arch.vsr[2*i] */
+			long int i = id - KVM_REG_PPC_FPR0;
+			*val = get_reg_val(id, vcpu->arch.vsr[2 * i]);
+		} else {
+			/* let generic code handle it */
+			r = -EINVAL;
+		}
+		break;
+	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+		if (cpu_has_feature(CPU_FTR_VSX)) {
+			long int i = id - KVM_REG_PPC_VSR0;
+			val->vsxval[0] = vcpu->arch.vsr[2 * i];
+			val->vsxval[1] = vcpu->arch.vsr[2 * i + 1];
+		} else {
+			r = -ENXIO;
+		}
+		break;
+#endif /* CONFIG_VSX */
+	case KVM_REG_PPC_VPA_ADDR:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		break;
+	case KVM_REG_PPC_VPA_SLB:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
+		val->vpaval.length = vcpu->arch.slb_shadow.len;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		break;
+	case KVM_REG_PPC_VPA_DTL:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
+		val->vpaval.length = vcpu->arch.dtl.len;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
 		break;
 	default:
+		r = -EINVAL;
 		break;
 	}
 
 	return r;
 }
 
-int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
 {
-	int r = -EINVAL;
+	int r = 0;
+	long int i;
+	unsigned long addr, len;
 
-	switch (reg->id) {
+	switch (id) {
 	case KVM_REG_PPC_HIOR:
-	{
-		u64 hior;
 		/* Only allow this to be set to zero */
-		r = get_user(hior, (u64 __user *)reg->addr);
-		if (!r && (hior != 0))
+		if (set_reg_val(id, *val))
 			r = -EINVAL;
 		break;
-	}
+	case KVM_REG_PPC_DABR:
+		vcpu->arch.dabr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DSCR:
+		vcpu->arch.dscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PURR:
+		vcpu->arch.purr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SPURR:
+		vcpu->arch.spurr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_AMR:
+		vcpu->arch.amr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_UAMOR:
+		vcpu->arch.uamor = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
+		i = id - KVM_REG_PPC_MMCR0;
+		vcpu->arch.mmcr[i] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+		i = id - KVM_REG_PPC_PMC1;
+		vcpu->arch.pmc[i] = set_reg_val(id, *val);
+		break;
+#ifdef CONFIG_VSX
+	case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+		if (cpu_has_feature(CPU_FTR_VSX)) {
+			/* VSX => FP reg i is stored in arch.vsr[2*i] */
+			long int i = id - KVM_REG_PPC_FPR0;
+			vcpu->arch.vsr[2 * i] = set_reg_val(id, *val);
+		} else {
+			/* let generic code handle it */
+			r = -EINVAL;
+		}
+		break;
+	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+		if (cpu_has_feature(CPU_FTR_VSX)) {
+			long int i = id - KVM_REG_PPC_VSR0;
+			vcpu->arch.vsr[2 * i] = val->vsxval[0];
+			vcpu->arch.vsr[2 * i + 1] = val->vsxval[1];
+		} else {
+			r = -ENXIO;
+		}
+		break;
+#endif /* CONFIG_VSX */
+	case KVM_REG_PPC_VPA_ADDR:
+		addr = set_reg_val(id, *val);
+		r = -EINVAL;
+		if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
+			      vcpu->arch.dtl.next_gpa))
+			break;
+		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
+		break;
+	case KVM_REG_PPC_VPA_SLB:
+		addr = val->vpaval.addr;
+		len = val->vpaval.length;
+		r = -EINVAL;
+		if (addr && !vcpu->arch.vpa.next_gpa)
+			break;
+		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
+		break;
+	case KVM_REG_PPC_VPA_DTL:
+		addr = val->vpaval.addr;
+		len = val->vpaval.length;
+		r = -EINVAL;
+		if (addr && (len < sizeof(struct dtl_entry) ||
+			     !vcpu->arch.vpa.next_gpa))
+			break;
+		len -= len % sizeof(struct dtl_entry);
+		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
+		break;
 	default:
+		r = -EINVAL;
 		break;
 	}
 
@@ -599,20 +864,18 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 		goto free_vcpu;
 
 	vcpu->arch.shared = &vcpu->arch.shregs;
-	vcpu->arch.last_cpu = -1;
 	vcpu->arch.mmcr[0] = MMCR0_FC;
 	vcpu->arch.ctrl = CTRL_RUNLATCH;
 	/* default to host PVR, since we can't spoof it */
 	vcpu->arch.pvr = mfspr(SPRN_PVR);
 	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
 	spin_lock_init(&vcpu->arch.vpa_update_lock);
+	spin_lock_init(&vcpu->arch.tbacct_lock);
+	vcpu->arch.busy_preempt = TB_NIL;
 
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
-	/*
-	 * We consider the vcpu stopped until we see the first run ioctl for it.
-	 */
-	vcpu->arch.state = KVMPPC_VCPU_STOPPED;
+	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
 
 	init_waitqueue_head(&vcpu->arch.cpu_run);
 
@@ -624,9 +887,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 			INIT_LIST_HEAD(&vcore->runnable_threads);
 			spin_lock_init(&vcore->lock);
 			init_waitqueue_head(&vcore->wq);
-			vcore->preempt_tb = mftb();
+			vcore->preempt_tb = TB_NIL;
 		}
 		kvm->arch.vcores[core] = vcore;
+		kvm->arch.online_vcores++;
 	}
 	mutex_unlock(&kvm->lock);
 
@@ -637,7 +901,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	++vcore->num_threads;
 	spin_unlock(&vcore->lock);
 	vcpu->arch.vcore = vcore;
-	vcpu->arch.stolen_logged = vcore->stolen_tb;
 
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -697,17 +960,18 @@ extern void xics_wake_cpu(int cpu);
 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 				   struct kvm_vcpu *vcpu)
 {
-	struct kvm_vcpu *v;
+	u64 now;
 
 	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
 		return;
+	spin_lock(&vcpu->arch.tbacct_lock);
+	now = mftb();
+	vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
+		vcpu->arch.stolen_logged;
+	vcpu->arch.busy_preempt = now;
 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	spin_unlock(&vcpu->arch.tbacct_lock);
 	--vc->n_runnable;
-	++vc->n_busy;
-	/* decrement the physical thread id of each following vcpu */
-	v = vcpu;
-	list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
-		--v->arch.ptid;
 	list_del(&vcpu->arch.run_list);
 }
 
@@ -720,6 +984,7 @@ static int kvmppc_grab_hwthread(int cpu)
 
 	/* Ensure the thread won't go into the kernel if it wakes */
 	tpaca->kvm_hstate.hwthread_req = 1;
+	tpaca->kvm_hstate.kvm_vcpu = NULL;
 
 	/*
 	 * If the thread is already executing in the kernel (e.g. handling
@@ -769,7 +1034,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 	smp_wmb();
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
 	if (vcpu->arch.ptid) {
-		kvmppc_grab_hwthread(cpu);
 		xics_wake_cpu(cpu);
 		++vc->n_woken;
 	}
@@ -795,7 +1059,8 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
 
 /*
  * Check that we are on thread 0 and that any other threads in
- * this core are off-line.
+ * this core are off-line.  Then grab the threads so they can't
+ * enter the kernel.
  */
 static int on_primary_thread(void)
 {
@@ -807,6 +1072,17 @@ static int on_primary_thread(void)
 	while (++thr < threads_per_core)
 		if (cpu_online(cpu + thr))
 			return 0;
+
+	/* Grab all hw threads so they can't go into the kernel */
+	for (thr = 1; thr < threads_per_core; ++thr) {
+		if (kvmppc_grab_hwthread(cpu + thr)) {
+			/* Couldn't grab one; let the others go */
+			do {
+				kvmppc_release_hwthread(cpu + thr);
+			} while (--thr > 0);
+			return 0;
+		}
+	}
 	return 1;
 }
 
@@ -814,21 +1090,24 @@ static int on_primary_thread(void)
  * Run a set of guest threads on a physical core.
  * Called with vc->lock held.
  */
-static int kvmppc_run_core(struct kvmppc_vcore *vc)
+static void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
 	struct kvm_vcpu *vcpu, *vcpu0, *vnext;
 	long ret;
 	u64 now;
 	int ptid, i, need_vpa_update;
+	int srcu_idx;
+	struct kvm_vcpu *vcpus_to_update[threads_per_core];
 
 	/* don't start if any threads have a signal pending */
 	need_vpa_update = 0;
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
 		if (signal_pending(vcpu->arch.run_task))
-			return 0;
-		need_vpa_update |= vcpu->arch.vpa.update_pending |
-			vcpu->arch.slb_shadow.update_pending |
-			vcpu->arch.dtl.update_pending;
+			return;
+		if (vcpu->arch.vpa.update_pending ||
+		    vcpu->arch.slb_shadow.update_pending ||
+		    vcpu->arch.dtl.update_pending)
+			vcpus_to_update[need_vpa_update++] = vcpu;
 	}
 
 	/*
@@ -838,7 +1117,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->n_woken = 0;
 	vc->nap_count = 0;
 	vc->entry_exit_count = 0;
-	vc->vcore_state = VCORE_RUNNING;
+	vc->vcore_state = VCORE_STARTING;
 	vc->in_guest = 0;
 	vc->napping_threads = 0;
 
@@ -848,24 +1127,12 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	 */
 	if (need_vpa_update) {
 		spin_unlock(&vc->lock);
-		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-			kvmppc_update_vpas(vcpu);
+		for (i = 0; i < need_vpa_update; ++i)
+			kvmppc_update_vpas(vcpus_to_update[i]);
 		spin_lock(&vc->lock);
 	}
 
 	/*
-	 * Make sure we are running on thread 0, and that
-	 * secondary threads are offline.
-	 * XXX we should also block attempts to bring any
-	 * secondary threads online.
-	 */
-	if (threads_per_core > 1 && !on_primary_thread()) {
-		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-			vcpu->arch.ret = -EBUSY;
-		goto out;
-	}
-
-	/*
 	 * Assign physical thread IDs, first to non-ceded vcpus
 	 * and then to ceded ones.
 	 */
@@ -879,28 +1146,36 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 		}
 	}
 	if (!vcpu0)
-		return 0;		/* nothing to run */
+		goto out;	/* nothing to run; should never happen */
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
 		if (vcpu->arch.ceded)
 			vcpu->arch.ptid = ptid++;
 
-	vc->stolen_tb += mftb() - vc->preempt_tb;
+	/*
+	 * Make sure we are running on thread 0, and that
+	 * secondary threads are offline.
+	 */
+	if (threads_per_core > 1 && !on_primary_thread()) {
+		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+			vcpu->arch.ret = -EBUSY;
+		goto out;
+	}
+
 	vc->pcpu = smp_processor_id();
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
 		kvmppc_start_thread(vcpu);
 		kvmppc_create_dtl_entry(vcpu, vc);
 	}
-	/* Grab any remaining hw threads so they can't go into the kernel */
-	for (i = ptid; i < threads_per_core; ++i)
-		kvmppc_grab_hwthread(vc->pcpu + i);
 
+	vc->vcore_state = VCORE_RUNNING;
 	preempt_disable();
 	spin_unlock(&vc->lock);
 
 	kvm_guest_enter();
+
+	srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu);
+
 	__kvmppc_vcore_entry(NULL, vcpu0);
-	for (i = 0; i < threads_per_core; ++i)
-		kvmppc_release_hwthread(vc->pcpu + i);
 
 	spin_lock(&vc->lock);
 	/* disable sending of IPIs on virtual external irqs */
@@ -909,10 +1184,14 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	/* wait for secondary threads to finish writing their state to memory */
 	if (vc->nap_count < vc->n_woken)
 		kvmppc_wait_for_nap(vc);
+	for (i = 0; i < threads_per_core; ++i)
+		kvmppc_release_hwthread(vc->pcpu + i);
 	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
 	vc->vcore_state = VCORE_EXITING;
 	spin_unlock(&vc->lock);
 
+	srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx);
+
 	/* make sure updates to secondary vcpu structs are visible now */
 	smp_mb();
 	kvm_guest_exit();
@@ -920,6 +1199,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	preempt_enable();
 	kvm_resched(vcpu);
 
+	spin_lock(&vc->lock);
 	now = get_tb();
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
 		/* cancel pending dec exception if dec is positive */
@@ -943,10 +1223,8 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 		}
 	}
 
-	spin_lock(&vc->lock);
  out:
 	vc->vcore_state = VCORE_INACTIVE;
-	vc->preempt_tb = mftb();
 	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
 				 arch.run_list) {
 		if (vcpu->arch.ret != RESUME_GUEST) {
@@ -954,8 +1232,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 			wake_up(&vcpu->arch.cpu_run);
 		}
 	}
-
-	return 1;
 }
 
 /*
@@ -979,20 +1255,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
 	DEFINE_WAIT(wait);
-	struct kvm_vcpu *v;
-	int all_idle = 1;
 
 	prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 	vc->vcore_state = VCORE_SLEEPING;
 	spin_unlock(&vc->lock);
-	list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
-		if (!v->arch.ceded || v->arch.pending_exceptions) {
-			all_idle = 0;
-			break;
-		}
-	}
-	if (all_idle)
-		schedule();
+	schedule();
 	finish_wait(&vc->wq, &wait);
 	spin_lock(&vc->lock);
 	vc->vcore_state = VCORE_INACTIVE;
@@ -1001,13 +1268,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	int n_ceded;
-	int prev_state;
 	struct kvmppc_vcore *vc;
 	struct kvm_vcpu *v, *vn;
 
 	kvm_run->exit_reason = 0;
 	vcpu->arch.ret = RESUME_GUEST;
 	vcpu->arch.trap = 0;
+	kvmppc_update_vpas(vcpu);
 
 	/*
 	 * Synchronize with other threads in this virtual core
@@ -1017,8 +1284,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	vcpu->arch.ceded = 0;
 	vcpu->arch.run_task = current;
 	vcpu->arch.kvm_run = kvm_run;
-	prev_state = vcpu->arch.state;
+	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
 	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+	vcpu->arch.busy_preempt = TB_NIL;
 	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
 	++vc->n_runnable;
 
@@ -1027,33 +1295,26 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	 * If the vcore is already running, we may be able to start
 	 * this thread straight away and have it join in.
 	 */
-	if (prev_state == KVMPPC_VCPU_STOPPED) {
+	if (!signal_pending(current)) {
 		if (vc->vcore_state == VCORE_RUNNING &&
 		    VCORE_EXIT_COUNT(vc) == 0) {
 			vcpu->arch.ptid = vc->n_runnable - 1;
+			kvmppc_create_dtl_entry(vcpu, vc);
 			kvmppc_start_thread(vcpu);
+		} else if (vc->vcore_state == VCORE_SLEEPING) {
+			wake_up(&vc->wq);
 		}
 
-	} else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
-		--vc->n_busy;
+	}
 
 	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
 	       !signal_pending(current)) {
-		if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
+		if (vc->vcore_state != VCORE_INACTIVE) {
 			spin_unlock(&vc->lock);
 			kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
 			spin_lock(&vc->lock);
 			continue;
 		}
-		vc->runner = vcpu;
-		n_ceded = 0;
-		list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
-			n_ceded += v->arch.ceded;
-		if (n_ceded == vc->n_runnable)
-			kvmppc_vcore_blocked(vc);
-		else
-			kvmppc_run_core(vc);
-
 		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
 					 arch.run_list) {
 			kvmppc_core_prepare_to_enter(v);
@@ -1065,22 +1326,40 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 				wake_up(&v->arch.cpu_run);
 			}
 		}
+		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+			break;
+		vc->runner = vcpu;
+		n_ceded = 0;
+		list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+			if (!v->arch.pending_exceptions)
+				n_ceded += v->arch.ceded;
+		if (n_ceded == vc->n_runnable)
+			kvmppc_vcore_blocked(vc);
+		else
+			kvmppc_run_core(vc);
 		vc->runner = NULL;
 	}
 
-	if (signal_pending(current)) {
-		if (vc->vcore_state == VCORE_RUNNING ||
-		    vc->vcore_state == VCORE_EXITING) {
-			spin_unlock(&vc->lock);
-			kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
-			spin_lock(&vc->lock);
-		}
-		if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
-			kvmppc_remove_runnable(vc, vcpu);
-			vcpu->stat.signal_exits++;
-			kvm_run->exit_reason = KVM_EXIT_INTR;
-			vcpu->arch.ret = -EINTR;
-		}
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+	       (vc->vcore_state == VCORE_RUNNING ||
+		vc->vcore_state == VCORE_EXITING)) {
+		spin_unlock(&vc->lock);
+		kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
+		spin_lock(&vc->lock);
+	}
+
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		kvmppc_remove_runnable(vc, vcpu);
+		vcpu->stat.signal_exits++;
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		vcpu->arch.ret = -EINTR;
+	}
+
+	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
+		/* Wake up some vcpu to run the core */
+		v = list_first_entry(&vc->runnable_threads,
+				     struct kvm_vcpu, arch.run_list);
+		wake_up(&v->arch.cpu_run);
 	}
 
 	spin_unlock(&vc->lock);
@@ -1090,6 +1369,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	int r;
+	int srcu_idx;
 
 	if (!vcpu->arch.sane) {
 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1120,6 +1400,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	flush_vsx_to_thread(current);
 	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
 	vcpu->arch.pgdir = current->mm->pgd;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 
 	do {
 		r = kvmppc_run_vcpu(run, vcpu);
@@ -1128,10 +1409,16 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		    !(vcpu->arch.shregs.msr & MSR_PR)) {
 			r = kvmppc_pseries_do_hcall(vcpu);
 			kvmppc_core_prepare_to_enter(vcpu);
+		} else if (r == RESUME_PAGE_FAULT) {
+			srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+			r = kvmppc_book3s_hv_page_fault(run, vcpu,
+				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+			srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
 		}
 	} while (r == RESUME_GUEST);
 
  out:
+	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
 	atomic_dec(&vcpu->kvm->arch.vcpus_running);
 	return r;
 }
@@ -1273,7 +1560,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	n = kvm_dirty_bitmap_bytes(memslot);
 	memset(memslot->dirty_bitmap, 0, n);
 
-	r = kvmppc_hv_get_dirty_log(kvm, memslot);
+	r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
 	if (r)
 		goto out;
 
@@ -1287,67 +1574,88 @@ out:
 	return r;
 }
 
-static unsigned long slb_pgsize_encoding(unsigned long psize)
+static void unpin_slot(struct kvm_memory_slot *memslot)
 {
-	unsigned long senc = 0;
+	unsigned long *physp;
+	unsigned long j, npages, pfn;
+	struct page *page;
 
-	if (psize > 0x1000) {
-		senc = SLB_VSID_L;
-		if (psize == 0x10000)
-			senc |= SLB_VSID_LP_01;
+	physp = memslot->arch.slot_phys;
+	npages = memslot->npages;
+	if (!physp)
+		return;
+	for (j = 0; j < npages; j++) {
+		if (!(physp[j] & KVMPPC_GOT_PAGE))
+			continue;
+		pfn = physp[j] >> PAGE_SHIFT;
+		page = pfn_to_page(pfn);
+		SetPageDirty(page);
+		put_page(page);
+	}
+}
+
+void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+			      struct kvm_memory_slot *dont)
+{
+	if (!dont || free->arch.rmap != dont->arch.rmap) {
+		vfree(free->arch.rmap);
+		free->arch.rmap = NULL;
+	}
+	if (!dont || free->arch.slot_phys != dont->arch.slot_phys) {
+		unpin_slot(free);
+		vfree(free->arch.slot_phys);
+		free->arch.slot_phys = NULL;
 	}
-	return senc;
+}
+
+int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+			       unsigned long npages)
+{
+	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
+	if (!slot->arch.rmap)
+		return -ENOMEM;
+	slot->arch.slot_phys = NULL;
+
+	return 0;
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem)
+				      struct kvm_memory_slot *memslot,
+				      struct kvm_userspace_memory_region *mem)
 {
-	unsigned long npages;
 	unsigned long *phys;
 
-	/* Allocate a slot_phys array */
-	phys = kvm->arch.slot_phys[mem->slot];
-	if (!kvm->arch.using_mmu_notifiers && !phys) {
-		npages = mem->memory_size >> PAGE_SHIFT;
-		phys = vzalloc(npages * sizeof(unsigned long));
+	/* Allocate a slot_phys array if needed */
+	phys = memslot->arch.slot_phys;
+	if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) {
+		phys = vzalloc(memslot->npages * sizeof(unsigned long));
 		if (!phys)
 			return -ENOMEM;
-		kvm->arch.slot_phys[mem->slot] = phys;
-		kvm->arch.slot_npages[mem->slot] = npages;
+		memslot->arch.slot_phys = phys;
 	}
 
 	return 0;
 }
 
-static void unpin_slot(struct kvm *kvm, int slot_id)
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				      struct kvm_userspace_memory_region *mem,
+				      struct kvm_memory_slot old)
 {
-	unsigned long *physp;
-	unsigned long j, npages, pfn;
-	struct page *page;
+	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot;
 
-	physp = kvm->arch.slot_phys[slot_id];
-	npages = kvm->arch.slot_npages[slot_id];
-	if (physp) {
-		spin_lock(&kvm->arch.slot_phys_lock);
-		for (j = 0; j < npages; j++) {
-			if (!(physp[j] & KVMPPC_GOT_PAGE))
-				continue;
-			pfn = physp[j] >> PAGE_SHIFT;
-			page = pfn_to_page(pfn);
-			SetPageDirty(page);
-			put_page(page);
-		}
-		kvm->arch.slot_phys[slot_id] = NULL;
-		spin_unlock(&kvm->arch.slot_phys_lock);
-		vfree(physp);
+	if (npages && old.npages) {
+		/*
+		 * If modifying a memslot, reset all the rmap dirty bits.
+		 * If this is a new memslot, we don't need to do anything
+		 * since the rmap array starts out as all zeroes,
+		 * i.e. no pages are dirty.
+		 */
+		memslot = id_to_memslot(kvm->memslots, mem->slot);
+		kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
 	}
 }
 
-void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem)
-{
-}
-
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
 	int err = 0;
@@ -1362,6 +1670,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	unsigned long rmls;
 	unsigned long *physp;
 	unsigned long i, npages;
+	int srcu_idx;
 
 	mutex_lock(&kvm->lock);
 	if (kvm->arch.rma_setup_done)
@@ -1377,12 +1686,13 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	}
 
 	/* Look up the memslot for guest physical address 0 */
+	srcu_idx = srcu_read_lock(&kvm->srcu);
 	memslot = gfn_to_memslot(kvm, 0);
 
 	/* We must have some memory at 0 by now */
 	err = -EINVAL;
 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
-		goto out;
+		goto out_srcu;
 
 	/* Look up the VMA for the start of this memory slot */
 	hva = memslot->userspace_addr;
@@ -1406,14 +1716,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 		err = -EPERM;
 		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
 			pr_err("KVM: CPU requires an RMO\n");
-			goto out;
+			goto out_srcu;
 		}
 
 		/* We can handle 4k, 64k or 16M pages in the VRMA */
 		err = -EINVAL;
 		if (!(psize == 0x1000 || psize == 0x10000 ||
 		      psize == 0x1000000))
-			goto out;
+			goto out_srcu;
 
 		/* Update VRMASD field in the LPCR */
 		senc = slb_pgsize_encoding(psize);
@@ -1436,7 +1746,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 		err = -EINVAL;
 		if (rmls < 0) {
 			pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
-			goto out;
+			goto out_srcu;
 		}
 		atomic_inc(&ri->use_count);
 		kvm->arch.rma = ri;
@@ -1465,17 +1775,24 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 		/* Initialize phys addrs of pages in RMO */
 		npages = ri->npages;
 		porder = __ilog2(npages);
-		physp = kvm->arch.slot_phys[memslot->id];
-		spin_lock(&kvm->arch.slot_phys_lock);
-		for (i = 0; i < npages; ++i)
-			physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder;
-		spin_unlock(&kvm->arch.slot_phys_lock);
+		physp = memslot->arch.slot_phys;
+		if (physp) {
+			if (npages > memslot->npages)
+				npages = memslot->npages;
+			spin_lock(&kvm->arch.slot_phys_lock);
+			for (i = 0; i < npages; ++i)
+				physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) +
+					porder;
+			spin_unlock(&kvm->arch.slot_phys_lock);
+		}
 	}
 
 	/* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
 	smp_wmb();
 	kvm->arch.rma_setup_done = 1;
 	err = 0;
+ out_srcu:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
  out:
 	mutex_unlock(&kvm->lock);
 	return err;
@@ -1496,6 +1813,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.lpid = lpid;
 
+	/*
+	 * Since we don't flush the TLB when tearing down a VM,
+	 * and this lpid might have previously been used,
+	 * make sure we flush on each core before running the new VM.
+	 */
+	cpumask_setall(&kvm->arch.need_tlb_flush);
+
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
 	kvm->arch.rma = NULL;
@@ -1523,16 +1847,19 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
 	spin_lock_init(&kvm->arch.slot_phys_lock);
+
+	/*
+	 * Don't allow secondary CPU threads to come online
+	 * while any KVM VMs exist.
+	 */
+	inhibit_secondary_onlining();
+
 	return 0;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
-	unsigned long i;
-
-	if (!kvm->arch.using_mmu_notifiers)
-		for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
-			unpin_slot(kvm, i);
+	uninhibit_secondary_onlining();
 
 	if (kvm->arch.rma) {
 		kvm_release_rma(kvm->arch.rma);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fb4eac290fe..ec0a9e5de10 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -157,8 +157,8 @@ static void __init kvm_linear_init_one(ulong size, int count, int type)
 	linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
 	for (i = 0; i < count; ++i) {
 		linear = alloc_bootmem_align(size, size);
-		pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
-			size >> 20);
+		pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
+			 size >> 20);
 		linear_info[i].base_virt = linear;
 		linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
 		linear_info[i].npages = npages;
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
new file mode 100644
index 00000000000..35f3cf0269b
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -0,0 +1,144 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <asm/opal.h>
+
+/* SRR1 bits for machine check on POWER7 */
+#define SRR1_MC_LDSTERR		(1ul << (63-42))
+#define SRR1_MC_IFETCH_SH	(63-45)
+#define SRR1_MC_IFETCH_MASK	0x7
+#define SRR1_MC_IFETCH_SLBPAR		2	/* SLB parity error */
+#define SRR1_MC_IFETCH_SLBMULTI		3	/* SLB multi-hit */
+#define SRR1_MC_IFETCH_SLBPARMULTI	4	/* SLB parity + multi-hit */
+#define SRR1_MC_IFETCH_TLBMULTI		5	/* I-TLB multi-hit */
+
+/* DSISR bits for machine check on POWER7 */
+#define DSISR_MC_DERAT_MULTI	0x800		/* D-ERAT multi-hit */
+#define DSISR_MC_TLB_MULTI	0x400		/* D-TLB multi-hit */
+#define DSISR_MC_SLB_PARITY	0x100		/* SLB parity error */
+#define DSISR_MC_SLB_MULTI	0x080		/* SLB multi-hit */
+#define DSISR_MC_SLB_PARMULTI	0x040		/* SLB parity + multi-hit */
+
+/* POWER7 SLB flush and reload */
+static void reload_slb(struct kvm_vcpu *vcpu)
+{
+	struct slb_shadow *slb;
+	unsigned long i, n;
+
+	/* First clear out SLB */
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+	/* Do they have an SLB shadow buffer registered? */
+	slb = vcpu->arch.slb_shadow.pinned_addr;
+	if (!slb)
+		return;
+
+	/* Sanity check */
+	n = min_t(u32, slb->persistent, SLB_MIN_SIZE);
+	if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end)
+		return;
+
+	/* Load up the SLB from that */
+	for (i = 0; i < n; ++i) {
+		unsigned long rb = slb->save_area[i].esid;
+		unsigned long rs = slb->save_area[i].vsid;
+
+		rb = (rb & ~0xFFFul) | i;	/* insert entry number */
+		asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+	}
+}
+
+/* POWER7 TLB flush */
+static void flush_tlb_power7(struct kvm_vcpu *vcpu)
+{
+	unsigned long i, rb;
+
+	rb = TLBIEL_INVAL_SET_LPID;
+	for (i = 0; i < POWER7_TLB_SETS; ++i) {
+		asm volatile("tlbiel %0" : : "r" (rb));
+		rb += 1 << TLBIEL_INVAL_SET_SHIFT;
+	}
+}
+
+/*
+ * On POWER7, see if we can handle a machine check that occurred inside
+ * the guest in real mode, without switching to the host partition.
+ *
+ * Returns: 0 => exit guest, 1 => deliver machine check to guest
+ */
+static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
+{
+	unsigned long srr1 = vcpu->arch.shregs.msr;
+	struct opal_machine_check_event *opal_evt;
+	long handled = 1;
+
+	if (srr1 & SRR1_MC_LDSTERR) {
+		/* error on load/store */
+		unsigned long dsisr = vcpu->arch.shregs.dsisr;
+
+		if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
+			     DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) {
+			/* flush and reload SLB; flushes D-ERAT too */
+			reload_slb(vcpu);
+			dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
+				   DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI);
+		}
+		if (dsisr & DSISR_MC_TLB_MULTI) {
+			flush_tlb_power7(vcpu);
+			dsisr &= ~DSISR_MC_TLB_MULTI;
+		}
+		/* Any other errors we don't understand? */
+		if (dsisr & 0xffffffffUL)
+			handled = 0;
+	}
+
+	switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) {
+	case 0:
+		break;
+	case SRR1_MC_IFETCH_SLBPAR:
+	case SRR1_MC_IFETCH_SLBMULTI:
+	case SRR1_MC_IFETCH_SLBPARMULTI:
+		reload_slb(vcpu);
+		break;
+	case SRR1_MC_IFETCH_TLBMULTI:
+		flush_tlb_power7(vcpu);
+		break;
+	default:
+		handled = 0;
+	}
+
+	/*
+	 * See if OPAL has already handled the condition.
+	 * We assume that if the condition is recovered then OPAL
+	 * will have generated an error log event that we will pick
+	 * up and log later.
+	 */
+	opal_evt = local_paca->opal_mc_evt;
+	if (opal_evt->version == OpalMCE_V1 &&
+	    (opal_evt->severity == OpalMCE_SEV_NO_ERROR ||
+	     opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED))
+		handled = 1;
+
+	if (handled)
+		opal_evt->in_use = 0;
+
+	return handled;
+}
+
+long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_206))
+		return kvmppc_realmode_mc_power7(vcpu);
+
+	return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fb0e821622d..19c93bae1ae 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
 	return __va(addr);
 }
 
+/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
+static int global_invalidates(struct kvm *kvm, unsigned long flags)
+{
+	int global;
+
+	/*
+	 * If there is only one vcore, and it's currently running,
+	 * we can use tlbiel as long as we mark all other physical
+	 * cores as potentially having stale TLB entries for this lpid.
+	 * If we're not using MMU notifiers, we never take pages away
+	 * from the guest, so we can use tlbiel if requested.
+	 * Otherwise, don't use tlbiel.
+	 */
+	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
+		global = 0;
+	else if (kvm->arch.using_mmu_notifiers)
+		global = 1;
+	else
+		global = !(flags & H_LOCAL);
+
+	if (!global) {
+		/* any other core might now have stale TLB entries... */
+		smp_wmb();
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
+				  &kvm->arch.need_tlb_flush);
+	}
+
+	return global;
+}
+
 /*
  * Add this HPTE into the chain for the real page.
  * Must be called with the chain locked; it unlocks the chain.
@@ -59,13 +90,24 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 		head->back = pte_index;
 	} else {
 		rev->forw = rev->back = pte_index;
-		i = pte_index;
+		*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
+			pte_index | KVMPPC_RMAP_PRESENT;
 	}
-	smp_wmb();
-	*rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
+	unlock_rmap(rmap);
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+					  struct revmap_entry *rev)
+{
+	if (atomic_read(&kvm->arch.hpte_mod_interest))
+		rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 				struct revmap_entry *rev,
@@ -81,7 +123,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	ptel = rev->guest_rpte |= rcbits;
 	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
 	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
-	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+	if (!memslot)
 		return;
 
 	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
@@ -103,14 +145,14 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	unlock_rmap(rmap);
 }
 
-static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva,
+static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
 			      int writing, unsigned long *pte_sizep)
 {
 	pte_t *ptep;
 	unsigned long ps = *pte_sizep;
 	unsigned int shift;
 
-	ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift);
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
 	if (!ptep)
 		return __pte(0);
 	if (shift)
@@ -130,15 +172,15 @@ static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
 	hpte[0] = hpte_v;
 }
 
-long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
-		    long pte_index, unsigned long pteh, unsigned long ptel)
+long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
+		       long pte_index, unsigned long pteh, unsigned long ptel,
+		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
 {
-	struct kvm *kvm = vcpu->kvm;
 	unsigned long i, pa, gpa, gfn, psize;
 	unsigned long slot_fn, hva;
 	unsigned long *hpte;
 	struct revmap_entry *rev;
-	unsigned long g_ptel = ptel;
+	unsigned long g_ptel;
 	struct kvm_memory_slot *memslot;
 	unsigned long *physp, pte_size;
 	unsigned long is_io;
@@ -147,13 +189,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned int writing;
 	unsigned long mmu_seq;
 	unsigned long rcbits;
-	bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING;
 
 	psize = hpte_page_size(pteh, ptel);
 	if (!psize)
 		return H_PARAMETER;
 	writing = hpte_is_writable(ptel);
 	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
+	ptel &= ~HPTE_GR_RESERVED;
+	g_ptel = ptel;
 
 	/* used later to detect if we might have been invalidated */
 	mmu_seq = kvm->mmu_notifier_seq;
@@ -183,7 +226,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	rmap = &memslot->arch.rmap[slot_fn];
 
 	if (!kvm->arch.using_mmu_notifiers) {
-		physp = kvm->arch.slot_phys[memslot->id];
+		physp = memslot->arch.slot_phys;
 		if (!physp)
 			return H_PARAMETER;
 		physp += slot_fn;
@@ -201,7 +244,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 
 		/* Look up the Linux PTE for the backing page */
 		pte_size = psize;
-		pte = lookup_linux_pte(vcpu, hva, writing, &pte_size);
+		pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
 		if (pte_present(pte)) {
 			if (writing && !pte_write(pte))
 				/* make the actual HPTE be read-only */
@@ -210,6 +253,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			pa = pte_pfn(pte) << PAGE_SHIFT;
 		}
 	}
+
 	if (pte_size < psize)
 		return H_PARAMETER;
 	if (pa && pte_size > psize)
@@ -287,8 +331,10 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	rev = &kvm->arch.revmap[pte_index];
 	if (realmode)
 		rev = real_vmalloc_addr(rev);
-	if (rev)
+	if (rev) {
 		rev->guest_rpte = g_ptel;
+		note_hpte_modification(kvm, rev);
+	}
 
 	/* Link HPTE into reverse-map chain */
 	if (pteh & HPTE_V_VALID) {
@@ -297,7 +343,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		lock_rmap(rmap);
 		/* Check for pending invalidations under the rmap chain lock */
 		if (kvm->arch.using_mmu_notifiers &&
-		    mmu_notifier_retry(vcpu, mmu_seq)) {
+		    mmu_notifier_retry(kvm, mmu_seq)) {
 			/* inval in progress, write a non-present HPTE */
 			pteh |= HPTE_V_ABSENT;
 			pteh &= ~HPTE_V_VALID;
@@ -318,10 +364,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	hpte[0] = pteh;
 	asm volatile("ptesync" : : : "memory");
 
-	vcpu->arch.gpr[4] = pte_index;
+	*pte_idx_ret = pte_index;
 	return H_SUCCESS;
 }
-EXPORT_SYMBOL_GPL(kvmppc_h_enter);
+EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);
+
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+		    long pte_index, unsigned long pteh, unsigned long ptel)
+{
+	return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
+				 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
+}
 
 #define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
 
@@ -343,11 +396,10 @@ static inline int try_lock_tlbie(unsigned int *lock)
 	return old == 0;
 }
 
-long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
-		     unsigned long pte_index, unsigned long avpn,
-		     unsigned long va)
+long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+			unsigned long pte_index, unsigned long avpn,
+			unsigned long *hpret)
 {
-	struct kvm *kvm = vcpu->kvm;
 	unsigned long *hpte;
 	unsigned long v, r, rb;
 	struct revmap_entry *rev;
@@ -369,7 +421,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		hpte[0] &= ~HPTE_V_VALID;
 		rb = compute_tlbie_rb(v, hpte[1], pte_index);
-		if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
+		if (global_invalidates(kvm, flags)) {
 			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
 				cpu_relax();
 			asm volatile("ptesync" : : : "memory");
@@ -385,13 +437,22 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
 		/* Read PTE low word after tlbie to get final R/C values */
 		remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
 	}
-	r = rev->guest_rpte;
+	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
+	note_hpte_modification(kvm, rev);
 	unlock_hpte(hpte, 0);
 
-	vcpu->arch.gpr[4] = v;
-	vcpu->arch.gpr[5] = r;
+	hpret[0] = v;
+	hpret[1] = r;
 	return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+		     unsigned long pte_index, unsigned long avpn)
+{
+	return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
+				  &vcpu->arch.gpr[4]);
+}
 
 long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 {
@@ -459,6 +520,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 
 			args[j] = ((0x80 | flags) << 56) + pte_index;
 			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+			note_hpte_modification(kvm, rev);
 
 			if (!(hp[0] & HPTE_V_VALID)) {
 				/* insert R and C bits from PTE */
@@ -534,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 		return H_NOT_FOUND;
 	}
 
-	if (atomic_read(&kvm->online_vcpus) == 1)
-		flags |= H_LOCAL;
 	v = hpte[0];
 	bits = (flags << 55) & HPTE_R_PP0;
 	bits |= (flags << 48) & HPTE_R_KEY_HI;
@@ -548,6 +608,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (rev) {
 		r = (rev->guest_rpte & ~mask) | bits;
 		rev->guest_rpte = r;
+		note_hpte_modification(kvm, rev);
 	}
 	r = (hpte[1] & ~mask) | bits;
 
@@ -555,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	if (v & HPTE_V_VALID) {
 		rb = compute_tlbie_rb(v, r, pte_index);
 		hpte[0] = v & ~HPTE_V_VALID;
-		if (!(flags & H_LOCAL)) {
+		if (global_invalidates(kvm, flags)) {
 			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
 				cpu_relax();
 			asm volatile("ptesync" : : : "memory");
@@ -568,6 +629,28 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 			asm volatile("tlbiel %0" : : "r" (rb));
 			asm volatile("ptesync" : : : "memory");
 		}
+		/*
+		 * If the host has this page as readonly but the guest
+		 * wants to make it read/write, reduce the permissions.
+		 * Checking the host permissions involves finding the
+		 * memslot and then the Linux PTE for the page.
+		 */
+		if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
+			unsigned long psize, gfn, hva;
+			struct kvm_memory_slot *memslot;
+			pgd_t *pgdir = vcpu->arch.pgdir;
+			pte_t pte;
+
+			psize = hpte_page_size(v, r);
+			gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
+			memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+			if (memslot) {
+				hva = __gfn_to_hva_memslot(memslot, gfn);
+				pte = lookup_linux_pte(pgdir, hva, 1, &psize);
+				if (pte_present(pte) && !pte_write(pte))
+					r = hpte_make_readonly(r);
+			}
+		}
 	}
 	hpte[1] = r;
 	eieio();
@@ -599,8 +682,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 			v &= ~HPTE_V_ABSENT;
 			v |= HPTE_V_VALID;
 		}
-		if (v & HPTE_V_VALID)
+		if (v & HPTE_V_VALID) {
 			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
+			r &= ~HPTE_GR_RESERVED;
+		}
 		vcpu->arch.gpr[4 + i * 2] = v;
 		vcpu->arch.gpr[5 + i * 2] = r;
 	}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 74a24bbb963..10b6c358dd7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -27,6 +27,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/exception-64s.h>
 #include <asm/kvm_book3s_asm.h>
+#include <asm/mmu-hash64.h>
 
 /*****************************************************************************
  *                                                                           *
@@ -134,8 +135,11 @@ kvm_start_guest:
 
 27:	/* XXX should handle hypervisor maintenance interrupts etc. here */
 
+	/* reload vcpu pointer after clearing the IPI */
+	ld	r4,HSTATE_KVM_VCPU(r13)
+	cmpdi	r4,0
 	/* if we have no vcpu to run, go back to sleep */
-	beq	cr1,kvm_no_guest
+	beq	kvm_no_guest
 
 	/* were we napping due to cede? */
 	lbz	r0,HSTATE_NAPPING(r13)
@@ -310,7 +314,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
+
+	/* See if we need to flush the TLB */
+	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
+	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
+	srdi	r6,r6,6			/* doubleword number */
+	sldi	r6,r6,3			/* address offset */
+	add	r6,r6,r9
+	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
 	li	r0,1
+	sld	r0,r0,r7
+	ld	r7,0(r6)
+	and.	r7,r7,r0
+	beq	22f
+23:	ldarx	r7,0,r6			/* if set, clear the bit */
+	andc	r7,r7,r0
+	stdcx.	r7,0,r6
+	bne	23b
+	li	r6,128			/* and flush the TLB */
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+28:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	28b
+	ptesync
+
+22:	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
 	b	10f
 
@@ -333,36 +363,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mr	r9,r4
 	blt	hdec_soon
 
-	/*
-	 * Invalidate the TLB if we could possibly have stale TLB
-	 * entries for this partition on this core due to the use
-	 * of tlbiel.
-	 * XXX maybe only need this on primary thread?
-	 */
-	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
-	lwz	r5,VCPU_VCPUID(r4)
-	lhz	r6,PACAPACAINDEX(r13)
-	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
-	lhz	r8,VCPU_LAST_CPU(r4)
-	sldi	r7,r6,1			/* see if this is the same vcpu */
-	add	r7,r7,r9		/* as last ran on this pcpu */
-	lhz	r0,KVM_LAST_VCPU(r7)
-	cmpw	r6,r8			/* on the same cpu core as last time? */
-	bne	3f
-	cmpw	r0,r5			/* same vcpu as this core last ran? */
-	beq	1f
-3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
-	sth	r5,KVM_LAST_VCPU(r7)
-	li	r6,128
-	mtctr	r6
-	li	r7,0x800		/* IS field = 0b10 */
-	ptesync
-2:	tlbiel	r7
-	addi	r7,r7,0x1000
-	bdnz	2b
-	ptesync
-1:
-
 	/* Save purr/spurr */
 	mfspr	r5,SPRN_PURR
 	mfspr	r6,SPRN_SPURR
@@ -679,8 +679,7 @@ BEGIN_FTR_SECTION
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
-nohpte_cont:
-hcall_real_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
+guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	/* Save DEC */
 	mfspr	r5,SPRN_DEC
 	mftb	r6
@@ -701,6 +700,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	std	r6, VCPU_FAULT_DAR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 
+	/* See if it is a machine check */
+	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	machine_check_realmode
+mc_cont:
+
 	/* Save guest CTRL register, set runlatch to 1 */
 6:	mfspr	r6,SPRN_CTRLF
 	stw	r6,VCPU_CTRL(r9)
@@ -1113,38 +1117,41 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	/*
 	 * For external and machine check interrupts, we need
 	 * to call the Linux handler to process the interrupt.
-	 * We do that by jumping to the interrupt vector address
-	 * which we have in r12.  The [h]rfid at the end of the
+	 * We do that by jumping to absolute address 0x500 for
+	 * external interrupts, or the machine_check_fwnmi label
+	 * for machine checks (since firmware might have patched
+	 * the vector area at 0x200).  The [h]rfid at the end of the
 	 * handler will return to the book3s_hv_interrupts.S code.
 	 * For other interrupts we do the rfid to get back
-	 * to the book3s_interrupts.S code here.
+	 * to the book3s_hv_interrupts.S code here.
 	 */
 	ld	r8, HSTATE_VMHANDLER(r13)
 	ld	r7, HSTATE_HOST_MSR(r13)
 
+	cmpwi	cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+BEGIN_FTR_SECTION
 	beq	11f
-	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* RFI into the highmem handler, or branch to interrupt handler */
-12:	mfmsr	r6
-	mtctr	r12
+	mfmsr	r6
 	li	r0, MSR_RI
 	andc	r6, r6, r0
 	mtmsrd	r6, 1			/* Clear RI in MSR */
 	mtsrr0	r8
 	mtsrr1	r7
-	beqctr
+	beqa	0x500			/* external interrupt (PPC970) */
+	beq	cr1, 13f		/* machine check */
 	RFI
 
-11:
-BEGIN_FTR_SECTION
-	b	12b
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
-	mtspr	SPRN_HSRR0, r8
+	/* On POWER7, we have external interrupts set to use HSRR0/1 */
+11:	mtspr	SPRN_HSRR0, r8
 	mtspr	SPRN_HSRR1, r7
 	ba	0x500
 
+13:	b	machine_check_fwnmi
+
 /*
  * Check whether an HDSI is an HPTE not found fault or something else.
  * If it is an HPTE not found fault that is due to the guest accessing
@@ -1177,7 +1184,7 @@ kvmppc_hdsi:
 	cmpdi	r3, 0			/* retry the instruction */
 	beq	6f
 	cmpdi	r3, -1			/* handle in kernel mode */
-	beq	nohpte_cont
+	beq	guest_exit_cont
 	cmpdi	r3, -2			/* MMIO emulation; need instr word */
 	beq	2f
 
@@ -1191,6 +1198,7 @@ kvmppc_hdsi:
 	li	r10, BOOK3S_INTERRUPT_DATA_STORAGE
 	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
 	rotldi	r11, r11, 63
+fast_interrupt_c_return:
 6:	ld	r7, VCPU_CTR(r9)
 	lwz	r8, VCPU_XER(r9)
 	mtctr	r7
@@ -1223,7 +1231,7 @@ kvmppc_hdsi:
 	/* Unset guest mode. */
 	li	r0, KVM_GUEST_MODE_NONE
 	stb	r0, HSTATE_IN_GUEST(r13)
-	b	nohpte_cont
+	b	guest_exit_cont
 
 /*
  * Similarly for an HISI, reflect it to the guest as an ISI unless
@@ -1249,9 +1257,9 @@ kvmppc_hisi:
 	ld	r11, VCPU_MSR(r9)
 	li	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
 	cmpdi	r3, 0			/* retry the instruction */
-	beq	6f
+	beq	fast_interrupt_c_return
 	cmpdi	r3, -1			/* handle in kernel mode */
-	beq	nohpte_cont
+	beq	guest_exit_cont
 
 	/* Synthesize an ISI for the guest */
 	mr	r11, r3
@@ -1260,12 +1268,7 @@ kvmppc_hisi:
 	li	r10, BOOK3S_INTERRUPT_INST_STORAGE
 	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
 	rotldi	r11, r11, 63
-6:	ld	r7, VCPU_CTR(r9)
-	lwz	r8, VCPU_XER(r9)
-	mtctr	r7
-	mtxer	r8
-	mr	r4, r9
-	b	fast_guest_return
+	b	fast_interrupt_c_return
 
 3:	ld	r6, VCPU_KVM(r9)	/* not relocated, use VRMA */
 	ld	r5, KVM_VRMA_SLB_V(r6)
@@ -1281,14 +1284,14 @@ kvmppc_hisi:
 hcall_try_real_mode:
 	ld	r3,VCPU_GPR(R3)(r9)
 	andi.	r0,r11,MSR_PR
-	bne	hcall_real_cont
+	bne	guest_exit_cont
 	clrrdi	r3,r3,2
 	cmpldi	r3,hcall_real_table_end - hcall_real_table
-	bge	hcall_real_cont
+	bge	guest_exit_cont
 	LOAD_REG_ADDR(r4, hcall_real_table)
 	lwzx	r3,r3,r4
 	cmpwi	r3,0
-	beq	hcall_real_cont
+	beq	guest_exit_cont
 	add	r3,r3,r4
 	mtctr	r3
 	mr	r3,r9		/* get vcpu pointer */
@@ -1309,7 +1312,7 @@ hcall_real_fallback:
 	li	r12,BOOK3S_INTERRUPT_SYSCALL
 	ld	r9, HSTATE_KVM_VCPU(r13)
 
-	b	hcall_real_cont
+	b	guest_exit_cont
 
 	.globl	hcall_real_table
 hcall_real_table:
@@ -1568,6 +1571,21 @@ kvm_cede_exit:
 	li	r3,H_TOO_HARD
 	blr
 
+	/* Try to handle a machine check in real mode */
+machine_check_realmode:
+	mr	r3, r9		/* get vcpu pointer */
+	bl	.kvmppc_realmode_machine_check
+	nop
+	cmpdi	r3, 0		/* continue exiting from guest? */
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	mc_cont
+	/* If not, deliver a machine check.  SRR0/1 are already set */
+	li	r10, BOOK3S_INTERRUPT_MACHINE_CHECK
+	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r11, r11, 63
+	b	fast_interrupt_c_return
+
 secondary_too_late:
 	ld	r5,HSTATE_KVM_VCORE(r13)
 	HMT_LOW
@@ -1587,6 +1605,10 @@ secondary_too_late:
 	.endr
 
 secondary_nap:
+	/* Clear our vcpu pointer so we don't come back in early */
+	li	r0, 0
+	std	r0, HSTATE_KVM_VCPU(r13)
+	lwsync
 	/* Clear any pending IPI - assume we're a secondary thread */
 	ld	r5, HSTATE_XICS_PHYS(r13)
 	li	r7, XICS_XIRR
@@ -1612,8 +1634,6 @@ secondary_nap:
 kvm_no_guest:
 	li	r0, KVM_HWTHREAD_IN_NAP
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
-	li	r0, 0
-	std	r0, HSTATE_KVM_VCPU(r13)
 
 	li	r3, LPCR_PECE0
 	mfspr	r4, SPRN_LPCR
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 41cb0017e75..2c86b0d6371 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -114,11 +114,6 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 	hlist_del_init_rcu(&pte->list_vpte);
 	hlist_del_init_rcu(&pte->list_vpte_long);
 
-	if (pte->pte.may_write)
-		kvm_release_pfn_dirty(pte->pfn);
-	else
-		kvm_release_pfn_clean(pte->pfn);
-
 	spin_unlock(&vcpu3s->mmu_lock);
 
 	vcpu3s->hpte_cache_count--;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 05c28f59f77..28d38adeca7 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -52,8 +52,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #define MSR_USER32 MSR_USER
 #define MSR_USER64 MSR_USER
 #define HW_PAGE_SIZE PAGE_SIZE
-#define __hard_irq_disable local_irq_disable
-#define __hard_irq_enable local_irq_enable
 #endif
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -66,7 +64,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
 	svcpu_put(svcpu);
 #endif
-
+	vcpu->cpu = smp_processor_id();
 #ifdef CONFIG_PPC_BOOK3S_32
 	current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
 #endif
@@ -83,17 +81,71 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 	svcpu_put(svcpu);
 #endif
 
-	kvmppc_giveup_ext(vcpu, MSR_FP);
-	kvmppc_giveup_ext(vcpu, MSR_VEC);
-	kvmppc_giveup_ext(vcpu, MSR_VSX);
+	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+	vcpu->cpu = -1;
+}
+
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
+{
+	int r = 1; /* Indicate we want to get back into the guest */
+
+	/* We misuse TLB_FLUSH to indicate that we want to clear
+	   all shadow cache entries */
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	return r;
+}
+
+/************* MMU Notifiers *************/
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	trace_kvm_unmap_hva(hva);
+
+	/*
+	 * Flush all shadow tlb entries everywhere. This is slow, but
+	 * we are 100% sure that we catch the to be unmapped page
+	 */
+	kvm_flush_remote_tlbs(kvm);
+
+	return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	/* kvm_unmap_hva flushes everything anyways */
+	kvm_unmap_hva(kvm, start);
+
+	return 0;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	/* XXX could be more clever ;) */
+	return 0;
 }
 
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	/* The page will get remapped properly on its next fault */
+	kvm_unmap_hva(kvm, hva);
+}
+
+/*****************************************/
+
 static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 {
 	ulong smsr = vcpu->arch.shared->msr;
 
 	/* Guest MSR values */
-	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
 	/* Process MSR values */
 	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
 	/* External providers the guest reserved */
@@ -379,10 +431,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 static inline int get_fpr_index(int i)
 {
-#ifdef CONFIG_VSX
-	i *= 2;
-#endif
-	return i;
+	return i * TS_FPRWIDTH;
 }
 
 /* Give up external provider (FPU, Altivec, VSX) */
@@ -396,41 +445,49 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
 	u64 *thread_fpr = (u64*)t->fpr;
 	int i;
 
-	if (!(vcpu->arch.guest_owned_ext & msr))
+	/*
+	 * VSX instructions can access FP and vector registers, so if
+	 * we are giving up VSX, make sure we give up FP and VMX as well.
+	 */
+	if (msr & MSR_VSX)
+		msr |= MSR_FP | MSR_VEC;
+
+	msr &= vcpu->arch.guest_owned_ext;
+	if (!msr)
 		return;
 
 #ifdef DEBUG_EXT
 	printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
 #endif
 
-	switch (msr) {
-	case MSR_FP:
+	if (msr & MSR_FP) {
+		/*
+		 * Note that on CPUs with VSX, giveup_fpu stores
+		 * both the traditional FP registers and the added VSX
+		 * registers into thread.fpr[].
+		 */
 		giveup_fpu(current);
 		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
 			vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
 
 		vcpu->arch.fpscr = t->fpscr.val;
-		break;
-	case MSR_VEC:
+
+#ifdef CONFIG_VSX
+		if (cpu_has_feature(CPU_FTR_VSX))
+			for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
+				vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
+#endif
+	}
+
 #ifdef CONFIG_ALTIVEC
+	if (msr & MSR_VEC) {
 		giveup_altivec(current);
 		memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
 		vcpu->arch.vscr = t->vscr;
-#endif
-		break;
-	case MSR_VSX:
-#ifdef CONFIG_VSX
-		__giveup_vsx(current);
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-			vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
-#endif
-		break;
-	default:
-		BUG();
 	}
+#endif
 
-	vcpu->arch.guest_owned_ext &= ~msr;
-	current->thread.regs->msr &= ~msr;
+	vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX);
 	kvmppc_recalc_shadow_msr(vcpu);
 }
 
@@ -490,47 +547,56 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 		return RESUME_GUEST;
 	}
 
-	/* We already own the ext */
-	if (vcpu->arch.guest_owned_ext & msr) {
-		return RESUME_GUEST;
+	if (msr == MSR_VSX) {
+		/* No VSX?  Give an illegal instruction interrupt */
+#ifdef CONFIG_VSX
+		if (!cpu_has_feature(CPU_FTR_VSX))
+#endif
+		{
+			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+			return RESUME_GUEST;
+		}
+
+		/*
+		 * We have to load up all the FP and VMX registers before
+		 * we can let the guest use VSX instructions.
+		 */
+		msr = MSR_FP | MSR_VEC | MSR_VSX;
 	}
 
+	/* See if we already own all the ext(s) needed */
+	msr &= ~vcpu->arch.guest_owned_ext;
+	if (!msr)
+		return RESUME_GUEST;
+
 #ifdef DEBUG_EXT
 	printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
 #endif
 
 	current->thread.regs->msr |= msr;
 
-	switch (msr) {
-	case MSR_FP:
+	if (msr & MSR_FP) {
 		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
 			thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
-
+#ifdef CONFIG_VSX
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
+			thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
+#endif
 		t->fpscr.val = vcpu->arch.fpscr;
 		t->fpexc_mode = 0;
 		kvmppc_load_up_fpu();
-		break;
-	case MSR_VEC:
+	}
+
+	if (msr & MSR_VEC) {
 #ifdef CONFIG_ALTIVEC
 		memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
 		t->vscr = vcpu->arch.vscr;
 		t->vrsave = -1;
 		kvmppc_load_up_altivec();
 #endif
-		break;
-	case MSR_VSX:
-#ifdef CONFIG_VSX
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-			thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
-		kvmppc_load_up_vsx();
-#endif
-		break;
-	default:
-		BUG();
 	}
 
 	vcpu->arch.guest_owned_ext |= msr;
-
 	kvmppc_recalc_shadow_msr(vcpu);
 
 	return RESUME_GUEST;
@@ -540,18 +606,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        unsigned int exit_nr)
 {
 	int r = RESUME_HOST;
+	int s;
 
 	vcpu->stat.sum_exits++;
 
 	run->exit_reason = KVM_EXIT_UNKNOWN;
 	run->ready_for_interrupt_injection = 1;
 
-	/* We get here with MSR.EE=0, so enable it to be a nice citizen */
-	__hard_irq_enable();
+	/* We get here with MSR.EE=1 */
+
+	trace_kvm_exit(exit_nr, vcpu);
+	kvm_guest_exit();
 
-	trace_kvm_book3s_exit(exit_nr, vcpu);
-	preempt_enable();
-	kvm_resched(vcpu);
 	switch (exit_nr) {
 	case BOOK3S_INTERRUPT_INST_STORAGE:
 	{
@@ -802,7 +868,6 @@ program_interrupt:
 	}
 	}
 
-	preempt_disable();
 	if (!(r & RESUME_HOST)) {
 		/* To avoid clobbering exit_reason, only check for signals if
 		 * we aren't already exiting to userspace for some other
@@ -814,20 +879,13 @@ program_interrupt:
 		 * and if we really did time things so badly, then we just exit
 		 * again due to a host external interrupt.
 		 */
-		__hard_irq_disable();
-		if (signal_pending(current)) {
-			__hard_irq_enable();
-#ifdef EXIT_DEBUG
-			printk(KERN_EMERG "KVM: Going back to host\n");
-#endif
-			vcpu->stat.signal_exits++;
-			run->exit_reason = KVM_EXIT_INTR;
-			r = -EINTR;
+		local_irq_disable();
+		s = kvmppc_prepare_to_enter(vcpu);
+		if (s <= 0) {
+			local_irq_enable();
+			r = s;
 		} else {
-			/* In case an interrupt came in that was triggered
-			 * from userspace (like DEC), we need to check what
-			 * to inject now! */
-			kvmppc_core_prepare_to_enter(vcpu);
+			kvmppc_lazy_ee_enable();
 		}
 	}
 
@@ -899,34 +957,59 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
 {
-	int r = -EINVAL;
+	int r = 0;
 
-	switch (reg->id) {
+	switch (id) {
 	case KVM_REG_PPC_HIOR:
-		r = copy_to_user((u64 __user *)(long)reg->addr,
-				&to_book3s(vcpu)->hior, sizeof(u64));
+		*val = get_reg_val(id, to_book3s(vcpu)->hior);
 		break;
+#ifdef CONFIG_VSX
+	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
+		long int i = id - KVM_REG_PPC_VSR0;
+
+		if (!cpu_has_feature(CPU_FTR_VSX)) {
+			r = -ENXIO;
+			break;
+		}
+		val->vsxval[0] = vcpu->arch.fpr[i];
+		val->vsxval[1] = vcpu->arch.vsr[i];
+		break;
+	}
+#endif /* CONFIG_VSX */
 	default:
+		r = -EINVAL;
 		break;
 	}
 
 	return r;
 }
 
-int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
 {
-	int r = -EINVAL;
+	int r = 0;
 
-	switch (reg->id) {
+	switch (id) {
 	case KVM_REG_PPC_HIOR:
-		r = copy_from_user(&to_book3s(vcpu)->hior,
-				   (u64 __user *)(long)reg->addr, sizeof(u64));
-		if (!r)
-			to_book3s(vcpu)->hior_explicit = true;
+		to_book3s(vcpu)->hior = set_reg_val(id, *val);
+		to_book3s(vcpu)->hior_explicit = true;
+		break;
+#ifdef CONFIG_VSX
+	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
+		long int i = id - KVM_REG_PPC_VSR0;
+
+		if (!cpu_has_feature(CPU_FTR_VSX)) {
+			r = -ENXIO;
+			break;
+		}
+		vcpu->arch.fpr[i] = val->vsxval[0];
+		vcpu->arch.vsr[i] = val->vsxval[1];
 		break;
+	}
+#endif /* CONFIG_VSX */
 	default:
+		r = -EINVAL;
 		break;
 	}
 
@@ -1020,8 +1103,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif
 	ulong ext_msr;
 
-	preempt_disable();
-
 	/* Check if we can run the vcpu at all */
 	if (!vcpu->arch.sane) {
 		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1029,21 +1110,16 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
-	kvmppc_core_prepare_to_enter(vcpu);
-
 	/*
 	 * Interrupts could be timers for the guest which we have to inject
 	 * again, so let's postpone them until we're in the guest and if we
 	 * really did time things so badly, then we just exit again due to
 	 * a host external interrupt.
 	 */
-	__hard_irq_disable();
-
-	/* No need to go into the guest when all we do is going out */
-	if (signal_pending(current)) {
-		__hard_irq_enable();
-		kvm_run->exit_reason = KVM_EXIT_INTR;
-		ret = -EINTR;
+	local_irq_disable();
+	ret = kvmppc_prepare_to_enter(vcpu);
+	if (ret <= 0) {
+		local_irq_enable();
 		goto out;
 	}
 
@@ -1070,7 +1146,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	/* Save VSX state in stack */
 	used_vsr = current->thread.used_vsr;
 	if (used_vsr && (current->thread.regs->msr & MSR_VSX))
-			__giveup_vsx(current);
+		__giveup_vsx(current);
 #endif
 
 	/* Remember the MSR with disabled extensions */
@@ -1080,20 +1156,19 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 
-	kvm_guest_enter();
+	kvmppc_lazy_ee_enable();
 
 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
 
-	kvm_guest_exit();
-
-	current->thread.regs->msr = ext_msr;
+	/* No need for kvm_guest_exit. It's done in handle_exit.
+	   We also get here with interrupts enabled. */
 
 	/* Make sure we save the guest FPU/Altivec/VSX state */
-	kvmppc_giveup_ext(vcpu, MSR_FP);
-	kvmppc_giveup_ext(vcpu, MSR_VEC);
-	kvmppc_giveup_ext(vcpu, MSR_VSX);
+	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+
+	current->thread.regs->msr = ext_msr;
 
-	/* Restore FPU state from stack */
+	/* Restore FPU/VSX state from stack */
 	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
 	current->thread.fpscr.val = fpscr;
 	current->thread.fpexc_mode = fpexc_mode;
@@ -1113,7 +1188,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif
 
 out:
-	preempt_enable();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
 	return ret;
 }
 
@@ -1181,14 +1256,31 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
 }
 #endif /* CONFIG_PPC64 */
 
+void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+			      struct kvm_memory_slot *dont)
+{
+}
+
+int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+			       unsigned long npages)
+{
+	return 0;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
 				      struct kvm_userspace_memory_region *mem)
 {
 	return 0;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem)
+				struct kvm_userspace_memory_region *mem,
+				struct kvm_memory_slot old)
+{
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
 }
 
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 9ecf6e35cd8..8f7633e3afb 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -170,20 +170,21 @@ kvmppc_handler_skip_ins:
  * Call kvmppc_handler_trampoline_enter in real mode
  *
  * On entry, r4 contains the guest shadow MSR
+ * MSR.EE has to be 0 when calling this function
  */
 _GLOBAL(kvmppc_entry_trampoline)
 	mfmsr	r5
 	LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
 	toreal(r7)
 
-	li	r9, MSR_RI
-	ori	r9, r9, MSR_EE
-	andc	r9, r5, r9	/* Clear EE and RI in MSR value */
 	li	r6, MSR_IR | MSR_DR
-	ori	r6, r6, MSR_EE
-	andc	r6, r5, r6	/* Clear EE, DR and IR in MSR value */
-	MTMSR_EERI(r9)		/* Clear EE and RI in MSR */
-	mtsrr0	r7		/* before we set srr0/1 */
+	andc	r6, r5, r6	/* Clear DR and IR in MSR value */
+	/*
+	 * Set EE in HOST_MSR so that it's enabled when we get into our
+	 * C exit handler function
+	 */
+	ori	r5, r5, MSR_EE
+	mtsrr0	r7
 	mtsrr1	r6
 	RFI
 
@@ -233,8 +234,5 @@ define_load_up(fpu)
 #ifdef CONFIG_ALTIVEC
 define_load_up(altivec)
 #endif
-#ifdef CONFIG_VSX
-define_load_up(vsx)
-#endif
 
 #include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097c852..69f11401578 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -36,9 +36,11 @@
 #include <asm/dbell.h>
 #include <asm/hw_irq.h>
 #include <asm/irq.h>
+#include <asm/time.h>
 
 #include "timing.h"
 #include "booke.h"
+#include "trace.h"
 
 unsigned long kvmppc_booke_handlers;
 
@@ -62,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "doorbell", VCPU_STAT(dbell_exits) },
 	{ "guest doorbell", VCPU_STAT(gdbell_exits) },
+	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ NULL }
 };
 
@@ -120,6 +123,16 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
 }
 #endif
 
+static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV)
+	/* We always treat the FP bit as enabled from the host
+	   perspective, so only need to adjust the shadow MSR */
+	vcpu->arch.shadow_msr &= ~MSR_FP;
+	vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP;
+#endif
+}
+
 /*
  * Helper function for "full" MSR writes.  No need to call this if only
  * EE/CE/ME/DE/RI are changing.
@@ -136,11 +149,13 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 
 	kvmppc_mmu_msr_notify(vcpu, old_msr);
 	kvmppc_vcpu_sync_spe(vcpu);
+	kvmppc_vcpu_sync_fpu(vcpu);
 }
 
 static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
                                        unsigned int priority)
 {
+	trace_kvm_booke_queue_irqprio(vcpu, priority);
 	set_bit(priority, &vcpu->arch.pending_exceptions);
 }
 
@@ -206,6 +221,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -287,6 +312,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	bool crit;
 	bool keep_irq = false;
 	enum int_class int_class;
+	ulong new_msr = vcpu->arch.shared->msr;
 
 	/* Truncate crit indicators in 32 bit mode */
 	if (!(vcpu->arch.shared->msr & MSR_SF)) {
@@ -325,6 +351,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -381,7 +408,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 			set_guest_esr(vcpu, vcpu->arch.queued_esr);
 		if (update_dear == true)
 			set_guest_dear(vcpu, vcpu->arch.queued_dear);
-		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
+
+		new_msr &= msr_mask;
+#if defined(CONFIG_64BIT)
+		if (vcpu->arch.epcr & SPRN_EPCR_ICM)
+			new_msr |= MSR_CM;
+#endif
+		kvmppc_set_msr(vcpu, new_msr);
 
 		if (!keep_irq)
 			clear_bit(priority, &vcpu->arch.pending_exceptions);
@@ -404,12 +437,121 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	return allowed;
 }
 
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
+ * because the larger value can break the timer APIs.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	u64 tb, wdt_tb, wdt_ticks = 0;
+	u64 nr_jiffies = 0;
+	u32 period = TCR_GET_WP(vcpu->arch.tcr);
+
+	wdt_tb = 1ULL << (63 - period);
+	tb = get_tb();
+	/*
+	 * The watchdog timeout will hapeen when TB bit corresponding
+	 * to watchdog will toggle from 0 to 1.
+	 */
+	if (tb & wdt_tb)
+		wdt_ticks = wdt_tb;
+
+	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
+
+	/* Convert timebase ticks to jiffies */
+	nr_jiffies = wdt_ticks;
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+	unsigned long flags;
+
+	/*
+	 * If TSR_ENW and TSR_WIS are not set then no need to exit to
+	 * userspace, so clear the KVM_REQ_WATCHDOG request.
+	 */
+	if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS))
+		clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests);
+
+	spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	/*
+	 * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA
+	 * then do not run the watchdog timer as this can break timer APIs.
+	 */
+	if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS)
+				final = 1;
+			else
+				new_tsr = tsr | TSR_WIS;
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & TSR_WIS) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * If this is final watchdog expiry and some action is required
+	 * then exit to userspace.
+	 */
+	if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
+	    vcpu->arch.watchdog_enabled) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Stop running the watchdog timer after final expiration to
+	 * prevent the host from being flooded with timers if the
+	 * guest sets a short period.
+	 * Timers will resume when TSR/TCR is updated next time.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
+
 static void update_timer_ints(struct kvm_vcpu *vcpu)
 {
 	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
 		kvmppc_core_queue_dec(vcpu);
 	else
 		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
 }
 
 static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
@@ -417,13 +559,6 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
 	unsigned int priority;
 
-	if (vcpu->requests) {
-		if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
-			smp_mb();
-			update_timer_ints(vcpu);
-		}
-	}
-
 	priority = __ffs(*pending);
 	while (priority < BOOKE_IRQPRIO_MAX) {
 		if (kvmppc_booke_irqprio_deliver(vcpu, priority))
@@ -459,37 +594,20 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-/*
- * Common checks before entering the guest world.  Call with interrupts
- * disabled.
- *
- * returns !0 if a signal is pending and check_signal is true
- */
-static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
 {
-	int r = 0;
+	int r = 1; /* Indicate we want to get back into the guest */
 
-	WARN_ON_ONCE(!irqs_disabled());
-	while (true) {
-		if (need_resched()) {
-			local_irq_enable();
-			cond_resched();
-			local_irq_disable();
-			continue;
-		}
-
-		if (signal_pending(current)) {
-			r = 1;
-			break;
-		}
-
-		if (kvmppc_core_prepare_to_enter(vcpu)) {
-			/* interrupts got enabled in between, so we
-			   are back at square 1 */
-			continue;
-		}
+	if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu))
+		update_timer_ints(vcpu);
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+		kvmppc_core_flush_tlb(vcpu);
+#endif
 
-		break;
+	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) {
+		vcpu->run->exit_reason = KVM_EXIT_WATCHDOG;
+		r = 0;
 	}
 
 	return r;
@@ -497,7 +615,7 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 
 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-	int ret;
+	int ret, s;
 #ifdef CONFIG_PPC_FPU
 	unsigned int fpscr;
 	int fpexc_mode;
@@ -510,11 +628,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	}
 
 	local_irq_disable();
-	if (kvmppc_prepare_to_enter(vcpu)) {
-		kvm_run->exit_reason = KVM_EXIT_INTR;
-		ret = -EINTR;
+	s = kvmppc_prepare_to_enter(vcpu);
+	if (s <= 0) {
+		local_irq_enable();
+		ret = s;
 		goto out;
 	}
+	kvmppc_lazy_ee_enable();
 
 	kvm_guest_enter();
 
@@ -542,6 +662,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
 
+	/* No need for kvm_guest_exit. It's done in handle_exit.
+	   We also get here with interrupts enabled. */
+
 #ifdef CONFIG_PPC_FPU
 	kvmppc_save_guest_fp(vcpu);
 
@@ -557,10 +680,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	current->thread.fpexc_mode = fpexc_mode;
 #endif
 
-	kvm_guest_exit();
-
 out:
-	local_irq_enable();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
 	return ret;
 }
 
@@ -668,6 +789,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        unsigned int exit_nr)
 {
 	int r = RESUME_HOST;
+	int s;
 
 	/* update before a new last_exit_type is rewritten */
 	kvmppc_update_timing_stats(vcpu);
@@ -677,6 +799,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	local_irq_enable();
 
+	trace_kvm_exit(exit_nr, vcpu);
+	kvm_guest_exit();
+
 	run->exit_reason = KVM_EXIT_UNKNOWN;
 	run->ready_for_interrupt_injection = 1;
 
@@ -971,10 +1096,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 */
 	if (!(r & RESUME_HOST)) {
 		local_irq_disable();
-		if (kvmppc_prepare_to_enter(vcpu)) {
-			run->exit_reason = KVM_EXIT_INTR;
-			r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
-			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
+		s = kvmppc_prepare_to_enter(vcpu);
+		if (s <= 0) {
+			local_irq_enable();
+			r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
+		} else {
+			kvmppc_lazy_ee_enable();
 		}
 	}
 
@@ -1011,6 +1138,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	/* setup watchdog timer once */
+	spin_lock_init(&vcpu->arch.wdt_lock);
+	setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
+		    (unsigned long)vcpu);
+
+	return 0;
+}
+
+void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	del_timer_sync(&vcpu->arch.wdt_timer);
+}
+
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
@@ -1106,7 +1248,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 	}
 
 	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
+		u32 old_tsr = vcpu->arch.tsr;
+
 		vcpu->arch.tsr = sregs->u.e.tsr;
+
+		if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
+			arm_next_watchdog(vcpu);
+
 		update_timer_ints(vcpu);
 	}
 
@@ -1221,12 +1369,70 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 {
-	return -EINVAL;
+	int r = -EINVAL;
+
+	switch (reg->id) {
+	case KVM_REG_PPC_IAC1:
+	case KVM_REG_PPC_IAC2:
+	case KVM_REG_PPC_IAC3:
+	case KVM_REG_PPC_IAC4: {
+		int iac = reg->id - KVM_REG_PPC_IAC1;
+		r = copy_to_user((u64 __user *)(long)reg->addr,
+				 &vcpu->arch.dbg_reg.iac[iac], sizeof(u64));
+		break;
+	}
+	case KVM_REG_PPC_DAC1:
+	case KVM_REG_PPC_DAC2: {
+		int dac = reg->id - KVM_REG_PPC_DAC1;
+		r = copy_to_user((u64 __user *)(long)reg->addr,
+				 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
+		break;
+	}
+#if defined(CONFIG_64BIT)
+	case KVM_REG_PPC_EPCR:
+		r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
+		break;
+#endif
+	default:
+		break;
+	}
+	return r;
 }
 
 int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 {
-	return -EINVAL;
+	int r = -EINVAL;
+
+	switch (reg->id) {
+	case KVM_REG_PPC_IAC1:
+	case KVM_REG_PPC_IAC2:
+	case KVM_REG_PPC_IAC3:
+	case KVM_REG_PPC_IAC4: {
+		int iac = reg->id - KVM_REG_PPC_IAC1;
+		r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac],
+			     (u64 __user *)(long)reg->addr, sizeof(u64));
+		break;
+	}
+	case KVM_REG_PPC_DAC1:
+	case KVM_REG_PPC_DAC2: {
+		int dac = reg->id - KVM_REG_PPC_DAC1;
+		r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac],
+			     (u64 __user *)(long)reg->addr, sizeof(u64));
+		break;
+	}
+#if defined(CONFIG_64BIT)
+	case KVM_REG_PPC_EPCR: {
+		u32 new_epcr;
+		r = get_user(new_epcr, (u32 __user *)(long)reg->addr);
+		if (r == 0)
+			kvmppc_set_epcr(vcpu, new_epcr);
+		break;
+	}
+#endif
+	default:
+		break;
+	}
+	return r;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
@@ -1253,20 +1459,50 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return -ENOTSUPP;
 }
 
+void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+			      struct kvm_memory_slot *dont)
+{
+}
+
+int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+			       unsigned long npages)
+{
+	return 0;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
 				      struct kvm_userspace_memory_region *mem)
 {
 	return 0;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem)
+				struct kvm_userspace_memory_region *mem,
+				struct kvm_memory_slot old)
+{
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+}
+
+void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr)
 {
+#if defined(CONFIG_64BIT)
+	vcpu->arch.epcr = new_epcr;
+#ifdef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM;
+	if (vcpu->arch.epcr  & SPRN_EPCR_ICM)
+		vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM;
+#endif
+#endif
 }
 
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
+	arm_next_watchdog(vcpu);
 	update_timer_ints(vcpu);
 }
 
@@ -1281,6 +1517,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
 {
 	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 */
+	if (tsr_bits & (TSR_ENW | TSR_WIS))
+		arm_next_watchdog(vcpu);
+
 	update_timer_ints(vcpu);
 }
 
@@ -1298,12 +1542,14 @@ void kvmppc_decrementer_func(unsigned long data)
 
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	vcpu->cpu = smp_processor_id();
 	current->thread.kvm_vcpu = vcpu;
 }
 
 void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	current->thread.kvm_vcpu = NULL;
+	vcpu->cpu = -1;
 }
 
 int __init kvmppc_booke_init(void)
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index ba61974c1e2..e9b88e433f6 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -69,6 +69,7 @@ extern unsigned long kvmppc_booke_handlers;
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
 void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
 
+void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr);
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr);
 void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb608a..4685b8cf224 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -133,10 +133,10 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		vcpu->arch.csrr1 = spr_val;
 		break;
 	case SPRN_DBCR0:
-		vcpu->arch.dbcr0 = spr_val;
+		vcpu->arch.dbg_reg.dbcr0 = spr_val;
 		break;
 	case SPRN_DBCR1:
-		vcpu->arch.dbcr1 = spr_val;
+		vcpu->arch.dbg_reg.dbcr1 = spr_val;
 		break;
 	case SPRN_DBSR:
 		vcpu->arch.dbsr &= ~spr_val;
@@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		kvmppc_clr_tsr_bits(vcpu, spr_val);
 		break;
 	case SPRN_TCR:
+		/*
+		 * WRC is a 2-bit field that is supposed to preserve its
+		 * value once written to non-zero.
+		 */
+		if (vcpu->arch.tcr & TCR_WRC_MASK) {
+			spr_val &= ~TCR_WRC_MASK;
+			spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
+		}
 		kvmppc_set_tcr(vcpu, spr_val);
 		break;
 
@@ -229,7 +237,17 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 	case SPRN_IVOR15:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;
 		break;
-
+	case SPRN_MCSR:
+		vcpu->arch.mcsr &= ~spr_val;
+		break;
+#if defined(CONFIG_64BIT)
+	case SPRN_EPCR:
+		kvmppc_set_epcr(vcpu, spr_val);
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
+#endif
+		break;
+#endif
 	default:
 		emulated = EMULATE_FAIL;
 	}
@@ -258,10 +276,10 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 		*spr_val = vcpu->arch.csrr1;
 		break;
 	case SPRN_DBCR0:
-		*spr_val = vcpu->arch.dbcr0;
+		*spr_val = vcpu->arch.dbg_reg.dbcr0;
 		break;
 	case SPRN_DBCR1:
-		*spr_val = vcpu->arch.dbcr1;
+		*spr_val = vcpu->arch.dbg_reg.dbcr1;
 		break;
 	case SPRN_DBSR:
 		*spr_val = vcpu->arch.dbsr;
@@ -321,6 +339,14 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 	case SPRN_IVOR15:
 		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
 		break;
+	case SPRN_MCSR:
+		*spr_val = vcpu->arch.mcsr;
+		break;
+#if defined(CONFIG_64BIT)
+	case SPRN_EPCR:
+		*spr_val = vcpu->arch.epcr;
+		break;
+#endif
 
 	default:
 		emulated = EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 099fe8272b5..e8ed7d659c5 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -16,6 +16,7 @@
  *
  * Author: Varun Sethi <varun.sethi@freescale.com>
  * Author: Scott Wood <scotwood@freescale.com>
+ * Author: Mihai Caraman <mihai.caraman@freescale.com>
  *
  * This file is derived from arch/powerpc/kvm/booke_interrupts.S
  */
@@ -30,31 +31,33 @@
 #include <asm/bitsperlong.h>
 #include <asm/thread_info.h>
 
+#ifdef CONFIG_64BIT
+#include <asm/exception-64e.h>
+#else
 #include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
-
-#define GET_VCPU(vcpu, thread)	\
-	PPC_LL	vcpu, THREAD_KVM_VCPU(thread)
+#endif
 
 #define LONGBYTES		(BITS_PER_LONG / 8)
 
 #define VCPU_GUEST_SPRG(n)	(VCPU_GUEST_SPRGS + (n * LONGBYTES))
 
 /* The host stack layout: */
-#define HOST_R1         (0 * LONGBYTES) /* Implied by stwu. */
-#define HOST_CALLEE_LR  (1 * LONGBYTES)
-#define HOST_RUN        (2 * LONGBYTES) /* struct kvm_run */
+#define HOST_R1         0 /* Implied by stwu. */
+#define HOST_CALLEE_LR  PPC_LR_STKOFF
+#define HOST_RUN        (HOST_CALLEE_LR + LONGBYTES)
 /*
  * r2 is special: it holds 'current', and it made nonvolatile in the
  * kernel with the -ffixed-r2 gcc option.
  */
-#define HOST_R2         (3 * LONGBYTES)
-#define HOST_CR         (4 * LONGBYTES)
-#define HOST_NV_GPRS    (5 * LONGBYTES)
+#define HOST_R2         (HOST_RUN + LONGBYTES)
+#define HOST_CR         (HOST_R2 + LONGBYTES)
+#define HOST_NV_GPRS    (HOST_CR + LONGBYTES)
 #define __HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
 #define HOST_NV_GPR(n)  __HOST_NV_GPR(__REG_##n)
 #define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES)
 #define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
-#define HOST_STACK_LR   (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */
+/* LR in caller stack frame. */
+#define HOST_STACK_LR	(HOST_STACK_SIZE + PPC_LR_STKOFF)
 
 #define NEED_EMU		0x00000001 /* emulation -- save nv regs */
 #define NEED_DEAR		0x00000002 /* save faulting DEAR */
@@ -201,12 +204,128 @@
 	b	kvmppc_resume_host
 .endm
 
+#ifdef CONFIG_64BIT
+/* Exception types */
+#define EX_GEN			1
+#define EX_GDBELL		2
+#define EX_DBG			3
+#define EX_MC			4
+#define EX_CRIT			5
+#define EX_TLB			6
+
+/*
+ * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
+ */
+.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags
+ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	mr	r11, r4
+	/*
+	 * Get vcpu from Paca: paca->__current.thread->kvm_vcpu
+	 */
+	PPC_LL	r4, PACACURRENT(r13)
+	PPC_LL	r4, (THREAD + THREAD_KVM_VCPU)(r4)
+	stw	r10, VCPU_CR(r4)
+	PPC_STL r11, VCPU_GPR(R4)(r4)
+	PPC_STL	r5, VCPU_GPR(R5)(r4)
+	.if \type == EX_CRIT
+	PPC_LL	r5, (\paca_ex + EX_R13)(r13)
+	.else
+	mfspr	r5, \scratch
+	.endif
+	PPC_STL	r6, VCPU_GPR(R6)(r4)
+	PPC_STL	r8, VCPU_GPR(R8)(r4)
+	PPC_STL	r9, VCPU_GPR(R9)(r4)
+	PPC_STL r5, VCPU_GPR(R13)(r4)
+	PPC_LL	r6, (\paca_ex + \ex_r10)(r13)
+	PPC_LL	r8, (\paca_ex + \ex_r11)(r13)
+	PPC_STL r3, VCPU_GPR(R3)(r4)
+	PPC_STL r7, VCPU_GPR(R7)(r4)
+	PPC_STL r12, VCPU_GPR(R12)(r4)
+	PPC_STL r6, VCPU_GPR(R10)(r4)
+	PPC_STL r8, VCPU_GPR(R11)(r4)
+	mfctr	r5
+	PPC_STL	r5, VCPU_CTR(r4)
+	mfspr	r5, \srr0
+	mfspr	r6, \srr1
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+#define EX_PARAMS(type) \
+	EX_##type, \
+	SPRN_SPRG_##type##_SCRATCH, \
+	PACA_EX##type, \
+	EX_R10, \
+	EX_R11
+
+#define EX_PARAMS_TLB \
+	EX_TLB, \
+	SPRN_SPRG_GEN_SCRATCH, \
+	PACA_EXTLB, \
+	EX_TLB_R10, \
+	EX_TLB_R11
+
+kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \
+	SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1,NEED_ESR
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\
+	SPRN_CSRR0, SPRN_CSRR1, 0
+/*
+ * Only bolted TLB miss exception handlers are supported for now
+ */
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \
+	SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
+	SPRN_DSRR0, SPRN_DSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+#else
 /*
  * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
  */
 .macro kvm_handler intno srr0, srr1, flags
 _GLOBAL(kvmppc_handler_\intno\()_\srr1)
-	GET_VCPU(r11, r10)
+	PPC_LL	r11, THREAD_KVM_VCPU(r10)
 	PPC_STL r3, VCPU_GPR(R3)(r11)
 	mfspr	r3, SPRN_SPRG_RSCRATCH0
 	PPC_STL	r4, VCPU_GPR(R4)(r11)
@@ -233,7 +352,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 .macro kvm_lvl_handler intno scratch srr0, srr1, flags
 _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 	mfspr	r10, SPRN_SPRG_THREAD
-	GET_VCPU(r11, r10)
+	PPC_LL	r11, THREAD_KVM_VCPU(r10)
 	PPC_STL r3, VCPU_GPR(R3)(r11)
 	mfspr	r3, \scratch
 	PPC_STL	r4, VCPU_GPR(R4)(r11)
@@ -295,7 +414,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
 	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
 kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
 	SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
-
+#endif
 
 /* Registers:
  *  SPRG_SCRATCH0: guest r10
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index aa8b81428bf..c70d37ed770 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -27,8 +27,7 @@
 #define E500_TLB_NUM   2
 
 #define E500_TLB_VALID 1
-#define E500_TLB_DIRTY 2
-#define E500_TLB_BITMAP 4
+#define E500_TLB_BITMAP 2
 
 struct tlbe_ref {
 	pfn_t pfn;
@@ -130,9 +129,9 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
 				ulong value);
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
-int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb);
-int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb);
-int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb);
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea);
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea);
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea);
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
 
@@ -155,7 +154,7 @@ get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
 
 static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
 {
-	return tlbe->mas2 & 0xfffff000;
+	return tlbe->mas2 & MAS2_EPN;
 }
 
 static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index e04b0ef55ce..e78f353a836 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -89,6 +89,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	int ra = get_ra(inst);
 	int rb = get_rb(inst);
 	int rt = get_rt(inst);
+	gva_t ea;
 
 	switch (get_op(inst)) {
 	case 31:
@@ -113,15 +114,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 
 		case XOP_TLBSX:
-			emulated = kvmppc_e500_emul_tlbsx(vcpu,rb);
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbsx(vcpu, ea);
 			break;
 
-		case XOP_TLBILX:
-			emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb);
+		case XOP_TLBILX: {
+			int type = rt & 0x3;
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea);
 			break;
+		}
 
 		case XOP_TLBIVAX:
-			emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb);
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
 			break;
 
 		default:
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index ff38b664195..cf3f1801237 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -304,17 +304,13 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
 	ref->flags = E500_TLB_VALID;
 
 	if (tlbe_is_writable(gtlbe))
-		ref->flags |= E500_TLB_DIRTY;
+		kvm_set_pfn_dirty(pfn);
 }
 
 static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
 {
 	if (ref->flags & E500_TLB_VALID) {
-		if (ref->flags & E500_TLB_DIRTY)
-			kvm_release_pfn_dirty(ref->pfn);
-		else
-			kvm_release_pfn_clean(ref->pfn);
-
+		trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
 		ref->flags = 0;
 	}
 }
@@ -357,6 +353,13 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
 	clear_tlb_privs(vcpu_e500);
 }
 
+void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	clear_tlb_refs(vcpu_e500);
+	clear_tlb1_bitmap(vcpu_e500);
+}
+
 static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 		unsigned int eaddr, int as)
 {
@@ -412,7 +415,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	struct tlbe_ref *ref)
 {
 	struct kvm_memory_slot *slot;
-	unsigned long pfn, hva;
+	unsigned long pfn = 0; /* silence GCC warning */
+	unsigned long hva;
 	int pfnmap = 0;
 	int tsize = BOOK3E_PAGESZ_4K;
 
@@ -521,7 +525,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	if (likely(!pfnmap)) {
 		unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
 		pfn = gfn_to_pfn_memslot(slot, gfn);
-		if (is_error_pfn(pfn)) {
+		if (is_error_noslot_pfn(pfn)) {
 			printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
 					(long)gfn);
 			return;
@@ -541,6 +545,9 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 	/* Clear i-cache for new pages */
 	kvmppc_mmu_flush_icache(pfn);
+
+	/* Drop refcount on page, so that mmu notifiers can clear it */
+	kvm_release_pfn_clean(pfn);
 }
 
 /* XXX only map the one-one case, for now use TLB0 */
@@ -682,14 +689,11 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
 	return EMULATE_DONE;
 }
 
-int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	unsigned int ia;
 	int esel, tlbsel;
-	gva_t ea;
-
-	ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb);
 
 	ia = (ea >> 2) & 0x1;
 
@@ -716,7 +720,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 }
 
 static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
-		       int pid, int rt)
+		       int pid, int type)
 {
 	struct kvm_book3e_206_tlb_entry *tlbe;
 	int tid, esel;
@@ -725,7 +729,7 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
 	for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) {
 		tlbe = get_entry(vcpu_e500, tlbsel, esel);
 		tid = get_tlb_tid(tlbe);
-		if (rt == 0 || tid == pid) {
+		if (type == 0 || tid == pid) {
 			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
 		}
@@ -733,14 +737,9 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
 }
 
 static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
-		       int ra, int rb)
+		       gva_t ea)
 {
 	int tlbsel, esel;
-	gva_t ea;
-
-	ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb);
-	if (ra)
-		ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra);
 
 	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
 		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1);
@@ -752,16 +751,16 @@ static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
 	}
 }
 
-int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb)
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	int pid = get_cur_spid(vcpu);
 
-	if (rt == 0 || rt == 1) {
-		tlbilx_all(vcpu_e500, 0, pid, rt);
-		tlbilx_all(vcpu_e500, 1, pid, rt);
-	} else if (rt == 3) {
-		tlbilx_one(vcpu_e500, pid, ra, rb);
+	if (type == 0 || type == 1) {
+		tlbilx_all(vcpu_e500, 0, pid, type);
+		tlbilx_all(vcpu_e500, 1, pid, type);
+	} else if (type == 3) {
+		tlbilx_one(vcpu_e500, pid, ea);
 	}
 
 	return EMULATE_DONE;
@@ -786,16 +785,13 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
-int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	int as = !!get_cur_sas(vcpu);
 	unsigned int pid = get_cur_spid(vcpu);
 	int esel, tlbsel;
 	struct kvm_book3e_206_tlb_entry *gtlbe = NULL;
-	gva_t ea;
-
-	ea = kvmppc_get_gpr(vcpu, rb);
 
 	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
 		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
@@ -875,6 +871,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 
 	gtlbe->mas1 = vcpu->arch.shared->mas1;
 	gtlbe->mas2 = vcpu->arch.shared->mas2;
+	if (!(vcpu->arch.shared->msr & MSR_CM))
+		gtlbe->mas2 &= 0xffffffffUL;
 	gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
 
 	trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
@@ -1039,8 +1037,12 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 		sesel = 0; /* unused */
 		priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
 
-		kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
-					&priv->ref, eaddr, &stlbe);
+		/* Only triggers after clear_tlb_refs */
+		if (unlikely(!(priv->ref.flags & E500_TLB_VALID)))
+			kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
+		else
+			kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
+						&priv->ref, eaddr, &stlbe);
 		break;
 
 	case 1: {
@@ -1060,6 +1062,49 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 	write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
 }
 
+/************* MMU Notifiers *************/
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	trace_kvm_unmap_hva(hva);
+
+	/*
+	 * Flush all shadow tlb entries everywhere. This is slow, but
+	 * we are 100% sure that we catch the to be unmapped page
+	 */
+	kvm_flush_remote_tlbs(kvm);
+
+	return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	/* kvm_unmap_hva flushes everything anyways */
+	kvm_unmap_hva(kvm, start);
+
+	return 0;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	/* The page will get remapped properly on its next fault */
+	kvm_unmap_hva(kvm, hva);
+}
+
+/*****************************************/
+
 static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	int i;
@@ -1081,6 +1126,8 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 		}
 
 		vcpu_e500->num_shared_tlb_pages = 0;
+
+		kfree(vcpu_e500->shared_tlb_pages);
 		vcpu_e500->shared_tlb_pages = NULL;
 	} else {
 		kfree(vcpu_e500->gtlb_arch);
@@ -1178,21 +1225,27 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	}
 
 	virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
-	if (!virt)
+	if (!virt) {
+		ret = -ENOMEM;
 		goto err_put_page;
+	}
 
 	privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
 			   GFP_KERNEL);
 	privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
 			   GFP_KERNEL);
 
-	if (!privs[0] || !privs[1])
-		goto err_put_page;
+	if (!privs[0] || !privs[1]) {
+		ret = -ENOMEM;
+		goto err_privs;
+	}
 
 	g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
 	                     GFP_KERNEL);
-	if (!g2h_bitmap)
-		goto err_put_page;
+	if (!g2h_bitmap) {
+		ret = -ENOMEM;
+		goto err_privs;
+	}
 
 	free_gtlb(vcpu_e500);
 
@@ -1232,10 +1285,11 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	return 0;
 
-err_put_page:
+err_privs:
 	kfree(privs[0]);
 	kfree(privs[1]);
 
+err_put_page:
 	for (i = 0; i < num_pages; i++)
 		put_page(pages[i]);
 
@@ -1332,7 +1386,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	if (!vcpu_e500->gtlb_priv[1])
 		goto err;
 
-	vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) *
+	vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
 					  vcpu_e500->gtlb_params[1].entries,
 					  GFP_KERNEL);
 	if (!vcpu_e500->g2h_tlb1_map)
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index ee04abaefe2..b0855e5d890 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -131,6 +131,125 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
 	return vcpu->arch.dec - jd;
 }
 
+static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+	ulong spr_val = kvmppc_get_gpr(vcpu, rs);
+
+	switch (sprn) {
+	case SPRN_SRR0:
+		vcpu->arch.shared->srr0 = spr_val;
+		break;
+	case SPRN_SRR1:
+		vcpu->arch.shared->srr1 = spr_val;
+		break;
+
+	/* XXX We need to context-switch the timebase for
+	 * watchdog and FIT. */
+	case SPRN_TBWL: break;
+	case SPRN_TBWU: break;
+
+	case SPRN_MSSSR0: break;
+
+	case SPRN_DEC:
+		vcpu->arch.dec = spr_val;
+		kvmppc_emulate_dec(vcpu);
+		break;
+
+	case SPRN_SPRG0:
+		vcpu->arch.shared->sprg0 = spr_val;
+		break;
+	case SPRN_SPRG1:
+		vcpu->arch.shared->sprg1 = spr_val;
+		break;
+	case SPRN_SPRG2:
+		vcpu->arch.shared->sprg2 = spr_val;
+		break;
+	case SPRN_SPRG3:
+		vcpu->arch.shared->sprg3 = spr_val;
+		break;
+
+	default:
+		emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
+						     spr_val);
+		if (emulated == EMULATE_FAIL)
+			printk(KERN_INFO "mtspr: unknown spr "
+				"0x%x\n", sprn);
+		break;
+	}
+
+	kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+
+	return emulated;
+}
+
+static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+	ulong spr_val = 0;
+
+	switch (sprn) {
+	case SPRN_SRR0:
+		spr_val = vcpu->arch.shared->srr0;
+		break;
+	case SPRN_SRR1:
+		spr_val = vcpu->arch.shared->srr1;
+		break;
+	case SPRN_PVR:
+		spr_val = vcpu->arch.pvr;
+		break;
+	case SPRN_PIR:
+		spr_val = vcpu->vcpu_id;
+		break;
+	case SPRN_MSSSR0:
+		spr_val = 0;
+		break;
+
+	/* Note: mftb and TBRL/TBWL are user-accessible, so
+	 * the guest can always access the real TB anyways.
+	 * In fact, we probably will never see these traps. */
+	case SPRN_TBWL:
+		spr_val = get_tb() >> 32;
+		break;
+	case SPRN_TBWU:
+		spr_val = get_tb();
+		break;
+
+	case SPRN_SPRG0:
+		spr_val = vcpu->arch.shared->sprg0;
+		break;
+	case SPRN_SPRG1:
+		spr_val = vcpu->arch.shared->sprg1;
+		break;
+	case SPRN_SPRG2:
+		spr_val = vcpu->arch.shared->sprg2;
+		break;
+	case SPRN_SPRG3:
+		spr_val = vcpu->arch.shared->sprg3;
+		break;
+	/* Note: SPRG4-7 are user-readable, so we don't get
+	 * a trap. */
+
+	case SPRN_DEC:
+		spr_val = kvmppc_get_dec(vcpu, get_tb());
+		break;
+	default:
+		emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
+						     &spr_val);
+		if (unlikely(emulated == EMULATE_FAIL)) {
+			printk(KERN_INFO "mfspr: unknown spr "
+				"0x%x\n", sprn);
+		}
+		break;
+	}
+
+	if (emulated == EMULATE_DONE)
+		kvmppc_set_gpr(vcpu, rt, spr_val);
+	kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
+
+	return emulated;
+}
+
 /* XXX to do:
  * lhax
  * lhaux
@@ -156,7 +275,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	int sprn = get_sprn(inst);
 	enum emulation_result emulated = EMULATE_DONE;
 	int advance = 1;
-	ulong spr_val = 0;
 
 	/* this default type might be overwritten by subcategories */
 	kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
@@ -236,62 +354,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case OP_31_XOP_MFSPR:
-			switch (sprn) {
-			case SPRN_SRR0:
-				spr_val = vcpu->arch.shared->srr0;
-				break;
-			case SPRN_SRR1:
-				spr_val = vcpu->arch.shared->srr1;
-				break;
-			case SPRN_PVR:
-				spr_val = vcpu->arch.pvr;
-				break;
-			case SPRN_PIR:
-				spr_val = vcpu->vcpu_id;
-				break;
-			case SPRN_MSSSR0:
-				spr_val = 0;
-				break;
-
-			/* Note: mftb and TBRL/TBWL are user-accessible, so
-			 * the guest can always access the real TB anyways.
-			 * In fact, we probably will never see these traps. */
-			case SPRN_TBWL:
-				spr_val = get_tb() >> 32;
-				break;
-			case SPRN_TBWU:
-				spr_val = get_tb();
-				break;
-
-			case SPRN_SPRG0:
-				spr_val = vcpu->arch.shared->sprg0;
-				break;
-			case SPRN_SPRG1:
-				spr_val = vcpu->arch.shared->sprg1;
-				break;
-			case SPRN_SPRG2:
-				spr_val = vcpu->arch.shared->sprg2;
-				break;
-			case SPRN_SPRG3:
-				spr_val = vcpu->arch.shared->sprg3;
-				break;
-			/* Note: SPRG4-7 are user-readable, so we don't get
-			 * a trap. */
-
-			case SPRN_DEC:
-				spr_val = kvmppc_get_dec(vcpu, get_tb());
-				break;
-			default:
-				emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
-								     &spr_val);
-				if (unlikely(emulated == EMULATE_FAIL)) {
-					printk(KERN_INFO "mfspr: unknown spr "
-						"0x%x\n", sprn);
-				}
-				break;
-			}
-			kvmppc_set_gpr(vcpu, rt, spr_val);
-			kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
+			emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);
 			break;
 
 		case OP_31_XOP_STHX:
@@ -308,49 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case OP_31_XOP_MTSPR:
-			spr_val = kvmppc_get_gpr(vcpu, rs);
-			switch (sprn) {
-			case SPRN_SRR0:
-				vcpu->arch.shared->srr0 = spr_val;
-				break;
-			case SPRN_SRR1:
-				vcpu->arch.shared->srr1 = spr_val;
-				break;
-
-			/* XXX We need to context-switch the timebase for
-			 * watchdog and FIT. */
-			case SPRN_TBWL: break;
-			case SPRN_TBWU: break;
-
-			case SPRN_MSSSR0: break;
-
-			case SPRN_DEC:
-				vcpu->arch.dec = spr_val;
-				kvmppc_emulate_dec(vcpu);
-				break;
-
-			case SPRN_SPRG0:
-				vcpu->arch.shared->sprg0 = spr_val;
-				break;
-			case SPRN_SPRG1:
-				vcpu->arch.shared->sprg1 = spr_val;
-				break;
-			case SPRN_SPRG2:
-				vcpu->arch.shared->sprg2 = spr_val;
-				break;
-			case SPRN_SPRG3:
-				vcpu->arch.shared->sprg3 = spr_val;
-				break;
-
-			default:
-				emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
-								     spr_val);
-				if (emulated == EMULATE_FAIL)
-					printk(KERN_INFO "mtspr: unknown spr "
-						"0x%x\n", sprn);
-				break;
-			}
-			kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+			emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
 			break;
 
 		case OP_31_XOP_DCBI:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4d213b8b0fb..70739a08956 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -30,6 +30,7 @@
 #include <asm/kvm_ppc.h>
 #include <asm/tlbflush.h>
 #include <asm/cputhreads.h>
+#include <asm/irqflags.h>
 #include "timing.h"
 #include "../mm/mmu_decl.h"
 
@@ -38,8 +39,7 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.shared->msr & MSR_WE) ||
-	       !!(v->arch.pending_exceptions) ||
+	return !!(v->arch.pending_exceptions) ||
 	       v->requests;
 }
 
@@ -48,6 +48,85 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+#ifndef CONFIG_KVM_BOOK3S_64_HV
+/*
+ * Common checks before entering the guest world.  Call with interrupts
+ * disabled.
+ *
+ * returns:
+ *
+ * == 1 if we're ready to go into guest state
+ * <= 0 if we need to go back to the host with return value
+ */
+int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+	int r = 1;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	while (true) {
+		if (need_resched()) {
+			local_irq_enable();
+			cond_resched();
+			local_irq_disable();
+			continue;
+		}
+
+		if (signal_pending(current)) {
+			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
+			vcpu->run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+			break;
+		}
+
+		vcpu->mode = IN_GUEST_MODE;
+
+		/*
+		 * Reading vcpu->requests must happen after setting vcpu->mode,
+		 * so we don't miss a request because the requester sees
+		 * OUTSIDE_GUEST_MODE and assumes we'll be checking requests
+		 * before next entering the guest (and thus doesn't IPI).
+		 */
+		smp_mb();
+
+		if (vcpu->requests) {
+			/* Make sure we process requests preemptable */
+			local_irq_enable();
+			trace_kvm_check_requests(vcpu);
+			r = kvmppc_core_check_requests(vcpu);
+			local_irq_disable();
+			if (r > 0)
+				continue;
+			break;
+		}
+
+		if (kvmppc_core_prepare_to_enter(vcpu)) {
+			/* interrupts got enabled in between, so we
+			   are back at square 1 */
+			continue;
+		}
+
+#ifdef CONFIG_PPC64
+		/* lazy EE magic */
+		hard_irq_disable();
+		if (lazy_irq_pending()) {
+			/* Got an interrupt in between, try again */
+			local_irq_enable();
+			local_irq_disable();
+			kvm_guest_exit();
+			continue;
+		}
+
+		trace_hardirqs_on();
+#endif
+
+		kvm_guest_enter();
+		break;
+	}
+
+	return r;
+}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 {
 	int nr = kvmppc_get_gpr(vcpu, 11);
@@ -67,18 +146,18 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	}
 
 	switch (nr) {
-	case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE:
+	case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):
 	{
 		vcpu->arch.magic_page_pa = param1;
 		vcpu->arch.magic_page_ea = param2;
 
 		r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
 
-		r = HC_EV_SUCCESS;
+		r = EV_SUCCESS;
 		break;
 	}
-	case HC_VENDOR_KVM | KVM_HC_FEATURES:
-		r = HC_EV_SUCCESS;
+	case KVM_HCALL_TOKEN(KVM_HC_FEATURES):
+		r = EV_SUCCESS;
 #if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2)
 		/* XXX Missing magic page on 44x */
 		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
@@ -86,8 +165,13 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 
 		/* Second return value is in r4 */
 		break;
+	case EV_HCALL_TOKEN(EV_IDLE):
+		r = EV_SUCCESS;
+		kvm_vcpu_block(vcpu);
+		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		break;
 	default:
-		r = HC_EV_UNIMPLEMENTED;
+		r = EV_UNIMPLEMENTED;
 		break;
 	}
 
@@ -220,6 +304,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	switch (ext) {
 #ifdef CONFIG_BOOKE
 	case KVM_CAP_PPC_BOOKE_SREGS:
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
 #else
 	case KVM_CAP_PPC_SEGSTATE:
 	case KVM_CAP_PPC_HIOR:
@@ -229,6 +314,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_ONE_REG:
+	case KVM_CAP_IOEVENTFD:
 		r = 1;
 		break;
 #ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -260,10 +346,22 @@ int kvm_dev_ioctl_check_extension(long ext)
 		if (cpu_has_feature(CPU_FTR_ARCH_201))
 			r = 2;
 		break;
+#endif
 	case KVM_CAP_SYNC_MMU:
+#ifdef CONFIG_KVM_BOOK3S_64_HV
 		r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
+#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+		r = 1;
+#else
+		r = 0;
+		break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_PPC_HTAB_FD:
+		r = 1;
 		break;
 #endif
+		break;
 	case KVM_CAP_NR_VCPUS:
 		/*
 		 * Recommending a number of CPUs is somewhat arbitrary; we
@@ -302,19 +400,12 @@ long kvm_arch_dev_ioctl(struct file *filp,
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont)
 {
-	if (!dont || free->arch.rmap != dont->arch.rmap) {
-		vfree(free->arch.rmap);
-		free->arch.rmap = NULL;
-	}
+	kvmppc_core_free_memslot(free, dont);
 }
 
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 {
-	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
-	if (!slot->arch.rmap)
-		return -ENOMEM;
-
-	return 0;
+	return kvmppc_core_create_memslot(slot, npages);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -323,7 +414,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_userspace_memory_region *mem,
                                    int user_alloc)
 {
-	return kvmppc_core_prepare_memory_region(kvm, mem);
+	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
@@ -331,7 +422,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                struct kvm_memory_slot old,
                int user_alloc)
 {
-	kvmppc_core_commit_memory_region(kvm, mem);
+	kvmppc_core_commit_memory_region(kvm, mem, old);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -341,6 +432,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
+	kvmppc_core_flush_memslot(kvm, slot);
 }
 
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
@@ -354,6 +446,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	return vcpu;
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	/* Make sure we're not using the vcpu anymore */
@@ -390,6 +487,8 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
+	int ret;
+
 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
@@ -398,13 +497,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_KVM_EXIT_TIMING
 	mutex_init(&vcpu->arch.exit_timing_lock);
 #endif
-
-	return 0;
+	ret = kvmppc_subarch_vcpu_init(vcpu);
+	return ret;
 }
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_destroy(vcpu);
+	kvmppc_subarch_vcpu_uninit(vcpu);
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -420,7 +520,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 #endif
 	kvmppc_core_vcpu_load(vcpu, cpu);
-	vcpu->cpu = smp_processor_id();
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -429,7 +528,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_BOOKE
 	vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
 #endif
-	vcpu->cpu = -1;
 }
 
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -527,6 +625,13 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	vcpu->mmio_is_write = 0;
 	vcpu->arch.mmio_sign_extend = 0;
 
+	if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+			     bytes, &run->mmio.data)) {
+		kvmppc_complete_mmio_load(vcpu, run);
+		vcpu->mmio_needed = 0;
+		return EMULATE_DONE;
+	}
+
 	return EMULATE_DO_MMIO;
 }
 
@@ -536,8 +641,8 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
 {
 	int r;
 
-	r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
 	vcpu->arch.mmio_sign_extend = 1;
+	r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
 
 	return r;
 }
@@ -575,6 +680,13 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+			      bytes, &run->mmio.data)) {
+		kvmppc_complete_mmio_load(vcpu, run);
+		vcpu->mmio_needed = 0;
+		return EMULATE_DONE;
+	}
+
 	return EMULATE_DO_MMIO;
 }
 
@@ -649,6 +761,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
+		r = 0;
+		vcpu->arch.watchdog_enabled = true;
+		break;
+#endif
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
@@ -751,9 +869,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 
 static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
 {
+	u32 inst_nop = 0x60000000;
+#ifdef CONFIG_KVM_BOOKE_HV
+	u32 inst_sc1 = 0x44000022;
+	pvinfo->hcall[0] = inst_sc1;
+	pvinfo->hcall[1] = inst_nop;
+	pvinfo->hcall[2] = inst_nop;
+	pvinfo->hcall[3] = inst_nop;
+#else
 	u32 inst_lis = 0x3c000000;
 	u32 inst_ori = 0x60000000;
-	u32 inst_nop = 0x60000000;
 	u32 inst_sc = 0x44000002;
 	u32 inst_imm_mask = 0xffff;
 
@@ -770,6 +895,9 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
 	pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask);
 	pvinfo->hcall[2] = inst_sc;
 	pvinfo->hcall[3] = inst_nop;
+#endif
+
+	pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;
 
 	return 0;
 }
@@ -832,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+
+	case KVM_PPC_GET_HTAB_FD: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_get_htab_fd ghf;
+
+		r = -EFAULT;
+		if (copy_from_user(&ghf, argp, sizeof(ghf)))
+			break;
+		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
+		break;
+	}
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index ddb6a2149d4..e326489a542 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -31,6 +31,126 @@ TRACE_EVENT(kvm_ppc_instr,
 		  __entry->inst, __entry->pc, __entry->emulate)
 );
 
+#ifdef CONFIG_PPC_BOOK3S
+#define kvm_trace_symbol_exit \
+	{0x100, "SYSTEM_RESET"}, \
+	{0x200, "MACHINE_CHECK"}, \
+	{0x300, "DATA_STORAGE"}, \
+	{0x380, "DATA_SEGMENT"}, \
+	{0x400, "INST_STORAGE"}, \
+	{0x480, "INST_SEGMENT"}, \
+	{0x500, "EXTERNAL"}, \
+	{0x501, "EXTERNAL_LEVEL"}, \
+	{0x502, "EXTERNAL_HV"}, \
+	{0x600, "ALIGNMENT"}, \
+	{0x700, "PROGRAM"}, \
+	{0x800, "FP_UNAVAIL"}, \
+	{0x900, "DECREMENTER"}, \
+	{0x980, "HV_DECREMENTER"}, \
+	{0xc00, "SYSCALL"}, \
+	{0xd00, "TRACE"}, \
+	{0xe00, "H_DATA_STORAGE"}, \
+	{0xe20, "H_INST_STORAGE"}, \
+	{0xe40, "H_EMUL_ASSIST"}, \
+	{0xf00, "PERFMON"}, \
+	{0xf20, "ALTIVEC"}, \
+	{0xf40, "VSX"}
+#else
+#define kvm_trace_symbol_exit \
+	{0, "CRITICAL"}, \
+	{1, "MACHINE_CHECK"}, \
+	{2, "DATA_STORAGE"}, \
+	{3, "INST_STORAGE"}, \
+	{4, "EXTERNAL"}, \
+	{5, "ALIGNMENT"}, \
+	{6, "PROGRAM"}, \
+	{7, "FP_UNAVAIL"}, \
+	{8, "SYSCALL"}, \
+	{9, "AP_UNAVAIL"}, \
+	{10, "DECREMENTER"}, \
+	{11, "FIT"}, \
+	{12, "WATCHDOG"}, \
+	{13, "DTLB_MISS"}, \
+	{14, "ITLB_MISS"}, \
+	{15, "DEBUG"}, \
+	{32, "SPE_UNAVAIL"}, \
+	{33, "SPE_FP_DATA"}, \
+	{34, "SPE_FP_ROUND"}, \
+	{35, "PERFORMANCE_MONITOR"}, \
+	{36, "DOORBELL"}, \
+	{37, "DOORBELL_CRITICAL"}, \
+	{38, "GUEST_DBELL"}, \
+	{39, "GUEST_DBELL_CRIT"}, \
+	{40, "HV_SYSCALL"}, \
+	{41, "HV_PRIV"}
+#endif
+
+TRACE_EVENT(kvm_exit,
+	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_nr, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	exit_nr		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned long,	msr		)
+		__field(	unsigned long,	dar		)
+#ifdef CONFIG_KVM_BOOK3S_PR
+		__field(	unsigned long,	srr1		)
+#endif
+		__field(	unsigned long,	last_inst	)
+	),
+
+	TP_fast_assign(
+#ifdef CONFIG_KVM_BOOK3S_PR
+		struct kvmppc_book3s_shadow_vcpu *svcpu;
+#endif
+		__entry->exit_nr	= exit_nr;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->dar		= kvmppc_get_fault_dar(vcpu);
+		__entry->msr		= vcpu->arch.shared->msr;
+#ifdef CONFIG_KVM_BOOK3S_PR
+		svcpu = svcpu_get(vcpu);
+		__entry->srr1		= svcpu->shadow_srr1;
+		svcpu_put(svcpu);
+#endif
+		__entry->last_inst	= vcpu->arch.last_inst;
+	),
+
+	TP_printk("exit=%s"
+		" | pc=0x%lx"
+		" | msr=0x%lx"
+		" | dar=0x%lx"
+#ifdef CONFIG_KVM_BOOK3S_PR
+		" | srr1=0x%lx"
+#endif
+		" | last_inst=0x%lx"
+		,
+		__print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
+		__entry->pc,
+		__entry->msr,
+		__entry->dar,
+#ifdef CONFIG_KVM_BOOK3S_PR
+		__entry->srr1,
+#endif
+		__entry->last_inst
+		)
+);
+
+TRACE_EVENT(kvm_unmap_hva,
+	TP_PROTO(unsigned long hva),
+	TP_ARGS(hva),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	hva		)
+	),
+
+	TP_fast_assign(
+		__entry->hva		= hva;
+	),
+
+	TP_printk("unmap hva 0x%lx\n", __entry->hva)
+);
+
 TRACE_EVENT(kvm_stlb_inval,
 	TP_PROTO(unsigned int stlb_index),
 	TP_ARGS(stlb_index),
@@ -98,41 +218,31 @@ TRACE_EVENT(kvm_gtlb_write,
 		__entry->word1, __entry->word2)
 );
 
-
-/*************************************************************************
- *                         Book3S trace points                           *
- *************************************************************************/
-
-#ifdef CONFIG_KVM_BOOK3S_PR
-
-TRACE_EVENT(kvm_book3s_exit,
-	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
-	TP_ARGS(exit_nr, vcpu),
+TRACE_EVENT(kvm_check_requests,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
 
 	TP_STRUCT__entry(
-		__field(	unsigned int,	exit_nr		)
-		__field(	unsigned long,	pc		)
-		__field(	unsigned long,	msr		)
-		__field(	unsigned long,	dar		)
-		__field(	unsigned long,	srr1		)
+		__field(	__u32,	cpu_nr		)
+		__field(	__u32,	requests	)
 	),
 
 	TP_fast_assign(
-		struct kvmppc_book3s_shadow_vcpu *svcpu;
-		__entry->exit_nr	= exit_nr;
-		__entry->pc		= kvmppc_get_pc(vcpu);
-		__entry->dar		= kvmppc_get_fault_dar(vcpu);
-		__entry->msr		= vcpu->arch.shared->msr;
-		svcpu = svcpu_get(vcpu);
-		__entry->srr1		= svcpu->shadow_srr1;
-		svcpu_put(svcpu);
+		__entry->cpu_nr		= vcpu->vcpu_id;
+		__entry->requests	= vcpu->requests;
 	),
 
-	TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx",
-		  __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar,
-		  __entry->srr1)
+	TP_printk("vcpu=%x requests=%x",
+		__entry->cpu_nr, __entry->requests)
 );
 
+
+/*************************************************************************
+ *                         Book3S trace points                           *
+ *************************************************************************/
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+
 TRACE_EVENT(kvm_book3s_reenter,
 	TP_PROTO(int r, struct kvm_vcpu *vcpu),
 	TP_ARGS(r, vcpu),
@@ -395,6 +505,44 @@ TRACE_EVENT(kvm_booke206_gtlb_write,
 		__entry->mas2, __entry->mas7_3)
 );
 
+TRACE_EVENT(kvm_booke206_ref_release,
+	TP_PROTO(__u64 pfn, __u32 flags),
+	TP_ARGS(pfn, flags),
+
+	TP_STRUCT__entry(
+		__field(	__u64,	pfn		)
+		__field(	__u32,	flags		)
+	),
+
+	TP_fast_assign(
+		__entry->pfn		= pfn;
+		__entry->flags		= flags;
+	),
+
+	TP_printk("pfn=%llx flags=%x",
+		__entry->pfn, __entry->flags)
+);
+
+TRACE_EVENT(kvm_booke_queue_irqprio,
+	TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
+	TP_ARGS(vcpu, priority),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	cpu_nr		)
+		__field(	__u32,	priority		)
+		__field(	unsigned long,	pending		)
+	),
+
+	TP_fast_assign(
+		__entry->cpu_nr		= vcpu->vcpu_id;
+		__entry->priority	= priority;
+		__entry->pending	= vcpu->arch.pending_exceptions;
+	),
+
+	TP_printk("vcpu=%x prio=%x pending=%lx",
+		__entry->cpu_nr, __entry->priority, __entry->pending)
+);
+
 #endif
 
 #endif /* _TRACE_KVM_H */
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index e7a896acd98..48a920d5148 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -90,6 +90,7 @@ config MPIC
 config PPC_EPAPR_HV_PIC
 	bool
 	default n
+	select EPAPR_PARAVIRT
 
 config MPIC_WEIRD
 	bool
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 51ffafae561..63c5f04ea58 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -236,7 +236,6 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
 	u32 intr_index;
 	u32 have_shift = 0;
 	struct fsl_msi_cascade_data *cascade_data;
-	unsigned int ret;
 
 	cascade_data = irq_get_handler_data(irq);
 	msi_data = cascade_data->msi_data;
@@ -268,7 +267,9 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
 	case FSL_PIC_IP_IPIC:
 		msir_value = fsl_msi_read(msi_data->msi_regs, msir_index * 0x4);
 		break;
-	case FSL_PIC_IP_VMPIC:
+#ifdef CONFIG_EPAPR_PARAVIRT
+	case FSL_PIC_IP_VMPIC: {
+		unsigned int ret;
 		ret = fh_vmpic_get_msir(virq_to_hw(irq), &msir_value);
 		if (ret) {
 			pr_err("fsl-msi: fh_vmpic_get_msir() failed for "
@@ -277,6 +278,8 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
 		}
 		break;
 	}
+#endif
+	}
 
 	while (msir_value) {
 		intr_index = ffs(msir_value) - 1;
@@ -508,10 +511,12 @@ static const struct of_device_id fsl_of_msi_ids[] = {
 		.compatible = "fsl,ipic-msi",
 		.data = &ipic_msi_feature,
 	},
+#ifdef CONFIG_EPAPR_PARAVIRT
 	{
 		.compatible = "fsl,vmpic-msi",
 		.data = &vmpic_msi_feature,
 	},
+#endif
 	{}
 };
 
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index c449dbd1c93..97118dc3d28 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -253,6 +253,7 @@ struct platform_diu_data_ops diu_ops;
 EXPORT_SYMBOL(diu_ops);
 #endif
 
+#ifdef CONFIG_EPAPR_PARAVIRT
 /*
  * Restart the current partition
  *
@@ -278,3 +279,4 @@ void fsl_hv_halt(void)
 	pr_info("hv exit\n");
 	fh_partition_stop(-1);
 }
+#endif
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index ff1e2f8ef94..c30615e605a 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -629,10 +629,27 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 		break;
 	case KVM_S390_SIGP_STOP:
 	case KVM_S390_RESTART:
+		VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
+		inti->type = s390int->type;
+		break;
 	case KVM_S390_INT_EXTERNAL_CALL:
+		if (s390int->parm & 0xffff0000) {
+			kfree(inti);
+			return -EINVAL;
+		}
+		VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u",
+			   s390int->parm);
+		inti->type = s390int->type;
+		inti->extcall.code = s390int->parm;
+		break;
 	case KVM_S390_INT_EMERGENCY:
-		VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
+		if (s390int->parm & 0xffff0000) {
+			kfree(inti);
+			return -EINVAL;
+		}
+		VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", s390int->parm);
 		inti->type = s390int->type;
+		inti->emerg.code = s390int->parm;
 		break;
 	case KVM_S390_INT_VIRTIO:
 	case KVM_S390_INT_SERVICE:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d91a9556800..c9011bfaabb 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -355,6 +355,11 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
 	atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
@@ -993,7 +998,7 @@ static int __init kvm_s390_init(void)
 	}
 	memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
 	facilities[0] &= 0xff00fff3f47c0000ULL;
-	facilities[1] &= 0x201c000000000000ULL;
+	facilities[1] &= 0x001c000000000000ULL;
 	return 0;
 }
 
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3b9ce..16a57f4ed64 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -8,6 +8,7 @@
 #define VCLOCK_NONE 0  /* No vDSO clock available.	*/
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
+#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
 
 struct arch_clocksource_data {
 	int vclock_mode;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index da40b1e2228..2d9075e863a 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -202,6 +202,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST	(9*32+ 1) /* TSC adjustment MSR 0x3b */
 #define X86_FEATURE_BMI1	(9*32+ 3) /* 1st group bit manipulation extensions */
 #define X86_FEATURE_HLE		(9*32+ 4) /* Hardware Lock Elision */
 #define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c4c97..a09c2857106 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,6 +19,7 @@
 #include <asm/acpi.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
+#include <asm/pvclock.h>
 #ifdef CONFIG_X86_32
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
@@ -81,6 +82,10 @@ enum fixed_addresses {
 	VVAR_PAGE,
 	VSYSCALL_HPET,
 #endif
+#ifdef CONFIG_PARAVIRT_CLOCK
+	PVCLOCK_FIXMAP_BEGIN,
+	PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
+#endif
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff1703d0..6080d2694ba 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,9 @@ struct kimage_arch {
 };
 #endif
 
+typedef void crash_vmclear_fn(void);
+extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
new file mode 100644
index 00000000000..a92b1763c41
--- /dev/null
+++ b/arch/x86/include/asm/kvm_guest.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_KVM_GUEST_H
+#define _ASM_X86_KVM_GUEST_H
+
+int kvm_setup_vsyscall_timeinfo(void);
+
+#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2e11f45243..dc87b65e9c3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
 #include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -442,6 +444,7 @@ struct kvm_vcpu_arch {
 	s8 virtual_tsc_shift;
 	u32 virtual_tsc_mult;
 	u32 virtual_tsc_khz;
+	s64 ia32_tsc_adjust_msr;
 
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -559,6 +562,12 @@ struct kvm_arch {
 	u64 cur_tsc_write;
 	u64 cur_tsc_offset;
 	u8  cur_tsc_generation;
+	int nr_vcpus_matched_tsc;
+
+	spinlock_t pvclock_gtod_sync_lock;
+	bool use_master_clock;
+	u64 master_kernel_ns;
+	cycle_t master_cycle_now;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -612,6 +621,12 @@ struct kvm_vcpu_stat {
 
 struct x86_instruction_info;
 
+struct msr_data {
+	bool host_initiated;
+	u32 index;
+	u64 data;
+};
+
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
@@ -634,7 +649,7 @@ struct kvm_x86_ops {
 
 	void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
 	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
 	void (*get_segment)(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@@ -697,10 +712,11 @@ struct kvm_x86_ops {
 	bool (*has_wbinvd_exit)(void);
 
 	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
+	u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
-	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
+	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
 
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 
@@ -785,7 +801,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 struct x86_emulate_ctxt;
 
@@ -812,7 +828,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e400cdb2dd6..6e930b21872 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,7 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_EBC_FREQUENCY_ID		0x0000002c
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
+#define MSR_IA32_TSC_ADJUST             0x0000003b
 
 #define FEATURE_CONTROL_LOCKED				(1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index c59cc97fe6c..109a9dd5d45 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
 void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
@@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 	return product;
 }
 
+static __always_inline
+u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
+{
+	u64 delta = __native_read_tsc() - src->tsc_timestamp;
+	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
+				   src->tsc_shift);
+}
+
+static __always_inline
+unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+			       cycle_t *cycles, u8 *flags)
+{
+	unsigned version;
+	cycle_t ret, offset;
+	u8 ret_flags;
+
+	version = src->version;
+	/* Note: emulated platforms which do not advertise SSE2 support
+	 * result in kvmclock not using the necessary RDTSC barriers.
+	 * Without barriers, it is possible that RDTSC instruction reads from
+	 * the time stamp counter outside rdtsc_barrier protected section
+	 * below, resulting in violation of monotonicity.
+	 */
+	rdtsc_barrier();
+	offset = pvclock_get_nsec_offset(src);
+	ret = src->system_time + offset;
+	ret_flags = src->flags;
+	rdtsc_barrier();
+
+	*cycles = ret;
+	*flags = ret_flags;
+	return version;
+}
+
+struct pvclock_vsyscall_time_info {
+	struct pvclock_vcpu_time_info pvti;
+	u32 migrate_count;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size);
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
+
 #endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 36ec21c36d6..c2d56b34830 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -445,8 +445,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
-#define VMX_EPT_AD_BIT					(1ull << 21)
-#define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
+#define VMX_EPT_AD_BIT				    (1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index eaea1d31f75..80f80955cfd 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -33,6 +33,26 @@ extern void map_vsyscall(void);
  */
 extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
 
+#ifdef CONFIG_X86_64
+
+#define VGETCPU_CPU_MASK 0xfff
+
+static inline unsigned int __getcpu(void)
+{
+	unsigned int p;
+
+	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
+		/* Load per CPU data from RDTSCP */
+		native_read_tscp(&p);
+	} else {
+		/* Load per CPU data from GDT */
+		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	}
+
+	return p;
+}
+#endif /* CONFIG_X86_64 */
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d4..74467feb4dc 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,7 @@
 #include <linux/delay.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/module.h>
 
 #include <asm/processor.h>
 #include <asm/hardirq.h>
@@ -30,6 +31,27 @@
 
 int in_crash_kexec;
 
+/*
+ * This is used to VMCLEAR all VMCSs loaded on the
+ * processor. And when loading kvm_intel module, the
+ * callback function pointer will be assigned.
+ *
+ * protected by rcu.
+ */
+crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
+EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+
+static inline void cpu_crash_vmclear_loaded_vmcss(void)
+{
+	crash_vmclear_fn *do_vmclear_operation = NULL;
+
+	rcu_read_lock();
+	do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+	if (do_vmclear_operation)
+		do_vmclear_operation();
+	rcu_read_unlock();
+}
+
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
 static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 #endif
 	crash_save_cpu(regs, cpu);
 
+	/*
+	 * VMCLEAR VMCSs loaded on all cpus if needed.
+	 */
+	cpu_crash_vmclear_loaded_vmcss();
+
 	/* Disable VMX or SVM if needed.
 	 *
 	 * We need to disable virtualization on all CPUs.
@@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 
 	kdump_nmi_shootdown_cpus();
 
+	/*
+	 * VMCLEAR VMCSs loaded on this cpu if needed.
+	 */
+	cpu_crash_vmclear_loaded_vmcss();
+
 	/* Booting kdump kernel with VMX or SVM enabled won't work,
 	 * because (among other limitations) we can't disable paging
 	 * with the virt flags.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4180a874c76..08b973f6403 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,6 +42,7 @@
 #include <asm/apic.h>
 #include <asm/apicdef.h>
 #include <asm/hypervisor.h>
+#include <asm/kvm_guest.h>
 
 static int kvmapf = 1;
 
@@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg)
 
 early_param("no-steal-acc", parse_no_stealacc);
 
+static int kvmclock_vsyscall = 1;
+static int parse_no_kvmclock_vsyscall(char *arg)
+{
+        kvmclock_vsyscall = 0;
+        return 0;
+}
+
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
 static int has_steal_clock = 0;
@@ -110,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token)
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
 	struct kvm_task_sleep_node n, *e;
 	DEFINE_WAIT(wait);
-	int cpu, idle;
-
-	cpu = get_cpu();
-	idle = idle_cpu(cpu);
-	put_cpu();
 
 	spin_lock(&b->lock);
 	e = _find_apf_task(b, token);
@@ -128,7 +133,7 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = idle || preempt_count() > 1;
+	n.halted = is_idle_task(current) || preempt_count() > 1;
 	init_waitqueue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	spin_unlock(&b->lock);
@@ -471,6 +476,9 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
+	if (kvmclock_vsyscall)
+		kvm_setup_vsyscall_timeinfo();
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f1b42b3a186..220a360010f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
 #include <asm/apic.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/memblock.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
 early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_vsyscall_time_info *hv_clock;
 static struct pvclock_wall_clock wall_clock;
 
 /*
@@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void)
 	struct pvclock_vcpu_time_info *vcpu_time;
 	struct timespec ts;
 	int low, high;
+	int cpu;
 
 	low = (int)__pa_symbol(&wall_clock);
 	high = ((u64)__pa_symbol(&wall_clock) >> 32);
 
 	native_write_msr(msr_kvm_wall_clock, low, high);
 
-	vcpu_time = &get_cpu_var(hv_clock);
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
 	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
-	put_cpu_var(hv_clock);
+
+	preempt_enable();
 
 	return ts.tv_sec;
 }
@@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void)
 {
 	struct pvclock_vcpu_time_info *src;
 	cycle_t ret;
+	int cpu;
 
 	preempt_disable_notrace();
-	src = &__get_cpu_var(hv_clock);
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
 	ret = pvclock_clocksource_read(src);
 	preempt_enable_notrace();
 	return ret;
@@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
 static unsigned long kvm_get_tsc_khz(void)
 {
 	struct pvclock_vcpu_time_info *src;
-	src = &per_cpu(hv_clock, 0);
-	return pvclock_tsc_khz(src);
+	int cpu;
+	unsigned long tsc_khz;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+	tsc_khz = pvclock_tsc_khz(src);
+	preempt_enable();
+	return tsc_khz;
 }
 
 static void kvm_get_preset_lpj(void)
@@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void)
 {
 	bool ret = false;
 	struct pvclock_vcpu_time_info *src;
+	int cpu = smp_processor_id();
 
-	src = &__get_cpu_var(hv_clock);
+	if (!hv_clock)
+		return ret;
+
+	src = &hv_clock[cpu].pvti;
 	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
-		__this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+		src->flags &= ~PVCLOCK_GUEST_STOPPED;
 		ret = true;
 	}
 
@@ -141,9 +160,10 @@ int kvm_register_clock(char *txt)
 {
 	int cpu = smp_processor_id();
 	int low, high, ret;
+	struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
 
-	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
-	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+	low = (int)__pa(src) | 1;
+	high = ((u64)__pa(src) >> 32);
 	ret = native_write_msr_safe(msr_kvm_system_time, low, high);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
@@ -197,6 +217,8 @@ static void kvm_shutdown(void)
 
 void __init kvmclock_init(void)
 {
+	unsigned long mem;
+
 	if (!kvm_para_available())
 		return;
 
@@ -209,8 +231,18 @@ void __init kvmclock_init(void)
 	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
 		msr_kvm_system_time, msr_kvm_wall_clock);
 
-	if (kvm_register_clock("boot clock"))
+	mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
+			     PAGE_SIZE);
+	if (!mem)
+		return;
+	hv_clock = __va(mem);
+
+	if (kvm_register_clock("boot clock")) {
+		hv_clock = NULL;
+		memblock_free(mem,
+			sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 		return;
+	}
 	pv_time_ops.sched_clock = kvm_clock_read;
 	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
 	x86_platform.get_wallclock = kvm_get_wallclock;
@@ -233,3 +265,37 @@ void __init kvmclock_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 }
+
+int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+	int cpu;
+	int ret;
+	u8 flags;
+	struct pvclock_vcpu_time_info *vcpu_time;
+	unsigned int size;
+
+	size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
+	flags = pvclock_read_flags(vcpu_time);
+
+	if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
+		preempt_enable();
+		return 1;
+	}
+
+	if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
+		preempt_enable();
+		return ret;
+	}
+
+	preempt_enable();
+
+	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+	return 0;
+}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc..85c39590c1a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
 
 #include <linux/kernel.h>
 #include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <asm/fixmap.h>
 #include <asm/pvclock.h>
 
-/*
- * These are perodically updated
- *    xen: magic shared_info page
- *    kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
-	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
-	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
-	u32 tsc_to_nsec_mul;
-	int tsc_shift;
-	u32 version;
-	u8  flags;
-};
-
 static u8 valid_flags __read_mostly = 0;
 
 void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
 	valid_flags = flags;
 }
 
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
-{
-	u64 delta = native_read_tsc() - shadow->tsc_timestamp;
-	return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
-				   shadow->tsc_shift);
-}
-
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
-					struct pvclock_vcpu_time_info *src)
-{
-	do {
-		dst->version = src->version;
-		rmb();		/* fetch version before data */
-		dst->tsc_timestamp     = src->tsc_timestamp;
-		dst->system_timestamp  = src->system_time;
-		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
-		dst->tsc_shift         = src->tsc_shift;
-		dst->flags             = src->flags;
-		rmb();		/* test version after fetching data */
-	} while ((src->version & 1) || (dst->version != src->version));
-
-	return dst->version;
-}
-
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 {
 	u64 pv_tsc_khz = 1000000ULL << 32;
@@ -88,23 +50,32 @@ void pvclock_resume(void)
 	atomic64_set(&last_value, 0);
 }
 
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+	unsigned version;
+	cycle_t ret;
+	u8 flags;
+
+	do {
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
+
+	return flags & valid_flags;
+}
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
-	struct pvclock_shadow_time shadow;
 	unsigned version;
-	cycle_t ret, offset;
+	cycle_t ret;
 	u64 last;
+	u8 flags;
 
 	do {
-		version = pvclock_get_time_values(&shadow, src);
-		barrier();
-		offset = pvclock_get_nsec_offset(&shadow);
-		ret = shadow.system_timestamp + offset;
-		barrier();
-	} while (version != src->version);
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
 
 	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
-		(shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+		(flags & PVCLOCK_TSC_STABLE_BIT))
 		return ret;
 
 	/*
@@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
+
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+	if (!pvclock_vdso_info) {
+		BUG();
+		return NULL;
+	}
+
+	return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
+#ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+			        void *v)
+{
+	struct task_migration_notifier *mn = v;
+	struct pvclock_vsyscall_time_info *pvti;
+
+	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+	/* this is NULL when pvclock vsyscall is not initialized */
+	if (unlikely(pvti == NULL))
+		return NOTIFY_DONE;
+
+	pvti->migrate_count++;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+	.notifier_call = pvclock_task_migrate,
+};
+
+/*
+ * Initialize the generic pvclock vsyscall state.  This will allocate
+ * a/some page(s) for the per-vcpu pvclock information, set up a
+ * fixmap mapping for the page(s)
+ */
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size)
+{
+	int idx;
+
+	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+
+	pvclock_vdso_info = i;
+
+	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
+		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
+			     __pa_symbol(i) + (idx*PAGE_SIZE),
+			     PAGE_KERNEL_VVAR);
+	}
+
+
+	register_task_migration_notifier(&pvclock_migrate);
+
+	return 0;
+}
+#endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ec79e773342..a20ecb5b6cb 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		if (index == 0) {
 			entry->ebx &= kvm_supported_word9_x86_features;
 			cpuid_mask(&entry->ebx, 9);
+			// TSC_ADJUST is emulated
+			entry->ebx |= F(TSC_ADJUST);
 		} else
 			entry->ebx = 0;
 		entry->eax = 0;
@@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 	} else
 		*eax = *ebx = *ecx = *edx = 0;
 }
+EXPORT_SYMBOL_GPL(kvm_cpuid);
 
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 58fc5148882..b7fd0798488 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
 }
 
+static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
+}
+
 static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bba39bfa1c4..a27e7637110 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -676,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 						addr.seg);
 		if (!usable)
 			goto bad;
-		/* code segment or read-only data segment */
-		if (((desc.type & 8) || !(desc.type & 2)) && write)
+		/* code segment in protected mode or read-only data segment */
+		if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+					|| !(desc.type & 2)) && write)
 			goto bad;
 		/* unreadable code segment */
 		if (!fetch && (desc.type & 8) && !(desc.type & 2))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43e9fadca5d..9392f527f10 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		local_irq_save(flags);
 
 		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf95..01d7c2ad05f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	    || (!vcpu->arch.mmu.direct_map && write_fault
 		&& !is_write_protection(vcpu) && !user_fault)) {
 
+		/*
+		 * There are two cases:
+		 * - the one is other vcpu creates new sp in the window
+		 *   between mapping_level() and acquiring mmu-lock.
+		 * - the another case is the new sp is created by itself
+		 *   (page-fault path) when guest uses the target gfn as
+		 *   its page table.
+		 * Both of these cases can be fixed by allowing guest to
+		 * retry the access, it will refault, then we can establish
+		 * the mapping by using small page.
+		 */
 		if (level > PT_PAGE_TABLE_LEVEL &&
-		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
-			ret = 1;
-			drop_spte(vcpu->kvm, sptep);
+		    has_wrprotected_page(vcpu->kvm, gfn, level))
 			goto done;
-		}
 
 		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
@@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+	int bit7;
+
+	bit7 = (gpte >> 7) & 1;
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
@@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
+static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *spte,
+				  u64 gpte)
+{
+	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+		goto no_present;
+
+	if (!is_present_gpte(gpte))
+		goto no_present;
+
+	if (!(gpte & PT_ACCESSED_MASK))
+		goto no_present;
+
+	return false;
+
+no_present:
+	drop_spte(vcpu->kvm, spte);
+	return true;
+}
+
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp,
 				    u64 *start, u64 *end)
@@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	}
 }
 
-static bool mmu_invalid_pfn(pfn_t pfn)
-{
-	return unlikely(is_invalid_pfn(pfn));
-}
-
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 				pfn_t pfn, unsigned access, int *ret_val)
 {
 	bool ret = true;
 
 	/* The pfn is invalid, report the error! */
-	if (unlikely(is_invalid_pfn(pfn))) {
+	if (unlikely(is_error_pfn(pfn))) {
 		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
 		goto exit;
 	}
@@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
@@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
@@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-	int bit7;
-
-	bit7 = (gpte >> 7) & 1;
-	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
 {
 	unsigned mask;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 714e2c01a6f..891eb6d93b8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 					addr, access);
 }
 
-static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
-				    struct kvm_mmu_page *sp, u64 *spte,
-				    pt_element_t gpte)
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
 {
-	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-		goto no_present;
-
-	if (!is_present_gpte(gpte))
-		goto no_present;
-
-	if (!(gpte & PT_ACCESSED_MASK))
-		goto no_present;
-
-	return false;
-
-no_present:
-	drop_spte(vcpu->kvm, spte);
-	return true;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			      u64 *spte, const void *pte)
-{
-	pt_element_t gpte;
 	unsigned pte_access;
+	gfn_t gfn;
 	pfn_t pfn;
 
-	gpte = *(const pt_element_t *)pte;
-	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
-		return;
+	if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+		return false;
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+
+	gfn = gpte_to_gfn(gpte);
 	pte_access = sp->role.access & gpte_access(vcpu, gpte);
 	protect_clean_gpte(&pte_access, gpte);
-	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-	if (mmu_invalid_pfn(pfn))
-		return;
+	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+			no_dirty_log && (pte_access & ACC_WRITE_MASK));
+	if (is_error_pfn(pfn))
+		return false;
 
 	/*
-	 * we call mmu_set_spte() with host_writable = true because that
-	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+	 * we call mmu_set_spte() with host_writable = true because
+	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
 	 */
 	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-		     NULL, PT_PAGE_TABLE_LEVEL,
-		     gpte_to_gfn(gpte), pfn, true, true);
+		     NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+
+	return true;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			      u64 *spte, const void *pte)
+{
+	pt_element_t gpte = *(const pt_element_t *)pte;
+
+	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
 }
 
 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 	spte = sp->spt + i;
 
 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-		pt_element_t gpte;
-		unsigned pte_access;
-		gfn_t gfn;
-		pfn_t pfn;
-
 		if (spte == sptep)
 			continue;
 
 		if (is_shadow_present_pte(*spte))
 			continue;
 
-		gpte = gptep[i];
-
-		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
-			continue;
-
-		pte_access = sp->role.access & gpte_access(vcpu, gpte);
-		protect_clean_gpte(&pte_access, gpte);
-		gfn = gpte_to_gfn(gpte);
-		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
-				      pte_access & ACC_WRITE_MASK);
-		if (mmu_invalid_pfn(pfn))
+		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
 			break;
-
-		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
-			     pfn, true, true);
 	}
 }
 
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
  */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int user_fault, int write_fault, int hlevel,
-			 int *emulate, pfn_t pfn, bool map_writable,
-			 bool prefault)
+			 pfn_t pfn, bool map_writable, bool prefault)
 {
-	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
-	int top_level;
-	unsigned direct_access;
 	struct kvm_shadow_walk_iterator it;
+	unsigned direct_access, access = gw->pt_access;
+	int top_level, emulate = 0;
 
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
-		return NULL;
+		return 0;
 
 	direct_access = gw->pte_access;
 
@@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	clear_sp_write_flooding_count(it.sptep);
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
-		     user_fault, write_fault, emulate, it.level,
+		     user_fault, write_fault, &emulate, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
-	return it.sptep;
+	return emulate;
 
 out_gpte_changed:
 	if (sp)
 		kvm_mmu_put_page(sp, it.sptep);
 	kvm_release_pfn_clean(pfn);
-	return NULL;
+	return 0;
 }
 
 /*
@@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
 	struct guest_walker walker;
-	u64 *sptep;
-	int emulate = 0;
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
@@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
-	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &emulate, pfn, map_writable, prefault);
-	(void)sptep;
-	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
-		 sptep, *sptep, emulate);
-
+	r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+			 level, pfn, map_writable, prefault);
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-	return emulate;
+	return r;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+		if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d017df3899e..d29d3cd1c15 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -20,6 +20,7 @@
 #include "mmu.h"
 #include "kvm_cache_regs.h"
 #include "x86.h"
+#include "cpuid.h"
 
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
@@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage)
 		return -EBUSY;
 
 	if (!has_svm()) {
-		printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
-		       me);
+		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 		return -EINVAL;
 	}
 	sd = per_cpu(svm_data, me);
-
 	if (!sd) {
-		printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
-		       me);
+		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
 		return -EINVAL;
 	}
 
@@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 	svm->tsc_ratio             = ratio;
 }
 
+static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	return svm->vmcb->control.tsc_offset;
+}
+
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
+	u32 dummy;
+	u32 eax = 1;
 
 	init_vmcb(svm);
 
@@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
 		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
 	}
-	vcpu->arch.regs_avail = ~0;
-	vcpu->arch.regs_dirty = ~0;
+
+	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
 
 	return 0;
 }
@@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
 	init_vmcb(svm);
-	kvm_write_tsc(&svm->vcpu, 0);
-
-	err = fx_init(&svm->vcpu);
-	if (err)
-		goto free_page4;
 
 	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
 	if (kvm_vcpu_is_bsp(&svm->vcpu))
@@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	return &svm->vcpu;
 
-free_page4:
-	__free_page(hsave_page);
 free_page3:
 	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
 free_page2:
@@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 	return 0;
 }
 
-u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
 	struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
 	return vmcb->control.tsc_offset +
-		svm_scale_tsc(vcpu, native_read_tsc());
+		svm_scale_tsc(vcpu, host_tsc);
 }
 
 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
@@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
 	return 0;
 }
 
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	u32 ecx = msr->index;
+	u64 data = msr->data;
 	switch (ecx) {
 	case MSR_IA32_TSC:
-		kvm_write_tsc(vcpu, data);
+		kvm_write_tsc(vcpu, msr);
 		break;
 	case MSR_STAR:
 		svm->vmcb->save.star = data;
@@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
 		break;
 	default:
-		return kvm_set_msr_common(vcpu, ecx, data);
+		return kvm_set_msr_common(vcpu, msr);
 	}
 	return 0;
 }
 
 static int wrmsr_interception(struct vcpu_svm *svm)
 {
+	struct msr_data msr;
 	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
 	u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
 		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
+	msr.data = data;
+	msr.index = ecx;
+	msr.host_initiated = false;
 
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
-	if (svm_set_msr(&svm->vcpu, ecx, data)) {
+	if (svm_set_msr(&svm->vcpu, &msr)) {
 		trace_kvm_msr_write_ex(ecx, data);
 		kvm_inject_gp(&svm->vcpu, 0);
 	} else {
@@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
 
 	.set_tsc_khz = svm_set_tsc_khz,
+	.read_tsc_offset = svm_read_tsc_offset,
 	.write_tsc_offset = svm_write_tsc_offset,
 	.adjust_tsc_offset = svm_adjust_tsc_offset,
 	.compute_tsc_offset = svm_compute_tsc_offset,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dcc..fe5e00ed703 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
 #include <linux/tracepoint.h>
 #include <asm/vmx.h>
 #include <asm/svm.h>
+#include <asm/clocksource.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -754,6 +755,68 @@ TRACE_EVENT(
 		  __entry->write ? "Write" : "Read",
 		  __entry->gpa_match ? "GPA" : "GVA")
 );
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks					\
+	{VCLOCK_NONE, "none"},				\
+	{VCLOCK_TSC,  "tsc"},				\
+	{VCLOCK_HPET, "hpet"}				\
+
+TRACE_EVENT(kvm_update_master_clock,
+	TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
+	TP_ARGS(use_master_clock, host_clock, offset_matched),
+
+	TP_STRUCT__entry(
+		__field(		bool,	use_master_clock	)
+		__field(	unsigned int,	host_clock		)
+		__field(		bool,	offset_matched		)
+	),
+
+	TP_fast_assign(
+		__entry->use_master_clock	= use_master_clock;
+		__entry->host_clock		= host_clock;
+		__entry->offset_matched		= offset_matched;
+	),
+
+	TP_printk("masterclock %d hostclock %s offsetmatched %u",
+		  __entry->use_master_clock,
+		  __print_symbolic(__entry->host_clock, host_clocks),
+		  __entry->offset_matched)
+);
+
+TRACE_EVENT(kvm_track_tsc,
+	TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
+		 unsigned int online_vcpus, bool use_master_clock,
+		 unsigned int host_clock),
+	TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
+		host_clock),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	vcpu_id			)
+		__field(	unsigned int,	nr_vcpus_matched_tsc	)
+		__field(	unsigned int,	online_vcpus		)
+		__field(	bool,		use_master_clock	)
+		__field(	unsigned int,	host_clock		)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id		= vcpu_id;
+		__entry->nr_vcpus_matched_tsc	= nr_matched;
+		__entry->online_vcpus		= online_vcpus;
+		__entry->use_master_clock	= use_master_clock;
+		__entry->host_clock		= host_clock;
+	),
+
+	TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
+		  " hostclock %s",
+		  __entry->vcpu_id, __entry->use_master_clock,
+		  __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
+		  __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc..9120ae1901e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/kexec.h>
 
 #include "trace.h"
 
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
 	return vmx_capability.ept & VMX_EPT_AD_BIT;
 }
 
-static inline bool cpu_has_vmx_invept_individual_addr(void)
-{
-	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
-}
-
 static inline bool cpu_has_vmx_invept_context(void)
 {
 	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
+#ifdef CONFIG_KEXEC
+/*
+ * This bitmap is used to indicate whether the vmclear
+ * operation is enabled on all cpus. All disabled by
+ * default.
+ */
+static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
+
+static inline void crash_enable_local_vmclear(int cpu)
+{
+	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline void crash_disable_local_vmclear(int cpu)
+{
+	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline int crash_local_vmclear_enabled(int cpu)
+{
+	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static void crash_vmclear_local_loaded_vmcss(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct loaded_vmcs *v;
+
+	if (!crash_local_vmclear_enabled(cpu))
+		return;
+
+	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+			    loaded_vmcss_on_cpu_link)
+		vmcs_clear(v->vmcs);
+}
+#else
+static inline void crash_enable_local_vmclear(int cpu) { }
+static inline void crash_disable_local_vmclear(int cpu) { }
+#endif /* CONFIG_KEXEC */
+
 static void __loaded_vmcs_clear(void *arg)
 {
 	struct loaded_vmcs *loaded_vmcs = arg;
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
 		return; /* vcpu migration can race with cpu offline */
 	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
+	crash_disable_local_vmclear(cpu);
 	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+	/*
+	 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
+	 * is before setting loaded_vmcs->vcpu to -1 which is done in
+	 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
+	 * then adds the vmcs into percpu list before it is deleted.
+	 */
+	smp_wmb();
+
 	loaded_vmcs_init(loaded_vmcs);
+	crash_enable_local_vmclear(cpu);
 }
 
 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-	if (loaded_vmcs->cpu != -1)
-		smp_call_function_single(
-			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
+	int cpu = loaded_vmcs->cpu;
+
+	if (cpu != -1)
+		smp_call_function_single(cpu,
+			 __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
 	}
 }
 
-static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
-{
-	if (enable_ept) {
-		if (cpu_has_vmx_invept_individual_addr())
-			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
-					eptp, gpa);
-		else
-			ept_sync_context(eptp);
-	}
-}
-
 static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
+		crash_disable_local_vmclear(cpu);
+
+		/*
+		 * Read loaded_vmcs->cpu should be before fetching
+		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
+		 * See the comments in __loaded_vmcs_clear().
+		 */
+		smp_rmb();
+
 		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
 			 &per_cpu(loaded_vmcss_on_cpu, cpu));
+		crash_enable_local_vmclear(cpu);
 		local_irq_enable();
 
 		/*
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
  * Like guest_read_tsc, but always returns L1's notion of the timestamp
  * counter, even if a nested guest (L2) is currently running.
  */
-u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-	u64 host_tsc, tsc_offset;
+	u64 tsc_offset;
 
-	rdtscll(host_tsc);
 	tsc_offset = is_guest_mode(vcpu) ?
 		to_vmx(vcpu)->nested.vmcs01_tsc_offset :
 		vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 		WARN(1, "user requested TSC rate below hardware speed\n");
 }
 
+static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	return vmcs_read64(TSC_OFFSET);
+}
+
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct shared_msr_entry *msr;
 	int ret = 0;
+	u32 msr_index = msr_info->index;
+	u64 data = msr_info->data;
 
 	switch (msr_index) {
 	case MSR_EFER:
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
 #ifdef CONFIG_X86_64
 	case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_TSC:
-		kvm_write_tsc(vcpu, data);
+		kvm_write_tsc(vcpu, msr_info);
 		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			vcpu->arch.pat = data;
 			break;
 		}
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
+		break;
+	case MSR_IA32_TSC_ADJUST:
+		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
 	case MSR_TSC_AUX:
 		if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			}
 			break;
 		}
-		ret = kvm_set_msr_common(vcpu, msr_index, data);
+		ret = kvm_set_msr_common(vcpu, msr_info);
 	}
 
 	return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+	/*
+	 * Now we can enable the vmclear operation in kdump
+	 * since the loaded_vmcss_on_cpu list on this cpu
+	 * has been initialized.
+	 *
+	 * Though the cpu is not in VMX operation now, there
+	 * is no problem to enable the vmclear operation
+	 * for the loaded_vmcss_on_cpu list is empty!
+	 */
+	crash_enable_local_vmclear(cpu);
+
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
 	if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
 		tmp.base = vmcs_readl(sf->base);
 		tmp.selector = vmcs_read16(sf->selector);
+		tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
 		tmp.s = 1;
 	}
 	vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	 * unrestricted guest like Westmere to older host that don't have
 	 * unrestricted guest like Nehelem.
 	 */
-	if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+	if (vmx->rmode.vm86_active) {
 		switch (seg) {
 		case VCPU_SREG_CS:
 			vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
 	set_cr4_guest_host_mask(vmx);
 
-	kvm_write_tsc(&vmx->vcpu, 0);
-
 	return 0;
 }
 
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	u64 msr;
 	int ret;
 
-	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-
 	vmx->rmode.vm86_active = 0;
 
 	vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		msr |= MSR_IA32_APICBASE_BSP;
 	kvm_set_apic_base(&vmx->vcpu, msr);
 
-	ret = fx_init(&vmx->vcpu);
-	if (ret != 0)
-		goto out;
-
 	vmx_segment_cache_clear(vmx);
 
 	seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		kvm_rip_write(vcpu, 0xfff0);
 	else
 		kvm_rip_write(vcpu, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
 	vmcs_writel(GUEST_GDTR_BASE, 0);
 	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	/* HACK: Don't enable emulation on guest boot/reset */
 	vmx->emulation_required = 0;
 
-out:
 	return ret;
 }
 
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	if (is_machine_check(intr_info))
 		return handle_machine_check(vcpu);
 
-	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-	    !is_page_fault(intr_info)) {
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-		vcpu->run->internal.ndata = 2;
-		vcpu->run->internal.data[0] = vect_info;
-		vcpu->run->internal.data[1] = intr_info;
-		return 0;
-	}
-
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	error_code = 0;
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+	/*
+	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
+	 * MMIO, it is better to report an internal error.
+	 * See the comments in vmx_handle_exit.
+	 */
+	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+		vcpu->run->internal.ndata = 2;
+		vcpu->run->internal.data[0] = vect_info;
+		vcpu->run->internal.data[1] = intr_info;
+		return 0;
+	}
+
 	if (is_page_fault(intr_info)) {
 		/* EPT won't cause page fault directly */
 		BUG_ON(enable_ept);
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu)
 {
+	struct msr_data msr;
 	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
 	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
 		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
-	if (vmx_set_msr(vcpu, ecx, data) != 0) {
+	msr.data = data;
+	msr.index = ecx;
+	msr.host_initiated = false;
+	if (vmx_set_msr(vcpu, &msr) != 0) {
 		trace_kvm_msr_write_ex(ecx, data);
 		kvm_inject_gp(vcpu, 0);
 		return 1;
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
-	if (exit_qualification & (1 << 6)) {
-		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
-		return -EINVAL;
-	}
-
 	gla_validity = (exit_qualification >> 7) & 0x3;
 	if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
 		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
+	/*
+	 * Note:
+	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
+	 * delivery event since it indicates guest is accessing MMIO.
+	 * The vm-exit can be triggered again after return to guest that
+	 * will cause infinite loop.
+	 */
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
 			exit_reason != EXIT_REASON_EPT_VIOLATION &&
-			exit_reason != EXIT_REASON_TASK_SWITCH))
-		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
-		       "(0x%x) and exit reason is 0x%x\n",
-		       __func__, vectoring_info, exit_reason);
+			exit_reason != EXIT_REASON_TASK_SWITCH)) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+		vcpu->run->internal.ndata = 2;
+		vcpu->run->internal.data[0] = vectoring_info;
+		vcpu->run->internal.data[1] = exit_reason;
+		return 0;
+	}
 
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
 	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
 	.set_tsc_khz = vmx_set_tsc_khz,
+	.read_tsc_offset = vmx_read_tsc_offset,
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
 	.compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
 	if (r)
 		goto out3;
 
+#ifdef CONFIG_KEXEC
+	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+			   crash_vmclear_local_loaded_vmcss);
+#endif
+
 	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
 	free_page((unsigned long)vmx_io_bitmap_b);
 	free_page((unsigned long)vmx_io_bitmap_a);
 
+#ifdef CONFIG_KEXEC
+	rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
+	synchronize_rcu();
+#endif
+
 	kvm_exit();
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f7641756be..76f54461f7c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,8 @@
 #include <linux/uaccess.h>
 #include <linux/hash.h>
 #include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 u64 __read_mostly host_xcr0;
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
@@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	}
 
 	if (is_long_mode(vcpu)) {
-		if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
 			if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
 				return 1;
 		} else
@@ -827,6 +831,7 @@ static u32 msrs_to_save[] = {
 static unsigned num_msrs_to_save;
 
 static const u32 emulated_msrs[] = {
+	MSR_IA32_TSC_ADJUST,
 	MSR_IA32_TSCDEADLINE,
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
@@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
-	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+	return kvm_x86_ops->set_msr(vcpu, msr);
 }
 
 /*
@@ -896,9 +901,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
  */
 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 {
-	return kvm_set_msr(vcpu, index, *data);
+	struct msr_data msr;
+
+	msr.data = *data;
+	msr.index = index;
+	msr.host_initiated = true;
+	return kvm_set_msr(vcpu, &msr);
 }
 
+#ifdef CONFIG_X86_64
+struct pvclock_gtod_data {
+	seqcount_t	seq;
+
+	struct { /* extract of a clocksource struct */
+		int vclock_mode;
+		cycle_t	cycle_last;
+		cycle_t	mask;
+		u32	mult;
+		u32	shift;
+	} clock;
+
+	/* open coded 'struct timespec' */
+	u64		monotonic_time_snsec;
+	time_t		monotonic_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+
+	write_seqcount_begin(&vdata->seq);
+
+	/* copy pvclock gtod data */
+	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
+	vdata->clock.cycle_last		= tk->clock->cycle_last;
+	vdata->clock.mask		= tk->clock->mask;
+	vdata->clock.mult		= tk->mult;
+	vdata->clock.shift		= tk->shift;
+
+	vdata->monotonic_time_sec	= tk->xtime_sec
+					+ tk->wall_to_monotonic.tv_sec;
+	vdata->monotonic_time_snsec	= tk->xtime_nsec
+					+ (tk->wall_to_monotonic.tv_nsec
+						<< tk->shift);
+	while (vdata->monotonic_time_snsec >=
+					(((u64)NSEC_PER_SEC) << tk->shift)) {
+		vdata->monotonic_time_snsec -=
+					((u64)NSEC_PER_SEC) << tk->shift;
+		vdata->monotonic_time_sec++;
+	}
+
+	write_seqcount_end(&vdata->seq);
+}
+#endif
+
+
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
 	int version;
@@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void)
 	return timespec_to_ns(&ts);
 }
 
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
 
@@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 	return tsc;
 }
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+	bool vcpus_matched;
+	bool do_request = false;
+	struct kvm_arch *ka = &vcpu->kvm->arch;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+			 atomic_read(&vcpu->kvm->online_vcpus));
+
+	if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
+		if (!ka->use_master_clock)
+			do_request = 1;
+
+	if (!vcpus_matched && ka->use_master_clock)
+			do_request = 1;
+
+	if (do_request)
+		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+			    atomic_read(&vcpu->kvm->online_vcpus),
+		            ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
+{
+	u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
 	s64 usdiff;
+	bool matched;
+	u64 data = msr->data;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
+		matched = true;
 	} else {
 		/*
 		 * We split periods of matched TSC writes into generations.
@@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 		kvm->arch.cur_tsc_nsec = ns;
 		kvm->arch.cur_tsc_write = data;
 		kvm->arch.cur_tsc_offset = offset;
+		matched = false;
 		pr_debug("kvm: new tsc generation %u, clock %llu\n",
 			 kvm->arch.cur_tsc_generation, data);
 	}
@@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
 	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
 
+	if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
+		update_ia32_tsc_adjust_msr(vcpu, offset);
 	kvm_x86_ops->write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+	if (matched)
+		kvm->arch.nr_vcpus_matched_tsc++;
+	else
+		kvm->arch.nr_vcpus_matched_tsc = 0;
+
+	kvm_track_tsc_matching(vcpu);
+	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
 }
 
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret;
+	u64 last;
+
+	/*
+	 * Empirically, a fence (of type that depends on the CPU)
+	 * before rdtsc is enough to ensure that rdtsc is ordered
+	 * with respect to loads.  The various CPU manuals are unclear
+	 * as to whether rdtsc can be reordered with later loads,
+	 * but no one has ever seen it happen.
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+
+	last = pvclock_gtod_data.clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a funciton of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead.  I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+	long v;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	*cycle_now = read_tsc();
+
+	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+	return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	ts->tv_nsec = 0;
+	do {
+		seq = read_seqcount_begin(&gtod->seq);
+		mode = gtod->clock.vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_sec;
+		ns = gtod->monotonic_time_snsec;
+		ns += vgettsc(cycle_now);
+		ns >>= gtod->clock.shift;
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+	timespec_add_ns(ts, ns);
+
+	return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+	struct timespec ts;
+
+	/* checked again under seqlock below */
+	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+		return false;
+
+	if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+		return false;
+
+	monotonic_to_bootbased(&ts);
+	*kernel_ns = timespec_to_ns(&ts);
+
+	return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUS, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ * 		VCPU0 on CPU0		|	VCPU1 on CPU1
+ *
+ * 1.  read timespec0,tsc0
+ * 2.					| timespec1 = timespec0 + N
+ * 					| tsc1 = tsc0 + M
+ * 3. transition to guest		| transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
+ * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ * 	- ret0 < ret1
+ *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ *		...
+ *	- 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller then the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	struct kvm_arch *ka = &kvm->arch;
+	int vclock_mode;
+	bool host_tsc_clocksource, vcpus_matched;
+
+	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+			atomic_read(&kvm->online_vcpus));
+
+	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	host_tsc_clocksource = kvm_get_time_and_clockread(
+					&ka->master_kernel_ns,
+					&ka->master_cycle_now);
+
+	ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
+
+	if (ka->use_master_clock)
+		atomic_set(&kvm_guest_has_master_clock, 1);
+
+	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+					vcpus_matched);
+#endif
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags;
+	unsigned long flags, this_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct kvm_arch *ka = &v->kvm->arch;
 	void *shared_kaddr;
-	unsigned long this_tsc_khz;
 	s64 kernel_ns, max_kernel_ns;
-	u64 tsc_timestamp;
+	u64 tsc_timestamp, host_tsc;
+	struct pvclock_vcpu_time_info *guest_hv_clock;
 	u8 pvclock_flags;
+	bool use_master_clock;
+
+	kernel_ns = 0;
+	host_tsc = 0;
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
-	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
@@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 
 	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	use_master_clock = ka->use_master_clock;
+	if (use_master_clock) {
+		host_tsc = ka->master_cycle_now;
+		kernel_ns = ka->master_kernel_ns;
+	}
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	if (!use_master_clock) {
+		host_tsc = native_read_tsc();
+		kernel_ns = get_kernel_ns();
+	}
+
+	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
+	/*
 	 * We may have to catch up the TSC to match elapsed wall clock
 	 * time for two reasons, even if kvmclock is used.
 	 *   1) CPU could have been running below the maximum TSC rate
@@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	if (max_kernel_ns > kernel_ns)
-		kernel_ns = max_kernel_ns;
-
+	/* with a master <monotonic time, tsc value> tuple,
+	 * pvclock clock reads always increase at the (scaled) rate
+	 * of guest TSC - no need to deal with sampling errors.
+	 */
+	if (!use_master_clock) {
+		if (max_kernel_ns > kernel_ns)
+			kernel_ns = max_kernel_ns;
+	}
 	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_kernel_ns = kernel_ns;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	pvclock_flags = 0;
-	if (vcpu->pvclock_set_guest_stopped_request) {
-		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-		vcpu->pvclock_set_guest_stopped_request = false;
-	}
-
-	vcpu->hv_clock.flags = pvclock_flags;
-
 	/*
 	 * The interface expects us to write an even number signaling that the
 	 * update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	shared_kaddr = kmap_atomic(vcpu->time_page);
 
+	guest_hv_clock = shared_kaddr + vcpu->time_offset;
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+
+	if (vcpu->pvclock_set_guest_stopped_request) {
+		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+		vcpu->pvclock_set_guest_stopped_request = false;
+	}
+
+	/* If the host uses TSC clocksource, then it is stable */
+	if (use_master_clock)
+		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+	vcpu->hv_clock.flags = pvclock_flags;
+
 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 	       sizeof(vcpu->hv_clock));
 
@@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 }
 
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	bool pr = false;
+	u32 msr = msr_info->index;
+	u64 data = msr_info->data;
 
 	switch (msr) {
 	case MSR_EFER:
@@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_IA32_TSCDEADLINE:
 		kvm_set_lapic_tscdeadline_msr(vcpu, data);
 		break;
+	case MSR_IA32_TSC_ADJUST:
+		if (guest_cpuid_has_tsc_adjust(vcpu)) {
+			if (!msr_info->host_initiated) {
+				u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+				kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
+			}
+			vcpu->arch.ia32_tsc_adjust_msr = data;
+		}
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
@@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_TSCDEADLINE:
 		data = kvm_get_lapic_tscdeadline_msr(vcpu);
 		break;
+	case MSR_IA32_TSC_ADJUST:
+		data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->arch.ia32_misc_enable_msr;
 		break;
@@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			kvm_x86_ops->write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 		}
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		/*
+		 * On a host with synchronized TSC, there is no need to update
+		 * kvmclock on vcpu->cpu migration
+		 */
+		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 		if (vcpu->cpu != cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
@@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (!vcpu->arch.apic)
 			goto out;
 		u.lapic = memdup_user(argp, sizeof(*u.lapic));
-		if (IS_ERR(u.lapic)) {
-			r = PTR_ERR(u.lapic);
-			goto out;
-		}
+		if (IS_ERR(u.lapic))
+			return PTR_ERR(u.lapic);
 
 		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_INTERRUPT: {
@@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (copy_from_user(&irq, argp, sizeof irq))
 			goto out;
 		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_NMI: {
 		r = kvm_vcpu_ioctl_nmi(vcpu);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_SET_CPUID: {
@@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 			goto out;
 		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_SET_CPUID2: {
@@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			goto out;
 		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
 					      cpuid_arg->entries);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_GET_CPUID2: {
@@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 	case KVM_SET_XSAVE: {
 		u.xsave = memdup_user(argp, sizeof(*u.xsave));
-		if (IS_ERR(u.xsave)) {
-			r = PTR_ERR(u.xsave);
-			goto out;
-		}
+		if (IS_ERR(u.xsave))
+			return PTR_ERR(u.xsave);
 
 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
@@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 	case KVM_SET_XCRS: {
 		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
-		if (IS_ERR(u.xcrs)) {
-			r = PTR_ERR(u.xcrs);
-			goto out;
-		}
+		if (IS_ERR(u.xcrs))
+			return PTR_ERR(u.xcrs);
 
 		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
 		break;
@@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 	int ret;
 
 	if (addr > (unsigned int)(-3 * PAGE_SIZE))
-		return -1;
+		return -EINVAL;
 	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
 	return ret;
 }
@@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	switch (ioctl) {
 	case KVM_SET_TSS_ADDR:
 		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
-		if (r < 0)
-			goto out;
 		break;
 	case KVM_SET_IDENTITY_MAP_ADDR: {
 		u64 ident_addr;
@@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
 			goto out;
 		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
-		if (r < 0)
-			goto out;
 		break;
 	}
 	case KVM_SET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
-		if (r)
-			goto out;
 		break;
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 	get_irqchip_out:
 		kfree(chip);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_SET_IRQCHIP: {
@@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 	set_irqchip_out:
 		kfree(chip);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_GET_PIT: {
@@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (!kvm->arch.vpit)
 			goto out;
 		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_GET_PIT2: {
@@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (!kvm->arch.vpit)
 			goto out;
 		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (copy_from_user(&control, argp, sizeof(control)))
 			goto out;
 		r = kvm_vm_ioctl_reinject(kvm, &control);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_XEN_HVM_CONFIG: {
@@ -4273,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 			    u32 msr_index, u64 data)
 {
-	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+	struct msr_data msr;
+
+	msr.data = data;
+	msr.index = msr_index;
+	msr.host_initiated = false;
+	return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4495,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	 * instruction -> ...
 	 */
 	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-	if (!is_error_pfn(pfn)) {
+	if (!is_error_noslot_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return true;
 	}
@@ -4881,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask);
 }
 
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+	struct kvm *kvm;
+
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+	atomic_set(&kvm_guest_has_master_clock, 0);
+	raw_spin_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+			       void *priv)
+{
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+	struct timekeeper *tk = priv;
+
+	update_pvclock_gtod(tk);
+
+	/* disable master clock if host does not trust, or does not
+	 * use, TSC clocksource
+	 */
+	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+	    atomic_read(&kvm_guest_has_master_clock) != 0)
+		queue_work(system_long_wq, &pvclock_gtod_work);
+
+	return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+	.notifier_call = pvclock_gtod_notify,
+};
+#endif
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -4922,6 +5252,10 @@ int kvm_arch_init(void *opaque)
 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
 	kvm_lapic_init();
+#ifdef CONFIG_X86_64
+	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+#endif
+
 	return 0;
 
 out:
@@ -4936,6 +5270,9 @@ void kvm_arch_exit(void)
 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 					    CPUFREQ_TRANSITION_NOTIFIER);
 	unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+#ifdef CONFIG_X86_64
+	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+#endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
 }
@@ -5059,7 +5396,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 	char instruction[3];
@@ -5235,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_arch *ka = &kvm->arch;
+
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	kvm_make_mclock_inprogress_request(kvm);
+	/* no guest entries from this point */
+	pvclock_update_vm_gtod_copy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+	/* guest entries allowed */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5247,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
+		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+			kvm_gen_update_masterclock(vcpu->kvm);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -5362,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+							   native_read_tsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -5419,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 		pr_debug("vcpu %d received sipi with vector # %x\n",
 			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 		kvm_lapic_reset(vcpu);
-		r = kvm_arch_vcpu_reset(vcpu);
+		r = kvm_vcpu_reset(vcpu);
 		if (r)
 			return r;
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6047,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	r = kvm_arch_vcpu_reset(vcpu);
+	r = kvm_vcpu_reset(vcpu);
 	if (r == 0)
 		r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
@@ -6055,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	int r;
+	struct msr_data msr;
+
+	r = vcpu_load(vcpu);
+	if (r)
+		return r;
+	msr.data = 0x0;
+	msr.index = MSR_IA32_TSC;
+	msr.host_initiated = true;
+	kvm_write_tsc(vcpu, &msr);
+	vcpu_put(vcpu);
+
+	return r;
+}
+
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -6069,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6092,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_pmu_reset(vcpu);
 
+	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+	vcpu->arch.regs_avail = ~0;
+	vcpu->arch.regs_dirty = ~0;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -6168,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage)
 			kvm_for_each_vcpu(i, vcpu, kvm) {
 				vcpu->arch.tsc_offset_adjustment += delta_cyc;
 				vcpu->arch.last_host_tsc = local_tsc;
+				set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+					&vcpu->requests);
 			}
 
 			/*
@@ -6258,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
 		goto fail_free_mce_banks;
 
+	r = fx_init(vcpu);
+	if (r)
+		goto fail_free_wbinvd_dirty_mask;
+
+	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
 	return 0;
+fail_free_wbinvd_dirty_mask:
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
@@ -6305,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2b5219c12ac..e224f7a671b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c373421..205ad328aa5 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,7 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/pvclock.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void)
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
 }
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+
+static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+{
+	const struct pvclock_vsyscall_time_info *pvti_base;
+	int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
+	int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
+
+	BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
+
+	pvti_base = (struct pvclock_vsyscall_time_info *)
+		    __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
+
+	return &pvti_base[offset];
+}
+
+static notrace cycle_t vread_pvclock(int *mode)
+{
+	const struct pvclock_vsyscall_time_info *pvti;
+	cycle_t ret;
+	u64 last;
+	u32 version;
+	u32 migrate_count;
+	u8 flags;
+	unsigned cpu, cpu1;
+
+
+	/*
+	 * When looping to get a consistent (time-info, tsc) pair, we
+	 * also need to deal with the possibility we can switch vcpus,
+	 * so make sure we always re-fetch time-info for the current vcpu.
+	 */
+	do {
+		cpu = __getcpu() & VGETCPU_CPU_MASK;
+		/* TODO: We can put vcpu id into higher bits of pvti.version.
+		 * This will save a couple of cycles by getting rid of
+		 * __getcpu() calls (Gleb).
+		 */
+
+		pvti = get_pvti(cpu);
+
+		migrate_count = pvti->migrate_count;
+
+		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
+
+		/*
+		 * Test we're still on the cpu as well as the version.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
+		 */
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version ||
+			  pvti->migrate_count != migrate_count));
+
+	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
+		*mode = VCLOCK_NONE;
+
+	/* refer to tsc.c read_tsc() comment for rationale */
+	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	return last;
+}
+#endif
+
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 }
 
 
-notrace static inline u64 vgetsns(void)
+notrace static inline u64 vgetsns(int *mode)
 {
 	long v;
 	cycles_t cycles;
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void)
 		cycles = vread_tsc();
 	else if (gtod->clock.vclock_mode == VCLOCK_HPET)
 		cycles = vread_hpet();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
 	else
 		return 0;
 	v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->wall_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->monotonic_time_sec;
 		ns = gtod->monotonic_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	timespec_add_ns(ts, ns);
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 5463ad55857..2f94b039e55 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
 	unsigned int p;
 
-	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-	}
+	p = __getcpu();
+
 	if (cpu)
-		*cpu = p & 0xfff;
+		*cpu = p & VGETCPU_CPU_MASK;
 	if (node)
 		*node = p >> 12;
 	return 0;
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index d8e05eeab23..0ecf22b6a38 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -357,6 +357,7 @@ config TRACE_SINK
 config PPC_EPAPR_HV_BYTECHAN
 	tristate "ePAPR hypervisor byte channel driver"
 	depends on PPC
+	select EPAPR_PARAVIRT
 	help
 	  This driver creates /dev entries for each ePAPR hypervisor byte
 	  channel, thereby allowing applications to communicate with byte
diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig
index 2dcdbc9364d..99ebdde590f 100644
--- a/drivers/virt/Kconfig
+++ b/drivers/virt/Kconfig
@@ -15,6 +15,7 @@ if VIRT_DRIVERS
 config FSL_HV_MANAGER
 	tristate "Freescale hypervisor management driver"
 	depends on FSL_SOC
+	select EPAPR_PARAVIRT
 	help
           The Freescale hypervisor management driver provides several services
 	  to drivers and applications related to the Freescale hypervisor:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d5cddd8dcc5..2c497ab0d03 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -47,28 +47,40 @@
 
 /*
  * For the normal pfn, the highest 12 bits should be zero,
- * so we can mask these bits to indicate the error.
+ * so we can mask bit 62 ~ bit 52  to indicate the error pfn,
+ * mask bit 63 to indicate the noslot pfn.
  */
-#define KVM_PFN_ERR_MASK	(0xfffULL << 52)
+#define KVM_PFN_ERR_MASK	(0x7ffULL << 52)
+#define KVM_PFN_ERR_NOSLOT_MASK	(0xfffULL << 52)
+#define KVM_PFN_NOSLOT		(0x1ULL << 63)
 
 #define KVM_PFN_ERR_FAULT	(KVM_PFN_ERR_MASK)
 #define KVM_PFN_ERR_HWPOISON	(KVM_PFN_ERR_MASK + 1)
-#define KVM_PFN_ERR_BAD		(KVM_PFN_ERR_MASK + 2)
-#define KVM_PFN_ERR_RO_FAULT	(KVM_PFN_ERR_MASK + 3)
+#define KVM_PFN_ERR_RO_FAULT	(KVM_PFN_ERR_MASK + 2)
 
+/*
+ * error pfns indicate that the gfn is in slot but faild to
+ * translate it to pfn on host.
+ */
 static inline bool is_error_pfn(pfn_t pfn)
 {
 	return !!(pfn & KVM_PFN_ERR_MASK);
 }
 
-static inline bool is_noslot_pfn(pfn_t pfn)
+/*
+ * error_noslot pfns indicate that the gfn can not be
+ * translated to pfn - it is not in slot or failed to
+ * translate it to pfn.
+ */
+static inline bool is_error_noslot_pfn(pfn_t pfn)
 {
-	return pfn == KVM_PFN_ERR_BAD;
+	return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK);
 }
 
-static inline bool is_invalid_pfn(pfn_t pfn)
+/* noslot pfn indicates that the gfn is not in slot. */
+static inline bool is_noslot_pfn(pfn_t pfn)
 {
-	return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
+	return pfn == KVM_PFN_NOSLOT;
 }
 
 #define KVM_HVA_ERR_BAD		(PAGE_OFFSET)
@@ -107,6 +119,9 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_IMMEDIATE_EXIT    15
 #define KVM_REQ_PMU               16
 #define KVM_REQ_PMI               17
+#define KVM_REQ_WATCHDOG          18
+#define KVM_REQ_MASTERCLOCK_UPDATE 19
+#define KVM_REQ_MCLOCK_INPROGRESS 20
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -516,6 +531,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
+void kvm_make_mclock_inprogress_request(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -569,9 +585,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
 int kvm_arch_hardware_enable(void *garbage);
 void kvm_arch_hardware_disable(void *garbage);
 int kvm_arch_hardware_setup(void);
@@ -666,6 +682,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 				   unsigned long *deliver_bitmask);
 #endif
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
+int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
 		int irq_source_id, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
@@ -838,9 +855,9 @@ extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
-static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
+static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 {
-	if (unlikely(vcpu->kvm->mmu_notifier_count))
+	if (unlikely(kvm->mmu_notifier_count))
 		return 1;
 	/*
 	 * Ensure the read of mmu_notifier_count happens before the read
@@ -853,7 +870,7 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
 	 * can't rely on kvm->mmu_lock to keep things ordered.
 	 */
 	smp_rmb();
-	if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
+	if (kvm->mmu_notifier_seq != mmu_seq)
 		return 1;
 	return 0;
 }
@@ -881,10 +898,20 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 #ifdef CONFIG_HAVE_KVM_EVENTFD
 
 void kvm_eventfd_init(struct kvm *kvm);
+int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
+
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_irqfd_release(struct kvm *kvm);
 void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
-int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
+#else
+static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
+{
+	return -EINVAL;
+}
+
+static inline void kvm_irqfd_release(struct kvm *kvm) {}
+#endif
 
 #else
 
diff --git a/include/linux/pvclock_gtod.h b/include/linux/pvclock_gtod.h
new file mode 100644
index 00000000000..0ca75825b60
--- /dev/null
+++ b/include/linux/pvclock_gtod.h
@@ -0,0 +1,9 @@
+#ifndef _PVCLOCK_GTOD_H
+#define _PVCLOCK_GTOD_H
+
+#include <linux/notifier.h>
+
+extern int pvclock_gtod_register_notifier(struct notifier_block *nb);
+extern int pvclock_gtod_unregister_notifier(struct notifier_block *nb);
+
+#endif /* _PVCLOCK_GTOD_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 651b51a3671..2c2f3072bee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -107,6 +107,14 @@ extern unsigned long this_cpu_load(void);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
+/* Notifier for when a task gets migrated to a new CPU */
+struct task_migration_notifier {
+	struct task_struct *task;
+	int from_cpu;
+	int to_cpu;
+};
+extern void register_task_migration_notifier(struct notifier_block *n);
+
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 0a6d6ba44c8..e6e5d4b1370 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -167,10 +167,15 @@ struct kvm_pit_config {
 #define KVM_EXIT_OSI              18
 #define KVM_EXIT_PAPR_HCALL	  19
 #define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WATCHDOG         21
 
 /* For KVM_EXIT_INTERNAL_ERROR */
-#define KVM_INTERNAL_ERROR_EMULATION 1
-#define KVM_INTERNAL_ERROR_SIMUL_EX 2
+/* Emulate instruction failed. */
+#define KVM_INTERNAL_ERROR_EMULATION	1
+/* Encounter unexpected simultaneous exceptions. */
+#define KVM_INTERNAL_ERROR_SIMUL_EX	2
+/* Encounter unexpected vm-exit due to delivery event. */
+#define KVM_INTERNAL_ERROR_DELIVERY_EV	3
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
@@ -477,6 +482,8 @@ struct kvm_ppc_smmu_info {
 	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
 };
 
+#define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
+
 #define KVMIO 0xAE
 
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
@@ -626,6 +633,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_READONLY_MEM 81
 #endif
 #define KVM_CAP_IRQFD_RESAMPLE 82
+#define KVM_CAP_PPC_BOOKE_WATCHDOG 83
+#define KVM_CAP_PPC_HTAB_FD 84
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -848,6 +857,11 @@ struct kvm_s390_ucas_mapping {
 #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
 /* Available with KVM_CAP_PPC_ALLOC_HTAB */
 #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
+#define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+/* Available with KVM_CAP_RMA */
+#define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
+/* Available with KVM_CAP_PPC_HTAB_FD */
+#define KVM_PPC_GET_HTAB_FD	  _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
 
 /*
  * ioctls for vcpu fds
@@ -911,9 +925,6 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
-#define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
-/* Available with KVM_CAP_RMA */
-#define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 /* Available with KVM_CAP_SW_TLB */
 #define KVM_DIRTY_TLB		  _IOW(KVMIO,  0xaa, struct kvm_dirty_tlb)
 /* Available with KVM_CAP_ONE_REG */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6271b89f87a..0533496b622 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -923,6 +923,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq->skip_clock_update = 1;
 }
 
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -953,10 +960,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
+		struct task_migration_notifier tmn;
+
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+
+		tmn.task = p;
+		tmn.from_cpu = task_cpu(p);
+		tmn.to_cpu = new_cpu;
+
+		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}
 
 	__set_task_cpu(p, new_cpu);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4c7de02eacd..cbc6acb0db3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -21,6 +21,7 @@
 #include <linux/time.h>
 #include <linux/tick.h>
 #include <linux/stop_machine.h>
+#include <linux/pvclock_gtod.h>
 
 
 static struct timekeeper timekeeper;
@@ -174,6 +175,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 	return nsec + arch_gettimeoffset();
 }
 
+static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+	raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
+}
+
+/**
+ * pvclock_gtod_register_notifier - register a pvclock timedata update listener
+ *
+ * Must hold write on timekeeper.lock
+ */
+int pvclock_gtod_register_notifier(struct notifier_block *nb)
+{
+	struct timekeeper *tk = &timekeeper;
+	unsigned long flags;
+	int ret;
+
+	write_seqlock_irqsave(&tk->lock, flags);
+	ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
+	/* update timekeeping data */
+	update_pvclock_gtod(tk);
+	write_sequnlock_irqrestore(&tk->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
+
+/**
+ * pvclock_gtod_unregister_notifier - unregister a pvclock
+ * timedata update listener
+ *
+ * Must hold write on timekeeper.lock
+ */
+int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
+{
+	struct timekeeper *tk = &timekeeper;
+	unsigned long flags;
+	int ret;
+
+	write_seqlock_irqsave(&tk->lock, flags);
+	ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
+	write_sequnlock_irqrestore(&tk->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
+
 /* must hold write on timekeeper.lock */
 static void timekeeping_update(struct timekeeper *tk, bool clearntp)
 {
@@ -182,6 +231,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
 		ntp_clear();
 	}
 	update_vsyscall(tk);
+	update_pvclock_gtod(tk);
 }
 
 /**
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 23a41a9f8db..3642239252b 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -105,6 +105,15 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
 }
 
 #ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
+				       assigned_dev->irq_source_id,
+				       assigned_dev->guest_irq, 1);
+	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
+}
+
 static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -117,6 +126,23 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
 #endif
 
 #ifdef __KVM_HAVE_MSIX
+static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	int index = find_index_from_host_irq(assigned_dev, irq);
+	u32 vector;
+	int ret = 0;
+
+	if (index >= 0) {
+		vector = assigned_dev->guest_msix_entries[index].vector;
+		ret = kvm_set_irq_inatomic(assigned_dev->kvm,
+					   assigned_dev->irq_source_id,
+					   vector, 1);
+	}
+
+	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
+}
+
 static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -334,11 +360,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
 }
 
 #ifdef __KVM_HAVE_MSI
-static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
-{
-	return IRQ_WAKE_THREAD;
-}
-
 static int assigned_device_enable_host_msi(struct kvm *kvm,
 					   struct kvm_assigned_dev_kernel *dev)
 {
@@ -363,11 +384,6 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 #endif
 
 #ifdef __KVM_HAVE_MSIX
-static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
-{
-	return IRQ_WAKE_THREAD;
-}
-
 static int assigned_device_enable_host_msix(struct kvm *kvm,
 					    struct kvm_assigned_dev_kernel *dev)
 {
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 9718e98d6d2..b6eea5cc7b3 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -35,6 +35,7 @@
 
 #include "iodev.h"
 
+#ifdef __KVM_HAVE_IOAPIC
 /*
  * --------------------------------------------------------------------
  * irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -332,7 +333,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 		mutex_lock(&kvm->irqfds.resampler_lock);
 
 		list_for_each_entry(resampler,
-				    &kvm->irqfds.resampler_list, list) {
+				    &kvm->irqfds.resampler_list, link) {
 			if (resampler->notifier.gsi == irqfd->gsi) {
 				irqfd->resampler = resampler;
 				break;
@@ -425,17 +426,21 @@ fail:
 	kfree(irqfd);
 	return ret;
 }
+#endif
 
 void
 kvm_eventfd_init(struct kvm *kvm)
 {
+#ifdef __KVM_HAVE_IOAPIC
 	spin_lock_init(&kvm->irqfds.lock);
 	INIT_LIST_HEAD(&kvm->irqfds.items);
 	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
 	mutex_init(&kvm->irqfds.resampler_lock);
+#endif
 	INIT_LIST_HEAD(&kvm->ioeventfds);
 }
 
+#ifdef __KVM_HAVE_IOAPIC
 /*
  * shutdown any irqfd's that match fd+gsi
  */
@@ -555,6 +560,7 @@ static void __exit irqfd_module_exit(void)
 
 module_init(irqfd_module_init);
 module_exit(irqfd_module_exit);
+#endif
 
 /*
  * --------------------------------------------------------------------
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 037cb6730e6..4a340cb2301 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -52,7 +52,7 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
 	end_gfn = gfn + (size >> PAGE_SHIFT);
 	gfn    += 1;
 
-	if (is_error_pfn(pfn))
+	if (is_error_noslot_pfn(pfn))
 		return pfn;
 
 	while (gfn < end_gfn)
@@ -106,7 +106,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 		 * important because we unmap and unpin in 4kb steps later.
 		 */
 		pfn = kvm_pin_pages(slot, gfn, page_size);
-		if (is_error_pfn(pfn)) {
+		if (is_error_noslot_pfn(pfn)) {
 			gfn += 1;
 			continue;
 		}
@@ -168,11 +168,7 @@ int kvm_assign_device(struct kvm *kvm,
 
 	r = iommu_attach_device(domain, &pdev->dev);
 	if (r) {
-		printk(KERN_ERR "assign device %x:%x:%x.%x failed",
-			pci_domain_nr(pdev->bus),
-			pdev->bus->number,
-			PCI_SLOT(pdev->devfn),
-			PCI_FUNC(pdev->devfn));
+		dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
 		return r;
 	}
 
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 2eb58af7ee9..656fa455e15 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -102,6 +102,23 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	return r;
 }
 
+static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+				   struct kvm_lapic_irq *irq)
+{
+	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+
+	irq->dest_id = (e->msi.address_lo &
+			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+	irq->vector = (e->msi.data &
+			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+	irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+	irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+	irq->delivery_mode = e->msi.data & 0x700;
+	irq->level = 1;
+	irq->shorthand = 0;
+	/* TODO Deal with RH bit of MSI message address */
+}
+
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		struct kvm *kvm, int irq_source_id, int level)
 {
@@ -110,22 +127,26 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 	if (!level)
 		return -1;
 
-	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+	kvm_set_msi_irq(e, &irq);
 
-	irq.dest_id = (e->msi.address_lo &
-			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
-	irq.vector = (e->msi.data &
-			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
-	irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
-	irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
-	irq.delivery_mode = e->msi.data & 0x700;
-	irq.level = 1;
-	irq.shorthand = 0;
-
-	/* TODO Deal with RH bit of MSI message address */
 	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
 }
 
+
+static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
+			 struct kvm *kvm)
+{
+	struct kvm_lapic_irq irq;
+	int r;
+
+	kvm_set_msi_irq(e, &irq);
+
+	if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r))
+		return r;
+	else
+		return -EWOULDBLOCK;
+}
+
 int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
 {
 	struct kvm_kernel_irq_routing_entry route;
@@ -178,6 +199,44 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
 	return ret;
 }
 
+/*
+ * Deliver an IRQ in an atomic context if we can, or return a failure,
+ * user can retry in a process context.
+ * Return value:
+ *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
+ *  Other values - No need to retry.
+ */
+int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
+{
+	struct kvm_kernel_irq_routing_entry *e;
+	int ret = -EINVAL;
+	struct kvm_irq_routing_table *irq_rt;
+	struct hlist_node *n;
+
+	trace_kvm_set_irq(irq, level, irq_source_id);
+
+	/*
+	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
+	 * which would need to be retried from thread context;  when same GSI
+	 * is connected to both PIC and IOAPIC, we'd have to report a
+	 * partial failure here.
+	 * Since there's no easy way to do this, we only support injecting MSI
+	 * which is limited to 1:1 GSI mapping.
+	 */
+	rcu_read_lock();
+	irq_rt = rcu_dereference(kvm->irq_routing);
+	if (irq < irq_rt->nr_rt_entries)
+		hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
+			if (likely(e->type == KVM_IRQ_ROUTING_MSI))
+				ret = kvm_set_msi_inatomic(e, kvm);
+			else
+				ret = -EWOULDBLOCK;
+			break;
+		}
+	rcu_read_unlock();
+	return ret;
+}
+
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
 	struct kvm_irq_ack_notifier *kian;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index be70035fd42..1cd693a76a5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -212,6 +212,11 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
 	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
+void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
 	struct page *page;
@@ -709,8 +714,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	int r;
 	gfn_t base_gfn;
 	unsigned long npages;
-	unsigned long i;
-	struct kvm_memory_slot *memslot;
+	struct kvm_memory_slot *memslot, *slot;
 	struct kvm_memory_slot old, new;
 	struct kvm_memslots *slots, *old_memslots;
 
@@ -761,13 +765,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	/* Check for overlaps */
 	r = -EEXIST;
-	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-		struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
-
-		if (s == memslot || !s->npages)
+	kvm_for_each_memslot(slot, kvm->memslots) {
+		if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot)
 			continue;
-		if (!((base_gfn + npages <= s->base_gfn) ||
-		      (base_gfn >= s->base_gfn + s->npages)))
+		if (!((base_gfn + npages <= slot->base_gfn) ||
+		      (base_gfn >= slot->base_gfn + slot->npages)))
 			goto out_free;
 	}
 
@@ -1208,7 +1210,7 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
 		return KVM_PFN_ERR_RO_FAULT;
 
 	if (kvm_is_error_hva(addr))
-		return KVM_PFN_ERR_BAD;
+		return KVM_PFN_NOSLOT;
 
 	/* Do not map writable pfn in the readonly memslot. */
 	if (writable && memslot_is_readonly(slot)) {
@@ -1290,7 +1292,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
 static struct page *kvm_pfn_to_page(pfn_t pfn)
 {
-	if (is_error_pfn(pfn))
+	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;
 
 	if (kvm_is_mmio_pfn(pfn)) {
@@ -1322,7 +1324,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -1848,6 +1850,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	atomic_inc(&kvm->online_vcpus);
 
 	mutex_unlock(&kvm->lock);
+	kvm_arch_vcpu_postcreate(vcpu);
 	return r;
 
 unlock_vcpu_destroy:
@@ -1929,10 +1932,6 @@ out_free1:
 			goto out;
 		}
 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
-		if (r)
-			goto out_free2;
-		r = 0;
-out_free2:
 		kfree(kvm_regs);
 		break;
 	}
@@ -1954,12 +1953,10 @@ out_free2:
 		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
 		if (IS_ERR(kvm_sregs)) {
 			r = PTR_ERR(kvm_sregs);
+			kvm_sregs = NULL;
 			goto out;
 		}
 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_GET_MP_STATE: {
@@ -1981,9 +1978,6 @@ out_free2:
 		if (copy_from_user(&mp_state, argp, sizeof mp_state))
 			goto out;
 		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_TRANSLATE: {
@@ -2008,9 +2002,6 @@ out_free2:
 		if (copy_from_user(&dbg, argp, sizeof dbg))
 			goto out;
 		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_SET_SIGNAL_MASK: {
@@ -2054,12 +2045,10 @@ out_free2:
 		fpu = memdup_user(argp, sizeof(*fpu));
 		if (IS_ERR(fpu)) {
 			r = PTR_ERR(fpu);
+			fpu = NULL;
 			goto out;
 		}
 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	default:
@@ -2129,8 +2118,6 @@ static long kvm_vm_ioctl(struct file *filp,
 	switch (ioctl) {
 	case KVM_CREATE_VCPU:
 		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
-		if (r < 0)
-			goto out;
 		break;
 	case KVM_SET_USER_MEMORY_REGION: {
 		struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -2141,8 +2128,6 @@ static long kvm_vm_ioctl(struct file *filp,
 			goto out;
 
 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_GET_DIRTY_LOG: {
@@ -2152,8 +2137,6 @@ static long kvm_vm_ioctl(struct file *filp,
 		if (copy_from_user(&log, argp, sizeof log))
 			goto out;
 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
-		if (r)
-			goto out;
 		break;
 	}
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2163,9 +2146,6 @@ static long kvm_vm_ioctl(struct file *filp,
 		if (copy_from_user(&zone, argp, sizeof zone))
 			goto out;
 		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_UNREGISTER_COALESCED_MMIO: {
@@ -2174,9 +2154,6 @@ static long kvm_vm_ioctl(struct file *filp,
 		if (copy_from_user(&zone, argp, sizeof zone))
 			goto out;
 		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 #endif
@@ -2285,8 +2262,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,
 		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
 
 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
-		if (r)
-			goto out;
 		break;
 	}
 	default: