43 files changed, 2734 insertions, 1814 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754f303..4d4f96a2763 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,29 +14,33 @@ $(obj)/inat.o: $(obj)/inat-tables.c
 
 clean-files := inat-tables.c
 
-obj-$(CONFIG_SMP) += msr-smp.o
+obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
 
-lib-y := delay.o
+lib-y := delay.o misc.o cmdline.o
 lib-y += thunk_$(BITS).o
-lib-y += usercopy_$(BITS).o getuser.o putuser.o
+lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
-lib-$(CONFIG_KPROBES) += insn.o inat.o
+lib-$(CONFIG_SMP) += rwlock.o
+lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
+        lib-y += atomic64_cx8_32.o
         lib-y += checksum_32.o
         lib-y += strstr_32.o
-        lib-y += semaphore_32.o string_32.o
+        lib-y += string_32.o
 ifneq ($(CONFIG_X86_CMPXCHG64),y)
-        lib-y += cmpxchg8b_emu.o
+        lib-y += cmpxchg8b_emu.o atomic64_386_32.o
 endif
         lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
 else
-        obj-y += io_64.o iomap_copy_64.o
+        obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += thunk_64.o clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
-        lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+        lib-y += copy_user_64.o copy_user_nocache_64.o
+	lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 824fa0be55a..a0b4a350daa 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -1,230 +1,4 @@
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/types.h>
+#define ATOMIC64_EXPORT EXPORT_SYMBOL
 
-#include <asm/processor.h>
-#include <asm/cmpxchg.h>
-#include <asm/atomic.h>
-
-static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
-{
-	u32 low = new;
-	u32 high = new >> 32;
-
-	asm volatile(
-		LOCK_PREFIX "cmpxchg8b %1\n"
-		     : "+A" (old), "+m" (*ptr)
-		     :  "b" (low),  "c" (high)
-		     );
-	return old;
-}
-
-u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
-{
-	return cmpxchg8b(&ptr->counter, old_val, new_val);
-}
-EXPORT_SYMBOL(atomic64_cmpxchg);
-
-/**
- * atomic64_xchg - xchg atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
- *
- * Atomically xchgs the value of @ptr to @new_val and returns
- * the old value.
- */
-u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
-{
-	/*
-	 * Try first with a (possibly incorrect) assumption about
-	 * what we have there. We'll do two loops most likely,
-	 * but we'll get an ownership MESI transaction straight away
-	 * instead of a read transaction followed by a
-	 * flush-for-ownership transaction:
-	 */
-	u64 old_val, real_val = 0;
-
-	do {
-		old_val = real_val;
-
-		real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
-	} while (real_val != old_val);
-
-	return old_val;
-}
-EXPORT_SYMBOL(atomic64_xchg);
-
-/**
- * atomic64_set - set atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
- *
- * Atomically sets the value of @ptr to @new_val.
- */
-void atomic64_set(atomic64_t *ptr, u64 new_val)
-{
-	atomic64_xchg(ptr, new_val);
-}
-EXPORT_SYMBOL(atomic64_set);
-
-/**
-EXPORT_SYMBOL(atomic64_read);
- * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
- */
-noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
-{
-	/*
-	 * Try first with a (possibly incorrect) assumption about
-	 * what we have there. We'll do two loops most likely,
-	 * but we'll get an ownership MESI transaction straight away
-	 * instead of a read transaction followed by a
-	 * flush-for-ownership transaction:
-	 */
-	u64 old_val, new_val, real_val = 0;
-
-	do {
-		old_val = real_val;
-		new_val = old_val + delta;
-
-		real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
-	} while (real_val != old_val);
-
-	return new_val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
-
-u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
-{
-	return atomic64_add_return(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub_return);
-
-u64 atomic64_inc_return(atomic64_t *ptr)
-{
-	return atomic64_add_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_return);
-
-u64 atomic64_dec_return(atomic64_t *ptr)
-{
-	return atomic64_sub_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_return);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-void atomic64_add(u64 delta, atomic64_t *ptr)
-{
-	atomic64_add_return(delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_add);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-void atomic64_sub(u64 delta, atomic64_t *ptr)
-{
-	atomic64_add(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
-{
-	u64 new_val = atomic64_sub_return(delta, ptr);
-
-	return new_val == 0;
-}
-EXPORT_SYMBOL(atomic64_sub_and_test);
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-void atomic64_inc(atomic64_t *ptr)
-{
-	atomic64_add(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc);
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-void atomic64_dec(atomic64_t *ptr)
-{
-	atomic64_sub(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec);
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-int atomic64_dec_and_test(atomic64_t *ptr)
-{
-	return atomic64_sub_and_test(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_and_test);
-
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_inc_and_test(atomic64_t *ptr)
-{
-	return atomic64_sub_and_test(-1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_and_test);
-
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-int atomic64_add_negative(u64 delta, atomic64_t *ptr)
-{
-	s64 new_val = atomic64_add_return(delta, ptr);
-
-	return new_val < 0;
-}
-EXPORT_SYMBOL(atomic64_add_negative);
+#include <linux/export.h>
+#include <linux/atomic.h>
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 00000000000..00933d5e992
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,194 @@
+/*
+ * atomic64_t for 386/486
+ *
+ * Copyright © 2010  Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+/* if you want SMP support, implement these with real spinlocks */
+.macro LOCK reg
+	pushfl_cfi
+	cli
+.endm
+
+.macro UNLOCK reg
+	popfl_cfi
+.endm
+
+#define BEGIN(op) \
+.macro endp; \
+	CFI_ENDPROC; \
+ENDPROC(atomic64_##op##_386); \
+.purgem endp; \
+.endm; \
+ENTRY(atomic64_##op##_386); \
+	CFI_STARTPROC; \
+	LOCK v;
+
+#define ENDP endp
+
+#define RET \
+	UNLOCK v; \
+	ret
+
+#define RET_ENDP \
+	RET; \
+	ENDP
+
+#define v %ecx
+BEGIN(read)
+	movl  (v), %eax
+	movl 4(v), %edx
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(set)
+	movl %ebx,  (v)
+	movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v  %esi
+BEGIN(xchg)
+	movl  (v), %eax
+	movl 4(v), %edx
+	movl %ebx,  (v)
+	movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add)
+	addl %eax,  (v)
+	adcl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add_return)
+	addl  (v), %eax
+	adcl 4(v), %edx
+	movl %eax,  (v)
+	movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub)
+	subl %eax,  (v)
+	sbbl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub_return)
+	negl %edx
+	negl %eax
+	sbbl $0, %edx
+	addl  (v), %eax
+	adcl 4(v), %edx
+	movl %eax,  (v)
+	movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc)
+	addl $1,  (v)
+	adcl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc_return)
+	movl  (v), %eax
+	movl 4(v), %edx
+	addl $1, %eax
+	adcl $0, %edx
+	movl %eax,  (v)
+	movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec)
+	subl $1,  (v)
+	sbbl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec_return)
+	movl  (v), %eax
+	movl 4(v), %edx
+	subl $1, %eax
+	sbbl $0, %edx
+	movl %eax,  (v)
+	movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(add_unless)
+	addl %eax, %ecx
+	adcl %edx, %edi
+	addl  (v), %eax
+	adcl 4(v), %edx
+	cmpl %eax, %ecx
+	je 3f
+1:
+	movl %eax,  (v)
+	movl %edx, 4(v)
+	movl $1, %eax
+2:
+	RET
+3:
+	cmpl %edx, %edi
+	jne 1b
+	xorl %eax, %eax
+	jmp 2b
+ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc_not_zero)
+	movl  (v), %eax
+	movl 4(v), %edx
+	testl %eax, %eax
+	je 3f
+1:
+	addl $1, %eax
+	adcl $0, %edx
+	movl %eax,  (v)
+	movl %edx, 4(v)
+	movl $1, %eax
+2:
+	RET
+3:
+	testl %edx, %edx
+	jne 1b
+	jmp 2b
+ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec_if_positive)
+	movl  (v), %eax
+	movl 4(v), %edx
+	subl $1, %eax
+	sbbl $0, %edx
+	js 1f
+	movl %eax,  (v)
+	movl %edx, 4(v)
+1:
+RET_ENDP
+#undef v
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 00000000000..f5cc9eb1d51
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,215 @@
+/*
+ * atomic64_t for 586+
+ *
+ * Copyright © 2010  Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+.macro SAVE reg
+	pushl_cfi %\reg
+	CFI_REL_OFFSET \reg, 0
+.endm
+
+.macro RESTORE reg
+	popl_cfi %\reg
+	CFI_RESTORE \reg
+.endm
+
+.macro read64 reg
+	movl %ebx, %eax
+	movl %ecx, %edx
+/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
+	LOCK_PREFIX
+	cmpxchg8b (\reg)
+.endm
+
+ENTRY(atomic64_read_cx8)
+	CFI_STARTPROC
+
+	read64 %ecx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_read_cx8)
+
+ENTRY(atomic64_set_cx8)
+	CFI_STARTPROC
+
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes
+ * are atomic on 586 and newer */
+	cmpxchg8b (%esi)
+	jne 1b
+
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_set_cx8)
+
+ENTRY(atomic64_xchg_cx8)
+	CFI_STARTPROC
+
+1:
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_xchg_cx8)
+
+.macro addsub_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+	CFI_STARTPROC
+	SAVE ebp
+	SAVE ebx
+	SAVE esi
+	SAVE edi
+
+	movl %eax, %esi
+	movl %edx, %edi
+	movl %ecx, %ebp
+
+	read64 %ecx
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	\ins\()l %esi, %ebx
+	\insc\()l %edi, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%ebp)
+	jne 1b
+
+10:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE edi
+	RESTORE esi
+	RESTORE ebx
+	RESTORE ebp
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+addsub_return add add adc
+addsub_return sub sub sbb
+
+.macro incdec_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	\ins\()l $1, %ebx
+	\insc\()l $0, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+10:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+incdec_return inc add adc
+incdec_return dec sub sbb
+
+ENTRY(atomic64_dec_if_positive_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	subl $1, %ebx
+	sbb $0, %ecx
+	js 2f
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+2:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_dec_if_positive_cx8)
+
+ENTRY(atomic64_add_unless_cx8)
+	CFI_STARTPROC
+	SAVE ebp
+	SAVE ebx
+/* these just push these two parameters on the stack */
+	SAVE edi
+	SAVE ecx
+
+	movl %eax, %ebp
+	movl %edx, %edi
+
+	read64 %esi
+1:
+	cmpl %eax, 0(%esp)
+	je 4f
+2:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	addl %ebp, %ebx
+	adcl %edi, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+	movl $1, %eax
+3:
+	addl $8, %esp
+	CFI_ADJUST_CFA_OFFSET -8
+	RESTORE ebx
+	RESTORE ebp
+	ret
+4:
+	cmpl %edx, 4(%esp)
+	jne 2b
+	xorl %eax, %eax
+	jmp 3b
+	CFI_ENDPROC
+ENDPROC(atomic64_add_unless_cx8)
+
+ENTRY(atomic64_inc_not_zero_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	movl %eax, %ecx
+	orl %edx, %ecx
+	jz 3f
+	movl %eax, %ebx
+	xorl %ecx, %ecx
+	addl $1, %ebx
+	adcl %edx, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+	movl $1, %eax
+3:
+	RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 00000000000..a3c66887503
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,19 @@
+#include <linux/smp.h>
+#include <linux/module.h>
+
+static void __wbinvd(void *dummy)
+{
+	wbinvd();
+}
+
+void wbinvd_on_cpu(int cpu)
+{
+	smp_call_function_single(cpu, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_cpu);
+
+int wbinvd_on_all_cpus(void)
+{
+	return on_each_cpu(__wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb7..e78b8eee661 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -28,6 +28,7 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/errno.h>
+#include <asm/asm.h>
 				
 /*
  * computes a partial checksum, e.g. for TCP/UDP fragments
@@ -50,11 +51,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 	   */		
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
@@ -62,7 +61,7 @@ ENTRY(csum_partial)
 	testl $3, %esi		# Check alignment.
 	jz 2f			# Jump if alignment is ok.
 	testl $1, %esi		# Check alignment.
-	jz 10f			# Jump if alignment is boundary of 2bytes.
+	jz 10f			# Jump if alignment is boundary of 2 bytes.
 
 	# buf is odd
 	dec %ecx
@@ -132,11 +131,9 @@ ENTRY(csum_partial)
 	jz 8f
 	roll $8, %eax
 8:
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
 	ret
 	CFI_ENDPROC
@@ -148,11 +145,9 @@ ENDPROC(csum_partial)
 
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
@@ -260,11 +255,9 @@ ENTRY(csum_partial)
 	jz 90f
 	roll $8, %eax
 90: 
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
 	ret
 	CFI_ENDPROC
@@ -290,15 +283,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
 
 #define SRC(y...)			\
 	9999: y;			\
-	.section __ex_table, "a";	\
-	.long 9999b, 6001f	;	\
-	.previous
+	_ASM_EXTABLE(9999b, 6001f)
 
 #define DST(y...)			\
 	9999: y;			\
-	.section __ex_table, "a";	\
-	.long 9999b, 6002f	;	\
-	.previous
+	_ASM_EXTABLE(9999b, 6002f)
 
 #ifndef CONFIG_X86_USE_PPRO_CHECKSUM
 
@@ -309,14 +298,11 @@ ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
 	subl  $4,%esp	
 	CFI_ADJUST_CFA_OFFSET 4
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl ARGBASE+16(%esp),%eax	# sum
 	movl ARGBASE+12(%esp),%ecx	# len
@@ -426,17 +412,13 @@ DST(	movb %cl, (%edi)	)
 
 .previous
 
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ecx			# equivalent to addl $4,%esp
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx			# equivalent to addl $4,%esp
 	ret	
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
@@ -459,14 +441,11 @@ ENDPROC(csum_partial_copy_generic)
 		
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
 	movl ARGBASE+4(%esp),%esi	#src
 	movl ARGBASE+8(%esp),%edi	#dst	
@@ -527,14 +506,11 @@ DST(	movb %dl, (%edi)         )
 	jmp  7b			
 .previous				
 
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
 	ret
 	CFI_ENDPROC
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index ebeafcce04a..f2145cfa12a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * Zero a page. 	
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
 	CFI_ENDPROC
 ENDPROC(clear_page_c)
 
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
 ENTRY(clear_page)
 	CFI_STARTPROC
 	xorl   %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
 .Lclear_page_end:
 ENDPROC(clear_page)
 
-	/* Some CPUs run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
+	/*
+	 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+	 * It is recommended to use this when possible.
+	 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+	 * Otherwise, use original function.
+	 *
+	 */
 
 #include <asm/cpufeature.h>
 
 	.section .altinstr_replacement,"ax"
 1:	.byte 0xeb					/* jmp <disp8> */
 	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
-2:
+2:	.byte 0xeb					/* jmp <disp8> */
+	.byte (clear_page_c_e - clear_page) - (3f - 2b)	/* offset */
+3:
 	.previous
 	.section .altinstructions,"a"
-	.align 8
-	.quad clear_page
-	.quad 1b
-	.byte X86_FEATURE_REP_GOOD
-	.byte .Lclear_page_end - clear_page
-	.byte 2b - 1b
+	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+			     .Lclear_page_end-clear_page, 2b-1b
+	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,   \
+			     .Lclear_page_end-clear_page,3b-2b
 	.previous
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
new file mode 100644
index 00000000000..422db000d72
--- /dev/null
+++ b/arch/x86/lib/cmdline.c
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * Misc librarized functions for cmdline poking.
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <asm/setup.h>
+
+static inline int myisspace(u8 c)
+{
+	return c <= ' ';	/* Close enough approximation */
+}
+
+/**
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * @cmdline: the cmdline string
+ * @option: option string to look for
+ *
+ * Returns the position of that @option (starts counting with 1)
+ * or 0 on not found.
+ */
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+	char c;
+	int len, pos = 0, wstart = 0;
+	const char *opptr = NULL;
+	enum {
+		st_wordstart = 0,	/* Start of word/after whitespace */
+		st_wordcmp,	/* Comparing this word */
+		st_wordskip,	/* Miscompare, skip */
+	} state = st_wordstart;
+
+	if (!cmdline)
+		return -1;      /* No command line */
+
+	len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
+	if (!len)
+		return 0;
+
+	while (len--) {
+		c = *(char *)cmdline++;
+		pos++;
+
+		switch (state) {
+		case st_wordstart:
+			if (!c)
+				return 0;
+			else if (myisspace(c))
+				break;
+
+			state = st_wordcmp;
+			opptr = option;
+			wstart = pos;
+			/* fall through */
+
+		case st_wordcmp:
+			if (!*opptr)
+				if (!c || myisspace(c))
+					return wstart;
+				else
+					state = st_wordskip;
+			else if (!c)
+				return 0;
+			else if (c != *opptr++)
+				state = st_wordskip;
+			else if (!len)		/* last word and is matching */
+				return wstart;
+			break;
+
+		case st_wordskip:
+			if (!c)
+				return 0;
+			else if (myisspace(c))
+				state = st_wordstart;
+			break;
+		}
+	}
+
+	return 0;	/* Buffer overrun */
+}
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 00000000000..1e572c507d0
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,65 @@
+/*
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; version 2
+ *	of the License.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+#ifdef CONFIG_SMP
+#define SEG_PREFIX %gs:
+#else
+#define SEG_PREFIX
+#endif
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al  : Operation successful
+ */
+ENTRY(this_cpu_cmpxchg16b_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
+# via the ZF.  Caller will access %al to get result.
+#
+# Note that this is only useful for a cpuops operation.  Meaning that we
+# do *not* have a fully atomic operation but just an operation that is
+# *atomic* on a single cpu (as provided by the this_cpu_xx class of
+# macros).
+#
+this_cpu_cmpxchg16b_emu:
+	pushf
+	cli
+
+	cmpq SEG_PREFIX(%rsi), %rax
+	jne not_same
+	cmpq SEG_PREFIX 8(%rsi), %rdx
+	jne not_same
+
+	movq %rbx, SEG_PREFIX(%rsi)
+	movq %rcx, SEG_PREFIX 8(%rsi)
+
+	popf
+	mov $1, %al
+	ret
+
+ not_same:
+	popf
+	xor %al,%al
+	ret
+
+CFI_ENDPROC
+
+ENDPROC(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 727a5d46d2f..176cca67212 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,98 +2,93 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 	ALIGN
-copy_page_c:
+copy_page_rep:
 	CFI_STARTPROC
-	movl $4096/8,%ecx
-	rep movsq
+	movl	$4096/8, %ecx
+	rep	movsq
 	ret
 	CFI_ENDPROC
-ENDPROC(copy_page_c)
+ENDPROC(copy_page_rep)
 
-/* Don't use streaming store because it's better when the target
-   ends up in cache. */
-	    
-/* Could vary the prefetch distance based on SMP/UP */
+/*
+ *  Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
+ *  Could vary the prefetch distance based on SMP/UP.
+*/
 
 ENTRY(copy_page)
 	CFI_STARTPROC
-	subq	$3*8,%rsp
-	CFI_ADJUST_CFA_OFFSET 3*8
-	movq	%rbx,(%rsp)
+	subq	$2*8,	%rsp
+	CFI_ADJUST_CFA_OFFSET 2*8
+	movq	%rbx,	(%rsp)
 	CFI_REL_OFFSET rbx, 0
-	movq	%r12,1*8(%rsp)
+	movq	%r12,	1*8(%rsp)
 	CFI_REL_OFFSET r12, 1*8
-	movq	%r13,2*8(%rsp)
-	CFI_REL_OFFSET r13, 2*8
 
-	movl	$(4096/64)-5,%ecx
+	movl	$(4096/64)-5,	%ecx
 	.p2align 4
 .Loop64:
-  	dec     %rcx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
+	dec	%rcx
+	movq	0x8*0(%rsi), %rax
+	movq	0x8*1(%rsi), %rbx
+	movq	0x8*2(%rsi), %rdx
+	movq	0x8*3(%rsi), %r8
+	movq	0x8*4(%rsi), %r9
+	movq	0x8*5(%rsi), %r10
+	movq	0x8*6(%rsi), %r11
+	movq	0x8*7(%rsi), %r12
 
 	prefetcht0 5*64(%rsi)
 
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
+	movq	%rax, 0x8*0(%rdi)
+	movq	%rbx, 0x8*1(%rdi)
+	movq	%rdx, 0x8*2(%rdi)
+	movq	%r8,  0x8*3(%rdi)
+	movq	%r9,  0x8*4(%rdi)
+	movq	%r10, 0x8*5(%rdi)
+	movq	%r11, 0x8*6(%rdi)
+	movq	%r12, 0x8*7(%rdi)
 
-	leaq    64 (%rsi), %rsi
-	leaq    64 (%rdi), %rdi
+	leaq	64 (%rsi), %rsi
+	leaq	64 (%rdi), %rdi
 
-	jnz     .Loop64
+	jnz	.Loop64
 
-	movl	$5,%ecx
+	movl	$5, %ecx
 	.p2align 4
 .Loop2:
-	decl   %ecx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-
-	leaq	64(%rdi),%rdi
-	leaq	64(%rsi),%rsi
-
+	decl	%ecx
+
+	movq	0x8*0(%rsi), %rax
+	movq	0x8*1(%rsi), %rbx
+	movq	0x8*2(%rsi), %rdx
+	movq	0x8*3(%rsi), %r8
+	movq	0x8*4(%rsi), %r9
+	movq	0x8*5(%rsi), %r10
+	movq	0x8*6(%rsi), %r11
+	movq	0x8*7(%rsi), %r12
+
+	movq	%rax, 0x8*0(%rdi)
+	movq	%rbx, 0x8*1(%rdi)
+	movq	%rdx, 0x8*2(%rdi)
+	movq	%r8,  0x8*3(%rdi)
+	movq	%r9,  0x8*4(%rdi)
+	movq	%r10, 0x8*5(%rdi)
+	movq	%r11, 0x8*6(%rdi)
+	movq	%r12, 0x8*7(%rdi)
+
+	leaq	64(%rdi), %rdi
+	leaq	64(%rsi), %rsi
 	jnz	.Loop2
 
-	movq	(%rsp),%rbx
+	movq	(%rsp), %rbx
 	CFI_RESTORE rbx
-	movq	1*8(%rsp),%r12
+	movq	1*8(%rsp), %r12
 	CFI_RESTORE r12
-	movq	2*8(%rsp),%r13
-	CFI_RESTORE r13
-	addq	$3*8,%rsp
-	CFI_ADJUST_CFA_OFFSET -3*8
+	addq	$2*8, %rsp
+	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
 .Lcopy_page_end:
 	CFI_ENDPROC
@@ -106,14 +101,10 @@ ENDPROC(copy_page)
 
 	.section .altinstr_replacement,"ax"
 1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (copy_page_c - copy_page) - (2f - 1b)	/* offset */
+	.byte (copy_page_rep - copy_page) - (2f - 1b)	/* offset */
 2:
 	.previous
 	.section .altinstructions,"a"
-	.align 8
-	.quad copy_page
-	.quad 1b
-	.byte X86_FEATURE_REP_GOOD
-	.byte .Lcopy_page_end - copy_page
-	.byte 2b - 1b
+	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
+		.Lcopy_page_end-copy_page, 2b-1b
 	.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index cf889d4e076..dee945d5559 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,32 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
 
-	.macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 0:
 	.byte 0xe9	/* 32bit jump */
 	.long \orig-1f	/* by default jump to orig */
 1:
 	.section .altinstr_replacement,"ax"
 2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt-1b /* offset */   /* or alternatively to alt */
+	.long \alt1-1b /* offset */   /* or alternatively to alt1 */
+3:	.byte 0xe9			/* near jump with 32bit immediate */
+	.long \alt2-1b /* offset */   /* or alternatively to alt2 */
 	.previous
+
 	.section .altinstructions,"a"
-	.align 8
-	.quad  0b
-	.quad  2b
-	.byte  \feature			/* when feature is set */
-	.byte  5
-	.byte  5
+	altinstruction_entry 0b,2b,\feature1,5,5
+	altinstruction_entry 0b,3b,\feature2,5,5
 	.previous
 	.endm
 
@@ -56,11 +65,8 @@
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 100b,103b
-	.quad 101b,103b
-	.previous
+	_ASM_EXTABLE(100b,103b)
+	_ASM_EXTABLE(101b,103b)
 #endif
 	.endm
 
@@ -72,8 +78,10 @@ ENTRY(_copy_to_user)
 	addq %rdx,%rcx
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_to_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -85,17 +93,13 @@ ENTRY(_copy_from_user)
 	addq %rdx,%rcx
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_from_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
-ENTRY(copy_user_generic)
-	CFI_STARTPROC
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
-	CFI_ENDPROC
-ENDPROC(copy_user_generic)
-
 	.section .fixup,"ax"
 	/* must zero dest */
 ENTRY(bad_from_user)
@@ -123,10 +127,11 @@ ENDPROC(bad_from_user)
  * rdx count
  *
  * Output:
- * eax uncopied bytes or 0 if successfull.
+ * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_generic_unrolled)
 	CFI_STARTPROC
+	ASM_STAC
 	cmpl $8,%edx
 	jb 20f		/* less then 8 bytes, go to byte copy loop */
 	ALIGN_DESTINATION
@@ -174,41 +179,39 @@ ENTRY(copy_user_generic_unrolled)
 	decl %ecx
 	jnz 21b
 23:	xor %eax,%eax
+	ASM_CLAC
 	ret
 
 	.section .fixup,"ax"
 30:	shll $6,%ecx
 	addl %ecx,%edx
 	jmp 60f
-40:	lea (%rdx,%rcx,8),%rdx
+40:	leal (%rdx,%rcx,8),%edx
 	jmp 60f
 50:	movl %ecx,%edx
 60:	jmp copy_user_handle_tail /* ecx is zerorest also */
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 1b,30b
-	.quad 2b,30b
-	.quad 3b,30b
-	.quad 4b,30b
-	.quad 5b,30b
-	.quad 6b,30b
-	.quad 7b,30b
-	.quad 8b,30b
-	.quad 9b,30b
-	.quad 10b,30b
-	.quad 11b,30b
-	.quad 12b,30b
-	.quad 13b,30b
-	.quad 14b,30b
-	.quad 15b,30b
-	.quad 16b,30b
-	.quad 18b,40b
-	.quad 19b,40b
-	.quad 21b,50b
-	.quad 22b,50b
-	.previous
+	_ASM_EXTABLE(1b,30b)
+	_ASM_EXTABLE(2b,30b)
+	_ASM_EXTABLE(3b,30b)
+	_ASM_EXTABLE(4b,30b)
+	_ASM_EXTABLE(5b,30b)
+	_ASM_EXTABLE(6b,30b)
+	_ASM_EXTABLE(7b,30b)
+	_ASM_EXTABLE(8b,30b)
+	_ASM_EXTABLE(9b,30b)
+	_ASM_EXTABLE(10b,30b)
+	_ASM_EXTABLE(11b,30b)
+	_ASM_EXTABLE(12b,30b)
+	_ASM_EXTABLE(13b,30b)
+	_ASM_EXTABLE(14b,30b)
+	_ASM_EXTABLE(15b,30b)
+	_ASM_EXTABLE(16b,30b)
+	_ASM_EXTABLE(18b,40b)
+	_ASM_EXTABLE(19b,40b)
+	_ASM_EXTABLE(21b,50b)
+	_ASM_EXTABLE(22b,50b)
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_unrolled)
 
@@ -232,8 +235,7 @@ ENDPROC(copy_user_generic_unrolled)
  */
 ENTRY(copy_user_generic_string)
 	CFI_STARTPROC
-	andl %edx,%edx
-	jz 4f
+	ASM_STAC
 	cmpl $8,%edx
 	jb 2f		/* less than 8 bytes, go to byte copy loop */
 	ALIGN_DESTINATION
@@ -245,19 +247,48 @@ ENTRY(copy_user_generic_string)
 2:	movl %edx,%ecx
 3:	rep
 	movsb
-4:	xorl %eax,%eax
+	xorl %eax,%eax
+	ASM_CLAC
 	ret
 
 	.section .fixup,"ax"
-11:	lea (%rdx,%rcx,8),%rcx
+11:	leal (%rdx,%rcx,8),%ecx
 12:	movl %ecx,%edx		/* ecx is zerorest also */
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 1b,11b
-	.quad 3b,12b
-	.previous
+	_ASM_EXTABLE(1b,11b)
+	_ASM_EXTABLE(3b,12b)
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+	CFI_STARTPROC
+	ASM_STAC
+	movl %edx,%ecx
+1:	rep
+	movsb
+	xorl %eax,%eax
+	ASM_CLAC
+	ret
+
+	.section .fixup,"ax"
+12:	movl %ecx,%edx		/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE(1b,12b)
+	CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index cb0c112386f..6a4f43c2d9e 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -14,6 +14,8 @@
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
 
 	.macro ALIGN_DESTINATION
 #ifdef FIX_ALIGNMENT
@@ -36,11 +38,8 @@
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.align 8
-	.quad 100b,103b
-	.quad 101b,103b
-	.previous
+	_ASM_EXTABLE(100b,103b)
+	_ASM_EXTABLE(101b,103b)
 #endif
 	.endm
 
@@ -50,6 +49,7 @@
  */
 ENTRY(__copy_user_nocache)
 	CFI_STARTPROC
+	ASM_STAC
 	cmpl $8,%edx
 	jb 20f		/* less then 8 bytes, go to byte copy loop */
 	ALIGN_DESTINATION
@@ -97,6 +97,7 @@ ENTRY(__copy_user_nocache)
 	decl %ecx
 	jnz 21b
 23:	xorl %eax,%eax
+	ASM_CLAC
 	sfence
 	ret
 
@@ -111,27 +112,25 @@ ENTRY(__copy_user_nocache)
 	jmp copy_user_handle_tail
 	.previous
 
-	.section __ex_table,"a"
-	.quad 1b,30b
-	.quad 2b,30b
-	.quad 3b,30b
-	.quad 4b,30b
-	.quad 5b,30b
-	.quad 6b,30b
-	.quad 7b,30b
-	.quad 8b,30b
-	.quad 9b,30b
-	.quad 10b,30b
-	.quad 11b,30b
-	.quad 12b,30b
-	.quad 13b,30b
-	.quad 14b,30b
-	.quad 15b,30b
-	.quad 16b,30b
-	.quad 18b,40b
-	.quad 19b,40b
-	.quad 21b,50b
-	.quad 22b,50b
-	.previous
+	_ASM_EXTABLE(1b,30b)
+	_ASM_EXTABLE(2b,30b)
+	_ASM_EXTABLE(3b,30b)
+	_ASM_EXTABLE(4b,30b)
+	_ASM_EXTABLE(5b,30b)
+	_ASM_EXTABLE(6b,30b)
+	_ASM_EXTABLE(7b,30b)
+	_ASM_EXTABLE(8b,30b)
+	_ASM_EXTABLE(9b,30b)
+	_ASM_EXTABLE(10b,30b)
+	_ASM_EXTABLE(11b,30b)
+	_ASM_EXTABLE(12b,30b)
+	_ASM_EXTABLE(13b,30b)
+	_ASM_EXTABLE(14b,30b)
+	_ASM_EXTABLE(15b,30b)
+	_ASM_EXTABLE(16b,30b)
+	_ASM_EXTABLE(18b,40b)
+	_ASM_EXTABLE(19b,40b)
+	_ASM_EXTABLE(21b,50b)
+	_ASM_EXTABLE(22b,50b)
 	CFI_ENDPROC
 ENDPROC(__copy_user_nocache)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578e..2419d5fefae 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
 /*
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- *	
+ * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
+ *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file COPYING in the main directory of this archive
  * for more details. No warranty for anything given at all.
@@ -8,85 +8,77 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/errno.h>
+#include <asm/asm.h>
 
 /*
  * Checksum copy with exception handling.
- * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
  * destination is zeroed.
- * 
+ *
  * Input
  * rdi  source
  * rsi  destination
  * edx  len (32bit)
- * ecx  sum (32bit) 
+ * ecx  sum (32bit)
  * r8   src_err_ptr (int)
  * r9   dst_err_ptr (int)
  *
  * Output
  * eax  64bit sum. undefined in case of exception.
- * 
- * Wrappers need to take care of valid exception sum and zeroing.		 
+ *
+ * Wrappers need to take care of valid exception sum and zeroing.
  * They also should align source or destination to 8 bytes.
  */
 
 	.macro source
 10:
-	.section __ex_table,"a"
-	.align 8
-	.quad 10b,.Lbad_source
-	.previous
+	_ASM_EXTABLE(10b, .Lbad_source)
 	.endm
-		
+
 	.macro dest
 20:
-	.section __ex_table,"a"
-	.align 8
-	.quad 20b,.Lbad_dest
-	.previous
+	_ASM_EXTABLE(20b, .Lbad_dest)
 	.endm
-			
+
 	.macro ignore L=.Lignore
 30:
-	.section __ex_table,"a"
-	.align 8
-	.quad 30b,\L
-	.previous
+	_ASM_EXTABLE(30b, \L)
 	.endm
-	
-				
+
+
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	cmpl	 $3*64,%edx
-	jle	 .Lignore
+	cmpl	$3*64, %edx
+	jle	.Lignore
 
-.Lignore:		
-	subq  $7*8,%rsp
+.Lignore:
+	subq  $7*8, %rsp
 	CFI_ADJUST_CFA_OFFSET 7*8
-	movq  %rbx,2*8(%rsp)
+	movq  %rbx, 2*8(%rsp)
 	CFI_REL_OFFSET rbx, 2*8
-	movq  %r12,3*8(%rsp)
+	movq  %r12, 3*8(%rsp)
 	CFI_REL_OFFSET r12, 3*8
-	movq  %r14,4*8(%rsp)
+	movq  %r14, 4*8(%rsp)
 	CFI_REL_OFFSET r14, 4*8
-	movq  %r13,5*8(%rsp)
+	movq  %r13, 5*8(%rsp)
 	CFI_REL_OFFSET r13, 5*8
-	movq  %rbp,6*8(%rsp)
+	movq  %rbp, 6*8(%rsp)
 	CFI_REL_OFFSET rbp, 6*8
 
-	movq  %r8,(%rsp)
-	movq  %r9,1*8(%rsp)
-	
-	movl  %ecx,%eax
-	movl  %edx,%ecx
+	movq  %r8, (%rsp)
+	movq  %r9, 1*8(%rsp)
 
-	xorl  %r9d,%r9d
-	movq  %rcx,%r12
+	movl  %ecx, %eax
+	movl  %edx, %ecx
 
-	shrq  $6,%r12
-	jz    .Lhandle_tail       /* < 64 */
+	xorl  %r9d, %r9d
+	movq  %rcx, %r12
+
+	shrq  $6, %r12
+	jz	.Lhandle_tail       /* < 64 */
 
 	clc
-	
+
 	/* main loop. clear in 64 byte blocks */
 	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
 	/* r11:	temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +86,156 @@ ENTRY(csum_partial_copy_generic)
 	.p2align 4
 .Lloop:
 	source
-	movq  (%rdi),%rbx
+	movq  (%rdi), %rbx
 	source
-	movq  8(%rdi),%r8
+	movq  8(%rdi), %r8
 	source
-	movq  16(%rdi),%r11
+	movq  16(%rdi), %r11
 	source
-	movq  24(%rdi),%rdx
+	movq  24(%rdi), %rdx
 
 	source
-	movq  32(%rdi),%r10
+	movq  32(%rdi), %r10
 	source
-	movq  40(%rdi),%rbp
+	movq  40(%rdi), %rbp
 	source
-	movq  48(%rdi),%r14
+	movq  48(%rdi), %r14
 	source
-	movq  56(%rdi),%r13
-		
+	movq  56(%rdi), %r13
+
 	ignore 2f
 	prefetcht0 5*64(%rdi)
-2:							
-	adcq  %rbx,%rax
-	adcq  %r8,%rax
-	adcq  %r11,%rax
-	adcq  %rdx,%rax
-	adcq  %r10,%rax
-	adcq  %rbp,%rax
-	adcq  %r14,%rax
-	adcq  %r13,%rax
+2:
+	adcq  %rbx, %rax
+	adcq  %r8, %rax
+	adcq  %r11, %rax
+	adcq  %rdx, %rax
+	adcq  %r10, %rax
+	adcq  %rbp, %rax
+	adcq  %r14, %rax
+	adcq  %r13, %rax
 
 	decl %r12d
-	
+
 	dest
-	movq %rbx,(%rsi)
+	movq %rbx, (%rsi)
 	dest
-	movq %r8,8(%rsi)
+	movq %r8, 8(%rsi)
 	dest
-	movq %r11,16(%rsi)
+	movq %r11, 16(%rsi)
 	dest
-	movq %rdx,24(%rsi)
+	movq %rdx, 24(%rsi)
 
 	dest
-	movq %r10,32(%rsi)
+	movq %r10, 32(%rsi)
 	dest
-	movq %rbp,40(%rsi)
+	movq %rbp, 40(%rsi)
 	dest
-	movq %r14,48(%rsi)
+	movq %r14, 48(%rsi)
 	dest
-	movq %r13,56(%rsi)
-	
+	movq %r13, 56(%rsi)
+
 3:
-	
-	leaq 64(%rdi),%rdi
-	leaq 64(%rsi),%rsi
 
-	jnz   .Lloop
+	leaq 64(%rdi), %rdi
+	leaq 64(%rsi), %rsi
 
-	adcq  %r9,%rax
+	jnz	.Lloop
 
-	/* do last upto 56 bytes */
+	adcq  %r9, %rax
+
+	/* do last up to 56 bytes */
 .Lhandle_tail:
 	/* ecx:	count */
-	movl %ecx,%r10d
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz 	 .Lfold
+	movl %ecx, %r10d
+	andl $63, %ecx
+	shrl $3, %ecx
+	jz	.Lfold
 	clc
 	.p2align 4
-.Lloop_8:	
+.Lloop_8:
 	source
-	movq (%rdi),%rbx
-	adcq %rbx,%rax
+	movq (%rdi), %rbx
+	adcq %rbx, %rax
 	decl %ecx
 	dest
-	movq %rbx,(%rsi)
-	leaq 8(%rsi),%rsi /* preserve carry */
-	leaq 8(%rdi),%rdi
+	movq %rbx, (%rsi)
+	leaq 8(%rsi), %rsi /* preserve carry */
+	leaq 8(%rdi), %rdi
 	jnz	.Lloop_8
-	adcq %r9,%rax	/* add in carry */
+	adcq %r9, %rax	/* add in carry */
 
 .Lfold:
 	/* reduce checksum to 32bits */
-	movl %eax,%ebx
-	shrq $32,%rax
-	addl %ebx,%eax
-	adcl %r9d,%eax
+	movl %eax, %ebx
+	shrq $32, %rax
+	addl %ebx, %eax
+	adcl %r9d, %eax
 
-	/* do last upto 6 bytes */	
+	/* do last up to 6 bytes */
 .Lhandle_7:
-	movl %r10d,%ecx
-	andl $7,%ecx
-	shrl $1,%ecx
+	movl %r10d, %ecx
+	andl $7, %ecx
+	shrl $1, %ecx
 	jz   .Lhandle_1
-	movl $2,%edx
-	xorl %ebx,%ebx
-	clc  
+	movl $2, %edx
+	xorl %ebx, %ebx
+	clc
 	.p2align 4
-.Lloop_1:	
+.Lloop_1:
 	source
-	movw (%rdi),%bx
-	adcl %ebx,%eax
+	movw (%rdi), %bx
+	adcl %ebx, %eax
 	decl %ecx
 	dest
-	movw %bx,(%rsi)
-	leaq 2(%rdi),%rdi
-	leaq 2(%rsi),%rsi
+	movw %bx, (%rsi)
+	leaq 2(%rdi), %rdi
+	leaq 2(%rsi), %rsi
 	jnz .Lloop_1
-	adcl %r9d,%eax	/* add in carry */
-	
+	adcl %r9d, %eax	/* add in carry */
+
 	/* handle last odd byte */
 .Lhandle_1:
-	testl $1,%r10d
+	testl $1, %r10d
 	jz    .Lende
-	xorl  %ebx,%ebx
+	xorl  %ebx, %ebx
 	source
-	movb (%rdi),%bl
+	movb (%rdi), %bl
 	dest
-	movb %bl,(%rsi)
-	addl %ebx,%eax
-	adcl %r9d,%eax		/* carry */
-			
+	movb %bl, (%rsi)
+	addl %ebx, %eax
+	adcl %r9d, %eax		/* carry */
+
 	CFI_REMEMBER_STATE
 .Lende:
-	movq 2*8(%rsp),%rbx
+	movq 2*8(%rsp), %rbx
 	CFI_RESTORE rbx
-	movq 3*8(%rsp),%r12
+	movq 3*8(%rsp), %r12
 	CFI_RESTORE r12
-	movq 4*8(%rsp),%r14
+	movq 4*8(%rsp), %r14
 	CFI_RESTORE r14
-	movq 5*8(%rsp),%r13
+	movq 5*8(%rsp), %r13
 	CFI_RESTORE r13
-	movq 6*8(%rsp),%rbp
+	movq 6*8(%rsp), %rbp
 	CFI_RESTORE rbp
-	addq $7*8,%rsp
+	addq $7*8, %rsp
 	CFI_ADJUST_CFA_OFFSET -7*8
 	ret
 	CFI_RESTORE_STATE
 
 	/* Exception handlers. Very simple, zeroing is done in the wrappers */
 .Lbad_source:
-	movq (%rsp),%rax
-	testq %rax,%rax
+	movq (%rsp), %rax
+	testq %rax, %rax
 	jz   .Lende
-	movl $-EFAULT,(%rax)
+	movl $-EFAULT, (%rax)
 	jmp  .Lende
-	
+
 .Lbad_dest:
-	movq 8(%rsp),%rax
-	testq %rax,%rax
-	jz   .Lende	
-	movl $-EFAULT,(%rax)
+	movq 8(%rsp), %rax
+	testq %rax, %rax
+	jz   .Lende
+	movl $-EFAULT, (%rax)
 	jmp .Lende
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e..9845371c5c3 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
 				count64--;
 			}
 
-			/* last upto 7 8byte blocks */
+			/* last up to 7 8byte blocks */
 			count %= 8; 
 			while (count) { 
 				asm("addq %1,%0\n\t"
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 459b58a8a15..7609e0e421e 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -6,6 +6,7 @@
  */
 #include <asm/checksum.h>
 #include <linux/module.h>
+#include <asm/smap.h>
 
 /**
  * csum_partial_copy_from_user - Copy and checksum from user space.
@@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst,
 			len -= 2;
 		}
 	}
+	stac();
 	isum = csum_partial_copy_generic((__force const void *)src,
 				dst, len, isum, errp, NULL);
+	clac();
 	if (unlikely(*errp))
 		goto out_err;
 
@@ -82,6 +85,8 @@ __wsum
 csum_partial_copy_to_user(const void *src, void __user *dst,
 			  int len, __wsum isum, int *errp)
 {
+	__wsum ret;
+
 	might_sleep();
 
 	if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
@@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst,
 	}
 
 	*errp = 0;
-	return csum_partial_copy_generic(src, (void __force *)dst,
-					 len, isum, NULL, errp);
+	stac();
+	ret = csum_partial_copy_generic(src, (void __force *)dst,
+					len, isum, NULL, errp);
+	clac();
+	return ret;
 }
 EXPORT_SYMBOL(csum_partial_copy_to_user);
 
@@ -115,7 +123,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user);
  * @src: source address
  * @dst: destination address
  * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
+ * @sum: initial sum that is added into the result (32bit unfolded)
  *
  * Returns an 32bit unfolded checksum of the buffer.
  */
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index ff485d36118..39d6a3db0b9 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -16,7 +16,6 @@
 #include <linux/timex.h>
 #include <linux/preempt.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/delay.h>
@@ -48,9 +47,9 @@ static void delay_loop(unsigned long loops)
 }
 
 /* TSC based delay: */
-static void delay_tsc(unsigned long loops)
+static void delay_tsc(unsigned long __loops)
 {
-	unsigned long bclock, now;
+	u32 bclock, now, loops = __loops;
 	int cpu;
 
 	preempt_disable();
@@ -98,7 +97,7 @@ void use_tsc_delay(void)
 	delay_fn = delay_tsc;
 }
 
-int __devinit read_current_timer(unsigned long *timer_val)
+int read_current_timer(unsigned long *timer_val)
 {
 	if (delay_fn == delay_tsc) {
 		rdtscll(*timer_val);
@@ -121,7 +120,7 @@ inline void __const_udelay(unsigned long xloops)
 	asm("mull %%edx"
 		:"=d" (xloops), "=&a" (d0)
 		:"1" (xloops), "0"
-		(cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
+		(this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
 
 	__delay(++xloops);
 }
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 51f1504cddd..a4512359656 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -15,11 +15,10 @@
  * __get_user_X
  *
  * Inputs:	%[r|e]ax contains the address.
- *		The register is modified, but all changes are undone
- *		before returning because the C code doesn't know about it.
  *
  * Outputs:	%[r|e]ax is error code (0 or -EFAULT)
  *		%[r|e]dx contains zero-extended value
+ *		%ecx contains the high half for 32-bit __get_user_8
  *
  *
  * These functions should not modify any other registers,
@@ -33,6 +32,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/asm.h>
+#include <asm/smap.h>
 
 	.text
 ENTRY(__get_user_1)
@@ -40,8 +40,10 @@ ENTRY(__get_user_1)
 	GET_THREAD_INFO(%_ASM_DX)
 	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
-1:	movzb (%_ASM_AX),%edx
+	ASM_STAC
+1:	movzbl (%_ASM_AX),%edx
 	xor %eax,%eax
+	ASM_CLAC
 	ret
 	CFI_ENDPROC
 ENDPROC(__get_user_1)
@@ -53,8 +55,10 @@ ENTRY(__get_user_2)
 	GET_THREAD_INFO(%_ASM_DX)
 	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
+	ASM_STAC
 2:	movzwl -1(%_ASM_AX),%edx
 	xor %eax,%eax
+	ASM_CLAC
 	ret
 	CFI_ENDPROC
 ENDPROC(__get_user_2)
@@ -66,39 +70,71 @@ ENTRY(__get_user_4)
 	GET_THREAD_INFO(%_ASM_DX)
 	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
-3:	mov -3(%_ASM_AX),%edx
+	ASM_STAC
+3:	movl -3(%_ASM_AX),%edx
 	xor %eax,%eax
+	ASM_CLAC
 	ret
 	CFI_ENDPROC
 ENDPROC(__get_user_4)
 
-#ifdef CONFIG_X86_64
 ENTRY(__get_user_8)
 	CFI_STARTPROC
+#ifdef CONFIG_X86_64
 	add $7,%_ASM_AX
 	jc bad_get_user
 	GET_THREAD_INFO(%_ASM_DX)
 	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
-	jae	bad_get_user
-4:	movq -7(%_ASM_AX),%_ASM_DX
+	jae bad_get_user
+	ASM_STAC
+4:	movq -7(%_ASM_AX),%rdx
 	xor %eax,%eax
+	ASM_CLAC
 	ret
+#else
+	add $7,%_ASM_AX
+	jc bad_get_user_8
+	GET_THREAD_INFO(%_ASM_DX)
+	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+	jae bad_get_user_8
+	ASM_STAC
+4:	movl -7(%_ASM_AX),%edx
+5:	movl -3(%_ASM_AX),%ecx
+	xor %eax,%eax
+	ASM_CLAC
+	ret
+#endif
 	CFI_ENDPROC
 ENDPROC(__get_user_8)
-#endif
+
 
 bad_get_user:
 	CFI_STARTPROC
 	xor %edx,%edx
 	mov $(-EFAULT),%_ASM_AX
+	ASM_CLAC
 	ret
 	CFI_ENDPROC
 END(bad_get_user)
 
-.section __ex_table,"a"
-	_ASM_PTR 1b,bad_get_user
-	_ASM_PTR 2b,bad_get_user
-	_ASM_PTR 3b,bad_get_user
+#ifdef CONFIG_X86_32
+bad_get_user_8:
+	CFI_STARTPROC
+	xor %edx,%edx
+	xor %ecx,%ecx
+	mov $(-EFAULT),%_ASM_AX
+	ASM_CLAC
+	ret
+	CFI_ENDPROC
+END(bad_get_user_8)
+#endif
+
+	_ASM_EXTABLE(1b,bad_get_user)
+	_ASM_EXTABLE(2b,bad_get_user)
+	_ASM_EXTABLE(3b,bad_get_user)
 #ifdef CONFIG_X86_64
-	_ASM_PTR 4b,bad_get_user
+	_ASM_EXTABLE(4b,bad_get_user)
+#else
+	_ASM_EXTABLE(4b,bad_get_user_8)
+	_ASM_EXTABLE(5b,bad_get_user_8)
 #endif
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
new file mode 100644
index 00000000000..ff4fa51a5b1
--- /dev/null
+++ b/arch/x86/lib/hash.c
@@ -0,0 +1,92 @@
+/*
+ * Some portions derived from code covered by the following notice:
+ *
+ * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/hash.h>
+#include <linux/init.h>
+
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/hash.h>
+
+static inline u32 crc32_u32(u32 crc, u32 val)
+{
+#ifdef CONFIG_AS_CRC32
+	asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
+#else
+	asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
+#endif
+	return crc;
+}
+
+static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
+{
+	const u32 *p32 = (const u32 *) data;
+	u32 i, tmp = 0;
+
+	for (i = 0; i < len / 4; i++)
+		seed = crc32_u32(seed, *p32++);
+
+	switch (len & 3) {
+	case 3:
+		tmp |= *((const u8 *) p32 + 2) << 16;
+		/* fallthrough */
+	case 2:
+		tmp |= *((const u8 *) p32 + 1) << 8;
+		/* fallthrough */
+	case 1:
+		tmp |= *((const u8 *) p32);
+		seed = crc32_u32(seed, tmp);
+		break;
+	}
+
+	return seed;
+}
+
+static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
+{
+	const u32 *p32 = (const u32 *) data;
+	u32 i;
+
+	for (i = 0; i < len; i++)
+		seed = crc32_u32(seed, *p32++);
+
+	return seed;
+}
+
+void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
+{
+	if (cpu_has_xmm4_2) {
+		ops->hash  = intel_crc4_2_hash;
+		ops->hash2 = intel_crc4_2_hash2;
+	}
+}
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index 46fc4ee09fc..c1f01a8e9f6 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -29,46 +29,46 @@ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
 	return inat_primary_table[opcode];
 }
 
-insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
+int inat_get_last_prefix_id(insn_byte_t last_pfx)
+{
+	insn_attr_t lpfx_attr;
+
+	lpfx_attr = inat_get_opcode_attribute(last_pfx);
+	return inat_last_prefix_id(lpfx_attr);
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
 				      insn_attr_t esc_attr)
 {
 	const insn_attr_t *table;
-	insn_attr_t lpfx_attr;
-	int n, m = 0;
+	int n;
 
 	n = inat_escape_id(esc_attr);
-	if (last_pfx) {
-		lpfx_attr = inat_get_opcode_attribute(last_pfx);
-		m = inat_last_prefix_id(lpfx_attr);
-	}
+
 	table = inat_escape_tables[n][0];
 	if (!table)
 		return 0;
-	if (inat_has_variant(table[opcode]) && m) {
-		table = inat_escape_tables[n][m];
+	if (inat_has_variant(table[opcode]) && lpfx_id) {
+		table = inat_escape_tables[n][lpfx_id];
 		if (!table)
 			return 0;
 	}
 	return table[opcode];
 }
 
-insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
 				     insn_attr_t grp_attr)
 {
 	const insn_attr_t *table;
-	insn_attr_t lpfx_attr;
-	int n, m = 0;
+	int n;
 
 	n = inat_group_id(grp_attr);
-	if (last_pfx) {
-		lpfx_attr = inat_get_opcode_attribute(last_pfx);
-		m = inat_last_prefix_id(lpfx_attr);
-	}
+
 	table = inat_group_tables[n][0];
 	if (!table)
 		return inat_group_common_attribute(grp_attr);
-	if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
-		table = inat_group_tables[n][m];
+	if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
+		table = inat_group_tables[n][lpfx_id];
 		if (!table)
 			return inat_group_common_attribute(grp_attr);
 	}
@@ -82,9 +82,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
 	const insn_attr_t *table;
 	if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
 		return 0;
-	table = inat_avx_tables[vex_m][vex_p];
+	/* At first, this checks the master table */
+	table = inat_avx_tables[vex_m][0];
 	if (!table)
 		return 0;
+	if (!inat_is_group(table[opcode]) && vex_p) {
+		/* If this is not a group, get attribute directly */
+		table = inat_avx_tables[vex_m][vex_p];
+		if (!table)
+			return 0;
+	}
 	return table[opcode];
 }
 
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 9f33b984d0e..54fcffed28e 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -18,18 +18,31 @@
  * Copyright (C) IBM Corporation, 2002, 2004, 2009
  */
 
+#ifdef __KERNEL__
 #include <linux/string.h>
+#else
+#include <string.h>
+#endif
 #include <asm/inat.h>
 #include <asm/insn.h>
 
-#define get_next(t, insn)	\
-	({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+/* Verify next sizeof(t) bytes can be on the same instruction */
+#define validate_next(t, insn, n)	\
+	((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE)
+
+#define __get_next(t, insn)	\
+	({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
 
-#define peek_next(t, insn)	\
-	({t r; r = *(t*)insn->next_byte; r; })
+#define __peek_nbyte_next(t, insn, n)	\
+	({ t r = *(t*)((insn)->next_byte + n); r; })
+
+#define get_next(t, insn)	\
+	({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
 
 #define peek_nbyte_next(t, insn, n)	\
-	({t r; r = *(t*)((insn)->next_byte + n); r; })
+	({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); })
+
+#define peek_next(t, insn)	peek_nbyte_next(t, insn, 0)
 
 /**
  * insn_init() - initialize struct insn
@@ -158,6 +171,8 @@ vex_end:
 	insn->vex_prefix.got = 1;
 
 	prefixes->got = 1;
+
+err_out:
 	return;
 }
 
@@ -174,7 +189,8 @@ vex_end:
 void insn_get_opcode(struct insn *insn)
 {
 	struct insn_field *opcode = &insn->opcode;
-	insn_byte_t op, pfx;
+	insn_byte_t op;
+	int pfx_id;
 	if (opcode->got)
 		return;
 	if (!insn->prefixes.got)
@@ -191,7 +207,7 @@ void insn_get_opcode(struct insn *insn)
 		m = insn_vex_m_bits(insn);
 		p = insn_vex_p_bits(insn);
 		insn->attr = inat_get_avx_attribute(op, m, p);
-		if (!inat_accept_vex(insn->attr))
+		if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
 			insn->attr = 0;	/* This instruction is bad */
 		goto end;	/* VEX has only 1 byte for opcode */
 	}
@@ -201,13 +217,16 @@ void insn_get_opcode(struct insn *insn)
 		/* Get escaped opcode */
 		op = get_next(insn_byte_t, insn);
 		opcode->bytes[opcode->nbytes++] = op;
-		pfx = insn_last_prefix(insn);
-		insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
+		pfx_id = insn_last_prefix_id(insn);
+		insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
 	}
 	if (inat_must_vex(insn->attr))
 		insn->attr = 0;	/* This instruction is bad */
 end:
 	opcode->got = 1;
+
+err_out:
+	return;
 }
 
 /**
@@ -221,7 +240,7 @@ end:
 void insn_get_modrm(struct insn *insn)
 {
 	struct insn_field *modrm = &insn->modrm;
-	insn_byte_t pfx, mod;
+	insn_byte_t pfx_id, mod;
 	if (modrm->got)
 		return;
 	if (!insn->opcode.got)
@@ -232,15 +251,20 @@ void insn_get_modrm(struct insn *insn)
 		modrm->value = mod;
 		modrm->nbytes = 1;
 		if (inat_is_group(insn->attr)) {
-			pfx = insn_last_prefix(insn);
-			insn->attr = inat_get_group_attribute(mod, pfx,
+			pfx_id = insn_last_prefix_id(insn);
+			insn->attr = inat_get_group_attribute(mod, pfx_id,
 							      insn->attr);
+			if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
+				insn->attr = 0;	/* This is bad */
 		}
 	}
 
 	if (insn->x86_64 && inat_is_force64(insn->attr))
 		insn->opnd_bytes = 8;
 	modrm->got = 1;
+
+err_out:
+	return;
 }
 
 
@@ -290,6 +314,9 @@ void insn_get_sib(struct insn *insn)
 		}
 	}
 	insn->sib.got = 1;
+
+err_out:
+	return;
 }
 
 
@@ -351,10 +378,13 @@ void insn_get_displacement(struct insn *insn)
 	}
 out:
 	insn->displacement.got = 1;
+
+err_out:
+	return;
 }
 
-/* Decode moffset16/32/64 */
-static void __get_moffset(struct insn *insn)
+/* Decode moffset16/32/64. Return 0 if failed */
+static int __get_moffset(struct insn *insn)
 {
 	switch (insn->addr_bytes) {
 	case 2:
@@ -371,12 +401,19 @@ static void __get_moffset(struct insn *insn)
 		insn->moffset2.value = get_next(int, insn);
 		insn->moffset2.nbytes = 4;
 		break;
+	default:	/* opnd_bytes must be modified manually */
+		goto err_out;
 	}
 	insn->moffset1.got = insn->moffset2.got = 1;
+
+	return 1;
+
+err_out:
+	return 0;
 }
 
-/* Decode imm v32(Iz) */
-static void __get_immv32(struct insn *insn)
+/* Decode imm v32(Iz). Return 0 if failed */
+static int __get_immv32(struct insn *insn)
 {
 	switch (insn->opnd_bytes) {
 	case 2:
@@ -388,11 +425,18 @@ static void __get_immv32(struct insn *insn)
 		insn->immediate.value = get_next(int, insn);
 		insn->immediate.nbytes = 4;
 		break;
+	default:	/* opnd_bytes must be modified manually */
+		goto err_out;
 	}
+
+	return 1;
+
+err_out:
+	return 0;
 }
 
-/* Decode imm v64(Iv/Ov) */
-static void __get_immv(struct insn *insn)
+/* Decode imm v64(Iv/Ov), Return 0 if failed */
+static int __get_immv(struct insn *insn)
 {
 	switch (insn->opnd_bytes) {
 	case 2:
@@ -409,12 +453,18 @@ static void __get_immv(struct insn *insn)
 		insn->immediate2.value = get_next(int, insn);
 		insn->immediate2.nbytes = 4;
 		break;
+	default:	/* opnd_bytes must be modified manually */
+		goto err_out;
 	}
 	insn->immediate1.got = insn->immediate2.got = 1;
+
+	return 1;
+err_out:
+	return 0;
 }
 
 /* Decode ptr16:16/32(Ap) */
-static void __get_immptr(struct insn *insn)
+static int __get_immptr(struct insn *insn)
 {
 	switch (insn->opnd_bytes) {
 	case 2:
@@ -427,11 +477,17 @@ static void __get_immptr(struct insn *insn)
 		break;
 	case 8:
 		/* ptr16:64 is not exist (no segment) */
-		return;
+		return 0;
+	default:	/* opnd_bytes must be modified manually */
+		goto err_out;
 	}
 	insn->immediate2.value = get_next(unsigned short, insn);
 	insn->immediate2.nbytes = 2;
 	insn->immediate1.got = insn->immediate2.got = 1;
+
+	return 1;
+err_out:
+	return 0;
 }
 
 /**
@@ -451,7 +507,8 @@ void insn_get_immediate(struct insn *insn)
 		insn_get_displacement(insn);
 
 	if (inat_has_moffset(insn->attr)) {
-		__get_moffset(insn);
+		if (!__get_moffset(insn))
+			goto err_out;
 		goto done;
 	}
 
@@ -479,16 +536,20 @@ void insn_get_immediate(struct insn *insn)
 		insn->immediate2.nbytes = 4;
 		break;
 	case INAT_IMM_PTR:
-		__get_immptr(insn);
+		if (!__get_immptr(insn))
+			goto err_out;
 		break;
 	case INAT_IMM_VWORD32:
-		__get_immv32(insn);
+		if (!__get_immv32(insn))
+			goto err_out;
 		break;
 	case INAT_IMM_VWORD:
-		__get_immv(insn);
+		if (!__get_immv(insn))
+			goto err_out;
 		break;
 	default:
-		break;
+		/* Here, insn must have an immediate, but failed */
+		goto err_out;
 	}
 	if (inat_has_second_immediate(insn->attr)) {
 		insn->immediate2.value = get_next(char, insn);
@@ -496,6 +557,9 @@ void insn_get_immediate(struct insn *insn)
 	}
 done:
 	insn->immediate.got = 1;
+
+err_out:
+	return;
 }
 
 /**
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
deleted file mode 100644
index 3f1eb59b5f0..00000000000
--- a/arch/x86/lib/io_64.c
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <linux/string.h>
-#include <linux/module.h>
-#include <asm/io.h>
-
-void __memcpy_toio(unsigned long dst, const void *src, unsigned len)
-{
-	__inline_memcpy((void *)dst, src, len);
-}
-EXPORT_SYMBOL(__memcpy_toio);
-
-void __memcpy_fromio(void *dst, unsigned long src, unsigned len)
-{
-	__inline_memcpy(dst, (const void *)src, len);
-}
-EXPORT_SYMBOL(__memcpy_fromio);
-
-void memset_io(volatile void __iomem *a, int b, size_t c)
-{
-	/*
-	 * TODO: memset can mangle the IO patterns quite a bit.
-	 * perhaps it would be better to use a dumb one:
-	 */
-	memset((void *)a, b, c);
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f5..a404b4b7553 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -4,7 +4,7 @@
 #undef memcpy
 #undef memset
 
-void *memcpy(void *to, const void *from, size_t n)
+__visible void *memcpy(void *to, const void *from, size_t n)
 {
 #ifdef CONFIG_X86_USE_3DNOW
 	return __memcpy3d(to, from, n);
@@ -14,30 +14,195 @@ void *memcpy(void *to, const void *from, size_t n)
 }
 EXPORT_SYMBOL(memcpy);
 
-void *memset(void *s, int c, size_t count)
+__visible void *memset(void *s, int c, size_t count)
 {
 	return __memset(s, c, count);
 }
 EXPORT_SYMBOL(memset);
 
-void *memmove(void *dest, const void *src, size_t n)
+__visible void *memmove(void *dest, const void *src, size_t n)
 {
-	int d0, d1, d2;
-
-	if (dest < src) {
-		memcpy(dest, src, n);
-	} else {
-		__asm__ __volatile__(
-			"std\n\t"
-			"rep\n\t"
-			"movsb\n\t"
-			"cld"
-			: "=&c" (d0), "=&S" (d1), "=&D" (d2)
-			:"0" (n),
-			 "1" (n-1+src),
-			 "2" (n-1+dest)
-			:"memory");
-	}
-	return dest;
+	int d0,d1,d2,d3,d4,d5;
+	char *ret = dest;
+
+	__asm__ __volatile__(
+		/* Handle more 16 bytes in loop */
+		"cmp $0x10, %0\n\t"
+		"jb	1f\n\t"
+
+		/* Decide forward/backward copy mode */
+		"cmp %2, %1\n\t"
+		"jb	2f\n\t"
+
+		/*
+		 * movs instruction have many startup latency
+		 * so we handle small size by general register.
+		 */
+		"cmp  $680, %0\n\t"
+		"jb 3f\n\t"
+		/*
+		 * movs instruction is only good for aligned case.
+		 */
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 4f\n\t"
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+
+		/*
+		 * We gobble 16 bytes forward in each loop.
+		 */
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, 1*4(%2)\n\t"
+		"mov 2*4(%1), %3\n\t"
+		"mov 3*4(%1), %4\n\t"
+		"mov  %3, 2*4(%2)\n\t"
+		"mov  %4, 3*4(%2)\n\t"
+		"lea  0x10(%1), %1\n\t"
+		"lea  0x10(%2), %2\n\t"
+		"jae 3b\n\t"
+		"add $0x10, %0\n\t"
+		"jmp 1f\n\t"
+
+		/*
+		 * Handle data forward by movs.
+		 */
+		".p2align 4\n\t"
+		"4:\n\t"
+		"mov -4(%1, %0), %3\n\t"
+		"lea -4(%2, %0), %4\n\t"
+		"shr $2, %0\n\t"
+		"rep movsl\n\t"
+		"mov %3, (%4)\n\t"
+		"jmp 11f\n\t"
+		/*
+		 * Handle data backward by movs.
+		 */
+		".p2align 4\n\t"
+		"6:\n\t"
+		"mov (%1), %3\n\t"
+		"mov %2, %4\n\t"
+		"lea -4(%1, %0), %1\n\t"
+		"lea -4(%2, %0), %2\n\t"
+		"shr $2, %0\n\t"
+		"std\n\t"
+		"rep movsl\n\t"
+		"mov %3,(%4)\n\t"
+		"cld\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Start to prepare for backward copy.
+		 */
+		".p2align 4\n\t"
+		"2:\n\t"
+		"cmp  $680, %0\n\t"
+		"jb 5f\n\t"
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 6b\n\t"
+
+		/*
+		 * Calculate copy position to tail.
+		 */
+		"5:\n\t"
+		"add %0, %1\n\t"
+		"add %0, %2\n\t"
+		"sub $0x10, %0\n\t"
+
+		/*
+		 * We gobble 16 bytes backward in each loop.
+		 */
+		"7:\n\t"
+		"sub $0x10, %0\n\t"
+
+		"mov -1*4(%1), %3\n\t"
+		"mov -2*4(%1), %4\n\t"
+		"mov  %3, -1*4(%2)\n\t"
+		"mov  %4, -2*4(%2)\n\t"
+		"mov -3*4(%1), %3\n\t"
+		"mov -4*4(%1), %4\n\t"
+		"mov  %3, -3*4(%2)\n\t"
+		"mov  %4, -4*4(%2)\n\t"
+		"lea  -0x10(%1), %1\n\t"
+		"lea  -0x10(%2), %2\n\t"
+		"jae 7b\n\t"
+		/*
+		 * Calculate copy position to head.
+		 */
+		"add $0x10, %0\n\t"
+		"sub %0, %1\n\t"
+		"sub %0, %2\n\t"
+
+		/*
+		 * Move data from 8 bytes to 15 bytes.
+		 */
+		".p2align 4\n\t"
+		"1:\n\t"
+		"cmp $8, %0\n\t"
+		"jb 8f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov -2*4(%1, %0), %5\n\t"
+		"mov -1*4(%1, %0), %1\n\t"
+
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, 1*4(%2)\n\t"
+		"mov  %5, -2*4(%2, %0)\n\t"
+		"mov  %1, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data from 4 bytes to 7 bytes.
+		 */
+		".p2align 4\n\t"
+		"8:\n\t"
+		"cmp $4, %0\n\t"
+		"jb 9f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov -1*4(%1, %0), %4\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data from 2 bytes to 3 bytes.
+		 */
+		".p2align 4\n\t"
+		"9:\n\t"
+		"cmp $2, %0\n\t"
+		"jb 10f\n\t"
+		"movw 0*2(%1), %%dx\n\t"
+		"movw -1*2(%1, %0), %%bx\n\t"
+		"movw %%dx, 0*2(%2)\n\t"
+		"movw %%bx, -1*2(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data for 1 byte.
+		 */
+		".p2align 4\n\t"
+		"10:\n\t"
+		"cmp $1, %0\n\t"
+		"jb 11f\n\t"
+		"movb (%1), %%cl\n\t"
+		"movb %%cl, (%2)\n\t"
+		".p2align 4\n\t"
+		"11:"
+		: "=&c" (d0), "=&S" (d1), "=&D" (d2),
+		  "=r" (d3),"=r" (d4), "=r"(d5)
+		:"0" (n),
+		 "1" (src),
+		 "2" (dest)
+		:"memory");
+
+	return ret;
+
 }
 EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index ad5441ed1b5..56313a32618 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.
@@ -20,131 +21,186 @@
 /*
  * memcpy_c() - fast string ops (REP MOVSQ) based variant.
  *
- * Calls to this get patched into the kernel image via the
+ * This gets patched over the unrolled variant (below) via the
  * alternative instructions framework:
  */
-	ALIGN
-memcpy_c:
-	CFI_STARTPROC
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c:
 	movq %rdi, %rax
-
-	movl %edx, %ecx
-	shrl $3, %ecx
+	movq %rdx, %rcx
+	shrq $3, %rcx
 	andl $7, %edx
 	rep movsq
 	movl %edx, %ecx
 	rep movsb
 	ret
-	CFI_ENDPROC
-ENDPROC(memcpy_c)
+.Lmemcpy_e:
+	.previous
+
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+	movq %rdx, %rcx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
 
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
+
+	cmpq $0x20, %rdx
+	jb .Lhandle_tail
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * We check whether memory false dependence could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl   $6, %ecx
-	jz .Lhandle_tail
+	cmp  %dil, %sil
+	jl .Lcopy_backward
+	subq $0x20, %rdx
+.Lcopy_forward_loop:
+	subq $0x20,	%rdx
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * Move in blocks of 4x8 bytes:
 	 */
-	decl %ecx
-
+	movq 0*8(%rsi),	%r8
+	movq 1*8(%rsi),	%r9
+	movq 2*8(%rsi),	%r10
+	movq 3*8(%rsi),	%r11
+	leaq 4*8(%rsi),	%rsi
+
+	movq %r8,	0*8(%rdi)
+	movq %r9,	1*8(%rdi)
+	movq %r10,	2*8(%rdi)
+	movq %r11,	3*8(%rdi)
+	leaq 4*8(%rdi),	%rdi
+	jae  .Lcopy_forward_loop
+	addl $0x20,	%edx
+	jmp  .Lhandle_tail
+
+.Lcopy_backward:
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Calculate copy position to tail.
 	 */
-	movq 0*8(%rsi),		%r11
-	movq 1*8(%rsi),		%r8
-	movq %r11,		0*8(%rdi)
-	movq %r8,		1*8(%rdi)
-
-	movq 2*8(%rsi),		%r9
-	movq 3*8(%rsi),		%r10
-	movq %r9,		2*8(%rdi)
-	movq %r10,		3*8(%rdi)
-
-	movq 4*8(%rsi),		%r11
-	movq 5*8(%rsi),		%r8
-	movq %r11,		4*8(%rdi)
-	movq %r8,		5*8(%rdi)
-
-	movq 6*8(%rsi),		%r9
-	movq 7*8(%rsi),		%r10
-	movq %r9,		6*8(%rdi)
-	movq %r10,		7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz  .Lloop_64
+	addq %rdx,	%rsi
+	addq %rdx,	%rdi
+	subq $0x20,	%rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16 bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20,	%rdx
+	movq -1*8(%rsi),	%r8
+	movq -2*8(%rsi),	%r9
+	movq -3*8(%rsi),	%r10
+	movq -4*8(%rsi),	%r11
+	leaq -4*8(%rsi),	%rsi
+	movq %r8,		-1*8(%rdi)
+	movq %r9,		-2*8(%rdi)
+	movq %r10,		-3*8(%rdi)
+	movq %r11,		-4*8(%rdi)
+	leaq -4*8(%rdi),	%rdi
+	jae  .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addl $0x20,	%edx
+	subq %rdx,	%rsi
+	subq %rdx,	%rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl  $63, %ecx
-	shrl   $3, %ecx
-	jz   .Lhandle_7
+	cmpl $16,	%edx
+	jb   .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi),	%r9
+	movq -2*8(%rsi, %rdx),	%r10
+	movq -1*8(%rsi, %rdx),	%r11
+	movq %r8,	0*8(%rdi)
+	movq %r9,	1*8(%rdi)
+	movq %r10,	-2*8(%rdi, %rdx)
+	movq %r11,	-1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi),		%r8
-	movq %r8,		(%rdi)
-	leaq 8(%rdi),		%rdi
-	leaq 8(%rsi),		%rsi
-	jnz  .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpl $8,	%edx
+	jb   .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi),	%r8
+	movq -1*8(%rsi, %rdx),	%r9
+	movq %r8,	0*8(%rdi)
+	movq %r9,	-1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpl $4,	%edx
+	jb   .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_1:
-	movb (%rsi), %r8b
-	movb %r8b, (%rdi)
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
+.Lless_3bytes:
+	subl $1, %edx
+	jb .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
+	movzbl (%rsi), %ecx
+	jz .Lstore_1byte
+	movzbq 1(%rsi), %r8
+	movzbq (%rsi, %rdx), %r9
+	movb %r8b, 1(%rdi)
+	movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+	movb %cl, (%rdi)
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 	/*
-	 * Some CPUs run faster using the string copy instructions.
-	 * It is also a lot simpler. Use this when possible:
-	 */
-
-	.section .altinstr_replacement, "ax"
-1:	.byte 0xeb				/* jmp <disp8> */
-	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
-2:
-	.previous
-
-	.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad 1b
-	.byte X86_FEATURE_REP_GOOD
-
-	/*
+	 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+	 * If the feature is supported, memcpy_c_e() is the first choice.
+	 * If enhanced rep movsb copy is not available, use fast string copy
+	 * memcpy_c() when possible. This is faster and code is simpler than
+	 * original memcpy().
+	 * Otherwise, original memcpy() is used.
+	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
+         * feature to implement the right patch order.
+	 *
 	 * Replace only beginning, memcpy is used to apply alternatives,
 	 * so it is silly to overwrite itself with nops - reboot is the
 	 * only outcome...
 	 */
-	.byte 2b - 1b
-	.byte 2b - 1b
+	.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 00000000000..65268a6104f
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,223 @@
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#define _STRING_C
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+
+#undef memmove
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+ENTRY(memmove)
+	CFI_STARTPROC
+
+	/* Handle more 32 bytes in loop */
+	mov %rdi, %rax
+	cmp $0x20, %rdx
+	jb	1f
+
+	/* Decide forward/backward copy mode */
+	cmp %rdi, %rsi
+	jge .Lmemmove_begin_forward
+	mov %rsi, %r8
+	add %rdx, %r8
+	cmp %rdi, %r8
+	jg 2f
+
+.Lmemmove_begin_forward:
+	/*
+	 * movsq instruction have many startup latency
+	 * so we handle small size by general register.
+	 */
+	cmp  $680, %rdx
+	jb	3f
+	/*
+	 * movsq instruction is only good for aligned case.
+	 */
+
+	cmpb %dil, %sil
+	je 4f
+3:
+	sub $0x20, %rdx
+	/*
+	 * We gobble 32 bytes forward in each loop.
+	 */
+5:
+	sub $0x20, %rdx
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r8
+	leaq 4*8(%rsi), %rsi
+
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, 2*8(%rdi)
+	movq %r8, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae 5b
+	addq $0x20, %rdx
+	jmp 1f
+	/*
+	 * Handle data forward by movsq.
+	 */
+	.p2align 4
+4:
+	movq %rdx, %rcx
+	movq -8(%rsi, %rdx), %r11
+	lea -8(%rdi, %rdx), %r10
+	shrq $3, %rcx
+	rep movsq
+	movq %r11, (%r10)
+	jmp 13f
+.Lmemmove_end_forward:
+
+	/*
+	 * Handle data backward by movsq.
+	 */
+	.p2align 4
+7:
+	movq %rdx, %rcx
+	movq (%rsi), %r11
+	movq %rdi, %r10
+	leaq -8(%rsi, %rdx), %rsi
+	leaq -8(%rdi, %rdx), %rdi
+	shrq $3, %rcx
+	std
+	rep movsq
+	cld
+	movq %r11, (%r10)
+	jmp 13f
+
+	/*
+	 * Start to prepare for backward copy.
+	 */
+	.p2align 4
+2:
+	cmp $680, %rdx
+	jb 6f
+	cmp %dil, %sil
+	je 7b
+6:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * We gobble 32 bytes backward in each loop.
+	 */
+8:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r11
+	movq -2*8(%rsi), %r10
+	movq -3*8(%rsi), %r9
+	movq -4*8(%rsi), %r8
+	leaq -4*8(%rsi), %rsi
+
+	movq %r11, -1*8(%rdi)
+	movq %r10, -2*8(%rdi)
+	movq %r9, -3*8(%rdi)
+	movq %r8, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae 8b
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
+1:
+	cmpq $16, %rdx
+	jb 9f
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq -2*8(%rsi, %rdx), %r9
+	movq -1*8(%rsi, %rdx), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, -2*8(%rdi, %rdx)
+	movq %r8, -1*8(%rdi, %rdx)
+	jmp 13f
+	.p2align 4
+9:
+	cmpq $8, %rdx
+	jb 10f
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq -1*8(%rsi, %rdx), %r10
+	movq %r11, 0*8(%rdi)
+	movq %r10, -1*8(%rdi, %rdx)
+	jmp 13f
+10:
+	cmpq $4, %rdx
+	jb 11f
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %r11d
+	movl -4(%rsi, %rdx), %r10d
+	movl %r11d, (%rdi)
+	movl %r10d, -4(%rdi, %rdx)
+	jmp 13f
+11:
+	cmp $2, %rdx
+	jb 12f
+	/*
+	 * Move data from 2 bytes to 3 bytes.
+	 */
+	movw (%rsi), %r11w
+	movw -2(%rsi, %rdx), %r10w
+	movw %r11w, (%rdi)
+	movw %r10w, -2(%rdi, %rdx)
+	jmp 13f
+12:
+	cmp $1, %rdx
+	jb 13f
+	/*
+	 * Move data for 1 byte.
+	 */
+	movb (%rsi), %r11b
+	movb %r11b, (%rdi)
+13:
+	retq
+	CFI_ENDPROC
+
+	.section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+	/* Forward moving data. */
+	movq %rdx, %rcx
+	rep movsb
+	retq
+.Lmemmove_end_forward_efs:
+	.previous
+
+	.section .altinstructions,"a"
+	altinstruction_entry .Lmemmove_begin_forward,		\
+		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
+		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
+		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+	.previous
+ENDPROC(memmove)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 0a33909bf12..00000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Normally compiler builtins are used, but sometimes the compiler calls out
-   of line code. Based on asm-i386/string.h.
- */
-#define _STRING_C
-#include <linux/string.h>
-#include <linux/module.h>
-
-#undef memmove
-void *memmove(void *dest, const void *src, size_t count)
-{
-	if (dest < src) {
-		return memcpy(dest, src, count);
-	} else {
-		char *p = dest + count;
-		const char *s = src + count;
-		while (count--)
-			*--p = *--s;
-	}
-	return dest;
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2c5948116bd..2dcb3808cbd 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
 /*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
  *	
  * rdi   destination
  * rsi   value (char) 
@@ -12,36 +16,55 @@
  * 
  * rax   original destination
  */	
-	ALIGN
-memset_c:
-	CFI_STARTPROC
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemset_c:
 	movq %rdi,%r9
-	movl %edx,%r8d
-	andl $7,%r8d
-	movl %edx,%ecx
-	shrl $3,%ecx
+	movq %rdx,%rcx
+	andl $7,%edx
+	shrq $3,%rcx
 	/* expand byte value  */
 	movzbl %sil,%esi
 	movabs $0x0101010101010101,%rax
-	mulq %rsi		/* with rax, clobbers rdx */
+	imulq %rsi,%rax
 	rep stosq
-	movl %r8d,%ecx
+	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
 	ret
-	CFI_ENDPROC
-ENDPROC(memset_c)
+.Lmemset_e:
+	.previous
+
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi   destination
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
+ * rax   original destination
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+	movq %rdi,%r9
+	movb %sil,%al
+	movq %rdx,%rcx
+	rep stosb
+	movq %r9,%rax
+	ret
+.Lmemset_e_e:
+	.previous
 
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
 	movq %rdi,%r10
-	movq %rdx,%r11
 
 	/* expand byte value  */
 	movzbl %sil,%ecx
 	movabs $0x0101010101010101,%rax
-	mul    %rcx		/* with rax, clobbers rdx */
+	imulq  %rcx,%rax
 
 	/* align dst */
 	movl  %edi,%r9d
@@ -50,13 +73,13 @@ ENTRY(__memset)
 	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
 
-	movl %r11d,%ecx
-	shrl $6,%ecx
+	movq  %rdx,%rcx
+	shrq  $6,%rcx
 	jz	 .Lhandle_tail
 
 	.p2align 4
 .Lloop_64:
-	decl   %ecx
+	decq  %rcx
 	movq  %rax,(%rdi)
 	movq  %rax,8(%rdi)
 	movq  %rax,16(%rdi)
@@ -72,7 +95,7 @@ ENTRY(__memset)
 	   to predict jump tables. */
 	.p2align 4
 .Lhandle_tail:
-	movl	%r11d,%ecx
+	movl	%edx,%ecx
 	andl    $63&(~7),%ecx
 	jz 		.Lhandle_7
 	shrl	$3,%ecx
@@ -84,12 +107,11 @@ ENTRY(__memset)
 	jnz    .Lloop_8
 
 .Lhandle_7:
-	movl	%r11d,%ecx
-	andl	$7,%ecx
+	andl	$7,%edx
 	jz      .Lende
 	.p2align 4
 .Lloop_1:
-	decl    %ecx
+	decl    %edx
 	movb 	%al,(%rdi)
 	leaq	1(%rdi),%rdi
 	jnz     .Lloop_1
@@ -100,34 +122,33 @@ ENTRY(__memset)
 
 	CFI_RESTORE_STATE
 .Lbad_alignment:
-	cmpq $7,%r11
+	cmpq $7,%rdx
 	jbe	.Lhandle_7
 	movq %rax,(%rdi)	/* unaligned store */
 	movq $8,%r8
 	subq %r9,%r8
 	addq %r8,%rdi
-	subq %r8,%r11
+	subq %r8,%rdx
 	jmp .Lafter_bad_alignment
 .Lfinal:
 	CFI_ENDPROC
 ENDPROC(memset)
 ENDPROC(__memset)
 
-	/* Some CPUs run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-	.section .altinstr_replacement,"ax"
-1:	.byte 0xeb				/* jmp <disp8> */
-	.byte (memset_c - memset) - (2f - 1b)	/* offset */
-2:
-	.previous
+	/* Some CPUs support enhanced REP MOVSB/STOSB feature.
+	 * It is recommended to use this when possible.
+	 *
+	 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+	 * instructions.
+	 *
+	 * Otherwise, use original memset function.
+	 *
+	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
+         * feature to implement the right patch order.
+	 */
 	.section .altinstructions,"a"
-	.align 8
-	.quad memset
-	.quad 1b
-	.byte X86_FEATURE_REP_GOOD
-	.byte .Lfinal - memset
-	.byte 2b - 1b
+	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
 	.previous
diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c
new file mode 100644
index 00000000000..76b373af03f
--- /dev/null
+++ b/arch/x86/lib/misc.c
@@ -0,0 +1,21 @@
+/*
+ * Count the digits of @val including a possible sign.
+ *
+ * (Typed on and submitted from hpa's mobile phone.)
+ */
+int num_digits(int val)
+{
+	int m = 10;
+	int d = 1;
+
+	if (val < 0) {
+		d++;
+		val = -val;
+	}
+
+	while (val >= m) {
+		m *= 10;
+		d++;
+	}
+	return d;
+}
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c
index a311cc59b65..8d6ef78b5d0 100644
--- a/arch/x86/lib/msr-reg-export.c
+++ b/arch/x86/lib/msr-reg-export.c
@@ -1,5 +1,5 @@
 #include <linux/module.h>
 #include <asm/msr.h>
 
-EXPORT_SYMBOL(native_rdmsr_safe_regs);
-EXPORT_SYMBOL(native_wrmsr_safe_regs);
+EXPORT_SYMBOL(rdmsr_safe_regs);
+EXPORT_SYMBOL(wrmsr_safe_regs);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index 69fa10623f2..f6d13eefad1 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -6,13 +6,13 @@
 
 #ifdef CONFIG_X86_64
 /*
- * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
+ * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
  *
  * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
  *
  */
 .macro op_safe_regs op
-ENTRY(native_\op\()_safe_regs)
+ENTRY(\op\()_safe_regs)
 	CFI_STARTPROC
 	pushq_cfi %rbx
 	pushq_cfi %rbp
@@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs)
 
 	_ASM_EXTABLE(1b, 3b)
 	CFI_ENDPROC
-ENDPROC(native_\op\()_safe_regs)
+ENDPROC(\op\()_safe_regs)
 .endm
 
 #else /* X86_32 */
 
 .macro op_safe_regs op
-ENTRY(native_\op\()_safe_regs)
+ENTRY(\op\()_safe_regs)
 	CFI_STARTPROC
 	pushl_cfi %ebx
 	pushl_cfi %ebp
@@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs)
 
 	_ASM_EXTABLE(1b, 3b)
 	CFI_ENDPROC
-ENDPROC(native_\op\()_safe_regs)
+ENDPROC(\op\()_safe_regs)
 .endm
 
 #endif
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index a6b1b86d225..518532e6a3f 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -47,6 +47,21 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 }
 EXPORT_SYMBOL(rdmsr_on_cpu);
 
+int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+	int err;
+	struct msr_info rv;
+
+	memset(&rv, 0, sizeof(rv));
+
+	rv.msr_no = msr_no;
+	err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+	*q = rv.reg.q;
+
+	return err;
+}
+EXPORT_SYMBOL(rdmsrl_on_cpu);
+
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 {
 	int err;
@@ -63,6 +78,22 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 }
 EXPORT_SYMBOL(wrmsr_on_cpu);
 
+int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+	int err;
+	struct msr_info rv;
+
+	memset(&rv, 0, sizeof(rv));
+
+	rv.msr_no = msr_no;
+	rv.reg.q = q;
+
+	err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+	return err;
+}
+EXPORT_SYMBOL(wrmsrl_on_cpu);
+
 static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
 			    struct msr *msrs,
 			    void (*msr_func) (void *info))
@@ -159,6 +190,37 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 }
 EXPORT_SYMBOL(wrmsr_safe_on_cpu);
 
+int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+	int err;
+	struct msr_info rv;
+
+	memset(&rv, 0, sizeof(rv));
+
+	rv.msr_no = msr_no;
+	rv.reg.q = q;
+
+	err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+	return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsrl_safe_on_cpu);
+
+int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+	int err;
+	struct msr_info rv;
+
+	memset(&rv, 0, sizeof(rv));
+
+	rv.msr_no = msr_no;
+	err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+	*q = rv.reg.q;
+
+	return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsrl_safe_on_cpu);
+
 /*
  * These variants are significantly slower, but allows control over
  * the entire 32-bit GPR set.
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 8f8eebdca7d..43623739c7c 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -8,7 +8,7 @@ struct msr *msrs_alloc(void)
 
 	msrs = alloc_percpu(struct msr);
 	if (!msrs) {
-		pr_warning("%s: error allocating msrs\n", __func__);
+		pr_warn("%s: error allocating msrs\n", __func__);
 		return NULL;
 	}
 
@@ -21,3 +21,90 @@ void msrs_free(struct msr *msrs)
 	free_percpu(msrs);
 }
 EXPORT_SYMBOL(msrs_free);
+
+/**
+ * Read an MSR with error handling
+ *
+ * @msr: MSR to read
+ * @m: value to read into
+ *
+ * It returns read data only on success, otherwise it doesn't change the output
+ * argument @m.
+ *
+ */
+int msr_read(u32 msr, struct msr *m)
+{
+	int err;
+	u64 val;
+
+	err = rdmsrl_safe(msr, &val);
+	if (!err)
+		m->q = val;
+
+	return err;
+}
+
+/**
+ * Write an MSR with error handling
+ *
+ * @msr: MSR to write
+ * @m: value to write
+ */
+int msr_write(u32 msr, struct msr *m)
+{
+	return wrmsrl_safe(msr, m->q);
+}
+
+static inline int __flip_bit(u32 msr, u8 bit, bool set)
+{
+	struct msr m, m1;
+	int err = -EINVAL;
+
+	if (bit > 63)
+		return err;
+
+	err = msr_read(msr, &m);
+	if (err)
+		return err;
+
+	m1 = m;
+	if (set)
+		m1.q |=  BIT_64(bit);
+	else
+		m1.q &= ~BIT_64(bit);
+
+	if (m1.q == m.q)
+		return 0;
+
+	err = msr_write(msr, &m1);
+	if (err)
+		return err;
+
+	return 1;
+}
+
+/**
+ * Set @bit in a MSR @msr.
+ *
+ * Retval:
+ * < 0: An error was encountered.
+ * = 0: Bit was already set.
+ * > 0: Hardware accepted the MSR write.
+ */
+int msr_set_bit(u32 msr, u8 bit)
+{
+	return __flip_bit(msr, bit, true);
+}
+
+/**
+ * Clear @bit in a MSR @msr.
+ *
+ * Retval:
+ * < 0: An error was encountered.
+ * = 0: Bit was already cleared.
+ * > 0: Hardware accepted the MSR write.
+ */
+int msr_clear_bit(u32 msr, u8 bit)
+{
+	return __flip_bit(msr, bit, false);
+}
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 36b0d15ae6e..fc6ba17a7ee 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -15,6 +15,7 @@
 #include <asm/thread_info.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
+#include <asm/smap.h>
 
 
 /*
@@ -31,7 +32,8 @@
 
 #define ENTER	CFI_STARTPROC ; \
 		GET_THREAD_INFO(%_ASM_BX)
-#define EXIT	ret ; \
+#define EXIT	ASM_CLAC ;	\
+		ret ;		\
 		CFI_ENDPROC
 
 .text
@@ -39,6 +41,7 @@ ENTRY(__put_user_1)
 	ENTER
 	cmp TI_addr_limit(%_ASM_BX),%_ASM_CX
 	jae bad_put_user
+	ASM_STAC
 1:	movb %al,(%_ASM_CX)
 	xor %eax,%eax
 	EXIT
@@ -50,6 +53,7 @@ ENTRY(__put_user_2)
 	sub $1,%_ASM_BX
 	cmp %_ASM_BX,%_ASM_CX
 	jae bad_put_user
+	ASM_STAC
 2:	movw %ax,(%_ASM_CX)
 	xor %eax,%eax
 	EXIT
@@ -61,6 +65,7 @@ ENTRY(__put_user_4)
 	sub $3,%_ASM_BX
 	cmp %_ASM_BX,%_ASM_CX
 	jae bad_put_user
+	ASM_STAC
 3:	movl %eax,(%_ASM_CX)
 	xor %eax,%eax
 	EXIT
@@ -72,6 +77,7 @@ ENTRY(__put_user_8)
 	sub $7,%_ASM_BX
 	cmp %_ASM_BX,%_ASM_CX
 	jae bad_put_user
+	ASM_STAC
 4:	mov %_ASM_AX,(%_ASM_CX)
 #ifdef CONFIG_X86_32
 5:	movl %edx,4(%_ASM_CX)
@@ -86,12 +92,10 @@ bad_put_user:
 	EXIT
 END(bad_put_user)
 
-.section __ex_table,"a"
-	_ASM_PTR 1b,bad_put_user
-	_ASM_PTR 2b,bad_put_user
-	_ASM_PTR 3b,bad_put_user
-	_ASM_PTR 4b,bad_put_user
+	_ASM_EXTABLE(1b,bad_put_user)
+	_ASM_EXTABLE(2b,bad_put_user)
+	_ASM_EXTABLE(3b,bad_put_user)
+	_ASM_EXTABLE(4b,bad_put_user)
 #ifdef CONFIG_X86_32
-	_ASM_PTR 5b,bad_put_user
+	_ASM_EXTABLE(5b,bad_put_user)
 #endif
-.previous
diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S
new file mode 100644
index 00000000000..1cad22139c8
--- /dev/null
+++ b/arch/x86/lib/rwlock.S
@@ -0,0 +1,44 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/rwlock.h>
+
+#ifdef CONFIG_X86_32
+# define __lock_ptr eax
+#else
+# define __lock_ptr rdi
+#endif
+
+ENTRY(__write_lock_failed)
+	CFI_STARTPROC
+	FRAME
+0:	LOCK_PREFIX
+	WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
+1:	rep; nop
+	cmpl	$WRITE_LOCK_CMP, (%__lock_ptr)
+	jne	1b
+	LOCK_PREFIX
+	WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
+	jnz	0b
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+END(__write_lock_failed)
+
+ENTRY(__read_lock_failed)
+	CFI_STARTPROC
+	FRAME
+0:	LOCK_PREFIX
+	READ_LOCK_SIZE(inc) (%__lock_ptr)
+1:	rep; nop
+	READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
+	js	1b
+	LOCK_PREFIX
+	READ_LOCK_SIZE(dec) (%__lock_ptr)
+	js	0b
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
deleted file mode 100644
index 05ea55f7140..00000000000
--- a/arch/x86/lib/rwlock_64.S
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Slow paths of read/write spinlocks. */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
-
-/* rdi:	pointer to rwlock_t */
-ENTRY(__write_lock_failed)
-	CFI_STARTPROC
-	LOCK_PREFIX
-	addl $RW_LOCK_BIAS,(%rdi)
-1:	rep
-	nop
-	cmpl $RW_LOCK_BIAS,(%rdi)
-	jne 1b
-	LOCK_PREFIX
-	subl $RW_LOCK_BIAS,(%rdi)
-	jnz  __write_lock_failed
-	ret
-	CFI_ENDPROC
-END(__write_lock_failed)
-
-/* rdi:	pointer to rwlock_t */
-ENTRY(__read_lock_failed)
-	CFI_STARTPROC
-	LOCK_PREFIX
-	incl (%rdi)
-1:	rep
-	nop
-	cmpl $1,(%rdi)
-	js 1b
-	LOCK_PREFIX
-	decl (%rdi)
-	js __read_lock_failed
-	ret
-	CFI_ENDPROC
-END(__read_lock_failed)
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
new file mode 100644
index 00000000000..5dff5f04246
--- /dev/null
+++ b/arch/x86/lib/rwsem.S
@@ -0,0 +1,136 @@
+/*
+ * x86 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+#define __ASM_HALF_REG(reg)	__ASM_SEL(reg, e##reg)
+#define __ASM_HALF_SIZE(inst)	__ASM_SEL(inst##w, inst##l)
+
+#ifdef CONFIG_X86_32
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allow us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax whish is either a return
+ * value or just clobbered..
+ */
+
+#define save_common_regs \
+	pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+
+#define restore_common_regs \
+	popl_cfi %ecx; CFI_RESTORE ecx
+
+	/* Avoid uglifying the argument copying x86-64 needs to do. */
+	.macro movq src, dst
+	.endm
+
+#else
+
+/*
+ * x86-64 rwsem wrappers
+ *
+ * This interfaces the inline asm code to the slow-path
+ * C routines. We need to save the call-clobbered regs
+ * that the asm does not mark as clobbered, and move the
+ * argument from %rax to %rdi.
+ *
+ * NOTE! We don't need to save %rax, because the functions
+ * will always return the semaphore pointer in %rax (which
+ * is also the input argument to these helpers)
+ *
+ * The following can clobber %rdx because the asm clobbers it:
+ *   call_rwsem_down_write_failed
+ *   call_rwsem_wake
+ * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
+ */
+
+#define save_common_regs \
+	pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
+	pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
+	pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
+	pushq_cfi %r8;  CFI_REL_OFFSET r8,  0; \
+	pushq_cfi %r9;  CFI_REL_OFFSET r9,  0; \
+	pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
+	pushq_cfi %r11; CFI_REL_OFFSET r11, 0
+
+#define restore_common_regs \
+	popq_cfi %r11; CFI_RESTORE r11; \
+	popq_cfi %r10; CFI_RESTORE r10; \
+	popq_cfi %r9;  CFI_RESTORE r9; \
+	popq_cfi %r8;  CFI_RESTORE r8; \
+	popq_cfi %rcx; CFI_RESTORE rcx; \
+	popq_cfi %rsi; CFI_RESTORE rsi; \
+	popq_cfi %rdi; CFI_RESTORE rdi
+
+#endif
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed)
+	CFI_STARTPROC
+	save_common_regs
+	__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+	CFI_REL_OFFSET __ASM_REG(dx), 0
+	movq %rax,%rdi
+	call rwsem_down_read_failed
+	__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+	CFI_RESTORE __ASM_REG(dx)
+	restore_common_regs
+	ret
+	CFI_ENDPROC
+ENDPROC(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_write_failed)
+	CFI_STARTPROC
+	save_common_regs
+	movq %rax,%rdi
+	call rwsem_down_write_failed
+	restore_common_regs
+	ret
+	CFI_ENDPROC
+ENDPROC(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_wake)
+	CFI_STARTPROC
+	/* do nothing if still outstanding active readers */
+	__ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
+	jnz 1f
+	save_common_regs
+	movq %rax,%rdi
+	call rwsem_wake
+	restore_common_regs
+1:	ret
+	CFI_ENDPROC
+ENDPROC(call_rwsem_wake)
+
+ENTRY(call_rwsem_downgrade_wake)
+	CFI_STARTPROC
+	save_common_regs
+	__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+	CFI_REL_OFFSET __ASM_REG(dx), 0
+	movq %rax,%rdi
+	call rwsem_downgrade_wake
+	__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+	CFI_RESTORE __ASM_REG(dx)
+	restore_common_regs
+	ret
+	CFI_ENDPROC
+ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
deleted file mode 100644
index 648fe474178..00000000000
--- a/arch/x86/lib/semaphore_32.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
-	.section .sched.text, "ax"
-
-/*
- * rw spinlock fallbacks
- */
-#ifdef CONFIG_SMP
-ENTRY(__write_lock_failed)
-	CFI_STARTPROC simple
-	FRAME
-2: 	LOCK_PREFIX
-	addl	$ RW_LOCK_BIAS,(%eax)
-1:	rep; nop
-	cmpl	$ RW_LOCK_BIAS,(%eax)
-	jne	1b
-	LOCK_PREFIX
-	subl	$ RW_LOCK_BIAS,(%eax)
-	jnz	2b
-	ENDFRAME
-	ret
-	CFI_ENDPROC
-	ENDPROC(__write_lock_failed)
-
-ENTRY(__read_lock_failed)
-	CFI_STARTPROC
-	FRAME
-2: 	LOCK_PREFIX
-	incl	(%eax)
-1:	rep; nop
-	cmpl	$1,(%eax)
-	js	1b
-	LOCK_PREFIX
-	decl	(%eax)
-	js	2b
-	ENDFRAME
-	ret
-	CFI_ENDPROC
-	ENDPROC(__read_lock_failed)
-
-#endif
-
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_down_read_failed)
-	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET ecx,0
-	push %edx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET edx,0
-	call rwsem_down_read_failed
-	pop %edx
-	CFI_ADJUST_CFA_OFFSET -4
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
-	ret
-	CFI_ENDPROC
-	ENDPROC(call_rwsem_down_read_failed)
-
-ENTRY(call_rwsem_down_write_failed)
-	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET ecx,0
-	calll rwsem_down_write_failed
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
-	ret
-	CFI_ENDPROC
-	ENDPROC(call_rwsem_down_write_failed)
-
-ENTRY(call_rwsem_wake)
-	CFI_STARTPROC
-	decw %dx    /* do nothing if still outstanding active readers */
-	jnz 1f
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET ecx,0
-	call rwsem_wake
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
-1:	ret
-	CFI_ENDPROC
-	ENDPROC(call_rwsem_wake)
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_downgrade_wake)
-	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET ecx,0
-	push %edx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET edx,0
-	call rwsem_downgrade_wake
-	pop %edx
-	CFI_ADJUST_CFA_OFFSET -4
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
-	ret
-	CFI_ENDPROC
-	ENDPROC(call_rwsem_downgrade_wake)
-
-#endif
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 82004d2bf05..bd59090825d 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -164,15 +164,13 @@ EXPORT_SYMBOL(strchr);
 size_t strlen(const char *s)
 {
 	int d0;
-	int res;
+	size_t res;
 	asm volatile("repne\n\t"
-		"scasb\n\t"
-		"notl %0\n\t"
-		"decl %0"
+		"scasb"
 		: "=c" (res), "=&D" (d0)
 		: "1" (s), "a" (0), "0" (0xffffffffu)
 		: "memory");
-	return res;
+	return ~res - 1;
 }
 EXPORT_SYMBOL(strlen);
 #endif
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e00ec..28f85c91671 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -4,26 +4,8 @@
  *  (inspired by Andi Kleen's thunk_64.S)
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
-
 	#include <linux/linkage.h>
-
-#define ARCH_TRACE_IRQS_ON			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_on;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_off;		\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
+	#include <asm/asm.h>
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/* put return address in eax (arg1) */
@@ -40,6 +22,7 @@
 	popl %ecx
 	popl %eax
 	ret
+	_ASM_NOKPROBE(\name)
 	.endm
 
 	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5a542..92d9feaff42 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -5,77 +5,44 @@
  * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm.h>
 
-	#include <linux/linkage.h>
-	#include <asm/dwarf2.h>
-	#include <asm/calling.h>			
-	#include <asm/rwlock.h>
-		
-	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */ 	
-	.macro thunk name,func
+	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */
+	.macro THUNK name, func, put_ret_addr_in_rdi=0
 	.globl \name
-\name:	
+\name:
 	CFI_STARTPROC
-	SAVE_ARGS
-	call \func
-	jmp  restore
-	CFI_ENDPROC
-	.endm
 
-	/* rdi:	arg1 ... normal C conventions. rax is passed from C. */ 	
-	.macro thunk_retrax name,func
-	.globl \name
-\name:	
-	CFI_STARTPROC
+	/* this one pushes 9 elems, the next one would be %rIP */
 	SAVE_ARGS
-	call \func
-	jmp  restore_norax
-	CFI_ENDPROC
-	.endm
-	
 
-	.section .sched.text, "ax"
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
-	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
-	thunk rwsem_wake_thunk,rwsem_wake
-	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
-#endif	
-	
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* put return address in rdi (arg1) */
-	.macro thunk_ra name,func
-	.globl \name
-\name:
-	CFI_STARTPROC
-	SAVE_ARGS
-	/* SAVE_ARGS pushs 9 elements */
-	/* the next element would be the rip */
-	movq 9*8(%rsp), %rdi
+	.if \put_ret_addr_in_rdi
+	movq_cfi_restore 9*8, rdi
+	.endif
+
 	call \func
 	jmp  restore
 	CFI_ENDPROC
+	_ASM_NOKPROBE(\name)
 	.endm
 
-	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
-	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#ifdef CONFIG_TRACE_IRQFLAGS
+	THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+	THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-	thunk lockdep_sys_exit_thunk,lockdep_sys_exit
+	THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
 #endif
-	
+
 	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
 	CFI_STARTPROC
 	SAVE_ARGS
 restore:
 	RESTORE_ARGS
-	ret	
-	CFI_ENDPROC
-	
-	CFI_STARTPROC
-	SAVE_ARGS
-restore_norax:	
-	RESTORE_ARGS 1
 	ret
 	CFI_ENDPROC
+	_ASM_NOKPROBE(restore)
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
new file mode 100644
index 00000000000..ddf9ecb53cc
--- /dev/null
+++ b/arch/x86/lib/usercopy.c
@@ -0,0 +1,36 @@
+/*
+ * User address space access functions.
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+#include <asm/word-at-a-time.h>
+#include <linux/sched.h>
+
+/*
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
+ */
+unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long ret;
+
+	if (__range_not_ok(from, n, TASK_SIZE))
+		return 0;
+
+	/*
+	 * Even though this function is typically called from NMI/IRQ context
+	 * disable pagefaults so that its behaviour is consistent even when
+	 * called form other contexts.
+	 */
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(to, from, n);
+	pagefault_enable();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index e218d5df85f..e2f5e21c03b 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -13,6 +13,7 @@
 #include <linux/interrupt.h>
 #include <asm/uaccess.h>
 #include <asm/mmx.h>
+#include <asm/asm.h>
 
 #ifdef CONFIG_X86_INTEL_USERCOPY
 /*
@@ -33,93 +34,6 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
 	__movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n))
 
 /*
- * Copy a null terminated string from userspace.
- */
-
-#define __do_strncpy_from_user(dst, src, count, res)			   \
-do {									   \
-	int __d0, __d1, __d2;						   \
-	might_fault();							   \
-	__asm__ __volatile__(						   \
-		"	testl %1,%1\n"					   \
-		"	jz 2f\n"					   \
-		"0:	lodsb\n"					   \
-		"	stosb\n"					   \
-		"	testb %%al,%%al\n"				   \
-		"	jz 1f\n"					   \
-		"	decl %1\n"					   \
-		"	jnz 0b\n"					   \
-		"1:	subl %1,%0\n"					   \
-		"2:\n"							   \
-		".section .fixup,\"ax\"\n"				   \
-		"3:	movl %5,%0\n"					   \
-		"	jmp 2b\n"					   \
-		".previous\n"						   \
-		_ASM_EXTABLE(0b,3b)					   \
-		: "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1),	   \
-		  "=&D" (__d2)						   \
-		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
-		: "memory");						   \
-} while (0)
-
-/**
- * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking.
- * @dst:   Destination address, in kernel space.  This buffer must be at
- *         least @count bytes long.
- * @src:   Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- * Caller must check the specified block with access_ok() before calling
- * this function.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-long
-__strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	long res;
-	__do_strncpy_from_user(dst, src, count, res);
-	return res;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
-
-/**
- * strncpy_from_user: - Copy a NUL terminated string from userspace.
- * @dst:   Destination address, in kernel space.  This buffer must be at
- *         least @count bytes long.
- * @src:   Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-long
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	long res = -EFAULT;
-	if (access_ok(VERIFY_READ, src, 1))
-		__do_strncpy_from_user(dst, src, count, res);
-	return res;
-}
-EXPORT_SYMBOL(strncpy_from_user);
-
-/*
  * Zero Userspace
  */
 
@@ -128,10 +42,11 @@ do {									\
 	int __d0;							\
 	might_fault();							\
 	__asm__ __volatile__(						\
+		ASM_STAC "\n"						\
 		"0:	rep; stosl\n"					\
 		"	movl %2,%0\n"					\
 		"1:	rep; stosb\n"					\
-		"2:\n"							\
+		"2: " ASM_CLAC "\n"					\
 		".section .fixup,\"ax\"\n"				\
 		"3:	lea 0(%2,%0,4),%0\n"				\
 		"	jmp 2b\n"					\
@@ -181,50 +96,6 @@ __clear_user(void __user *to, unsigned long n)
 }
 EXPORT_SYMBOL(__clear_user);
 
-/**
- * strnlen_user: - Get the size of a string in user space.
- * @s: The string to measure.
- * @n: The maximum valid length
- *
- * Get the size of a NUL-terminated string in user space.
- *
- * Returns the size of the string INCLUDING the terminating NUL.
- * On exception, returns 0.
- * If the string is too long, returns a value greater than @n.
- */
-long strnlen_user(const char __user *s, long n)
-{
-	unsigned long mask = -__addr_ok(s);
-	unsigned long res, tmp;
-
-	might_fault();
-
-	__asm__ __volatile__(
-		"	testl %0, %0\n"
-		"	jz 3f\n"
-		"	andl %0,%%ecx\n"
-		"0:	repne; scasb\n"
-		"	setne %%al\n"
-		"	subl %%ecx,%0\n"
-		"	addl %0,%%eax\n"
-		"1:\n"
-		".section .fixup,\"ax\"\n"
-		"2:	xorl %%eax,%%eax\n"
-		"	jmp 1b\n"
-		"3:	movb $1,%%al\n"
-		"	jmp 1b\n"
-		".previous\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4\n"
-		"	.long 0b,2b\n"
-		".previous"
-		:"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp)
-		:"0" (n), "1" (s), "2" (0), "3" (mask)
-		:"cc");
-	return res & mask;
-}
-EXPORT_SYMBOL(strnlen_user);
-
 #ifdef CONFIG_X86_INTEL_USERCOPY
 static unsigned long
 __copy_user_intel(void __user *to, const void *from, unsigned long size)
@@ -286,47 +157,44 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
 		       "101:   lea 0(%%eax,%0,4),%0\n"
 		       "       jmp 100b\n"
 		       ".previous\n"
-		       ".section __ex_table,\"a\"\n"
-		       "       .align 4\n"
-		       "       .long 1b,100b\n"
-		       "       .long 2b,100b\n"
-		       "       .long 3b,100b\n"
-		       "       .long 4b,100b\n"
-		       "       .long 5b,100b\n"
-		       "       .long 6b,100b\n"
-		       "       .long 7b,100b\n"
-		       "       .long 8b,100b\n"
-		       "       .long 9b,100b\n"
-		       "       .long 10b,100b\n"
-		       "       .long 11b,100b\n"
-		       "       .long 12b,100b\n"
-		       "       .long 13b,100b\n"
-		       "       .long 14b,100b\n"
-		       "       .long 15b,100b\n"
-		       "       .long 16b,100b\n"
-		       "       .long 17b,100b\n"
-		       "       .long 18b,100b\n"
-		       "       .long 19b,100b\n"
-		       "       .long 20b,100b\n"
-		       "       .long 21b,100b\n"
-		       "       .long 22b,100b\n"
-		       "       .long 23b,100b\n"
-		       "       .long 24b,100b\n"
-		       "       .long 25b,100b\n"
-		       "       .long 26b,100b\n"
-		       "       .long 27b,100b\n"
-		       "       .long 28b,100b\n"
-		       "       .long 29b,100b\n"
-		       "       .long 30b,100b\n"
-		       "       .long 31b,100b\n"
-		       "       .long 32b,100b\n"
-		       "       .long 33b,100b\n"
-		       "       .long 34b,100b\n"
-		       "       .long 35b,100b\n"
-		       "       .long 36b,100b\n"
-		       "       .long 37b,100b\n"
-		       "       .long 99b,101b\n"
-		       ".previous"
+		       _ASM_EXTABLE(1b,100b)
+		       _ASM_EXTABLE(2b,100b)
+		       _ASM_EXTABLE(3b,100b)
+		       _ASM_EXTABLE(4b,100b)
+		       _ASM_EXTABLE(5b,100b)
+		       _ASM_EXTABLE(6b,100b)
+		       _ASM_EXTABLE(7b,100b)
+		       _ASM_EXTABLE(8b,100b)
+		       _ASM_EXTABLE(9b,100b)
+		       _ASM_EXTABLE(10b,100b)
+		       _ASM_EXTABLE(11b,100b)
+		       _ASM_EXTABLE(12b,100b)
+		       _ASM_EXTABLE(13b,100b)
+		       _ASM_EXTABLE(14b,100b)
+		       _ASM_EXTABLE(15b,100b)
+		       _ASM_EXTABLE(16b,100b)
+		       _ASM_EXTABLE(17b,100b)
+		       _ASM_EXTABLE(18b,100b)
+		       _ASM_EXTABLE(19b,100b)
+		       _ASM_EXTABLE(20b,100b)
+		       _ASM_EXTABLE(21b,100b)
+		       _ASM_EXTABLE(22b,100b)
+		       _ASM_EXTABLE(23b,100b)
+		       _ASM_EXTABLE(24b,100b)
+		       _ASM_EXTABLE(25b,100b)
+		       _ASM_EXTABLE(26b,100b)
+		       _ASM_EXTABLE(27b,100b)
+		       _ASM_EXTABLE(28b,100b)
+		       _ASM_EXTABLE(29b,100b)
+		       _ASM_EXTABLE(30b,100b)
+		       _ASM_EXTABLE(31b,100b)
+		       _ASM_EXTABLE(32b,100b)
+		       _ASM_EXTABLE(33b,100b)
+		       _ASM_EXTABLE(34b,100b)
+		       _ASM_EXTABLE(35b,100b)
+		       _ASM_EXTABLE(36b,100b)
+		       _ASM_EXTABLE(37b,100b)
+		       _ASM_EXTABLE(99b,101b)
 		       : "=&c"(size), "=&D" (d0), "=&S" (d1)
 		       :  "1"(to), "2"(from), "0"(size)
 		       : "eax", "edx", "memory");
@@ -399,29 +267,26 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
 		       "        popl %0\n"
 		       "        jmp 8b\n"
 		       ".previous\n"
-		       ".section __ex_table,\"a\"\n"
-		       "	.align 4\n"
-		       "	.long 0b,16b\n"
-		       "	.long 1b,16b\n"
-		       "	.long 2b,16b\n"
-		       "	.long 21b,16b\n"
-		       "	.long 3b,16b\n"
-		       "	.long 31b,16b\n"
-		       "	.long 4b,16b\n"
-		       "	.long 41b,16b\n"
-		       "	.long 10b,16b\n"
-		       "	.long 51b,16b\n"
-		       "	.long 11b,16b\n"
-		       "	.long 61b,16b\n"
-		       "	.long 12b,16b\n"
-		       "	.long 71b,16b\n"
-		       "	.long 13b,16b\n"
-		       "	.long 81b,16b\n"
-		       "	.long 14b,16b\n"
-		       "	.long 91b,16b\n"
-		       "	.long 6b,9b\n"
-		       "        .long 7b,16b\n"
-		       ".previous"
+		       _ASM_EXTABLE(0b,16b)
+		       _ASM_EXTABLE(1b,16b)
+		       _ASM_EXTABLE(2b,16b)
+		       _ASM_EXTABLE(21b,16b)
+		       _ASM_EXTABLE(3b,16b)
+		       _ASM_EXTABLE(31b,16b)
+		       _ASM_EXTABLE(4b,16b)
+		       _ASM_EXTABLE(41b,16b)
+		       _ASM_EXTABLE(10b,16b)
+		       _ASM_EXTABLE(51b,16b)
+		       _ASM_EXTABLE(11b,16b)
+		       _ASM_EXTABLE(61b,16b)
+		       _ASM_EXTABLE(12b,16b)
+		       _ASM_EXTABLE(71b,16b)
+		       _ASM_EXTABLE(13b,16b)
+		       _ASM_EXTABLE(81b,16b)
+		       _ASM_EXTABLE(14b,16b)
+		       _ASM_EXTABLE(91b,16b)
+		       _ASM_EXTABLE(6b,9b)
+		       _ASM_EXTABLE(7b,16b)
 		       : "=&c"(size), "=&D" (d0), "=&S" (d1)
 		       :  "1"(to), "2"(from), "0"(size)
 		       : "eax", "edx", "memory");
@@ -501,29 +366,26 @@ static unsigned long __copy_user_zeroing_intel_nocache(void *to,
 	       "        popl %0\n"
 	       "        jmp 8b\n"
 	       ".previous\n"
-	       ".section __ex_table,\"a\"\n"
-	       "	.align 4\n"
-	       "	.long 0b,16b\n"
-	       "	.long 1b,16b\n"
-	       "	.long 2b,16b\n"
-	       "	.long 21b,16b\n"
-	       "	.long 3b,16b\n"
-	       "	.long 31b,16b\n"
-	       "	.long 4b,16b\n"
-	       "	.long 41b,16b\n"
-	       "	.long 10b,16b\n"
-	       "	.long 51b,16b\n"
-	       "	.long 11b,16b\n"
-	       "	.long 61b,16b\n"
-	       "	.long 12b,16b\n"
-	       "	.long 71b,16b\n"
-	       "	.long 13b,16b\n"
-	       "	.long 81b,16b\n"
-	       "	.long 14b,16b\n"
-	       "	.long 91b,16b\n"
-	       "	.long 6b,9b\n"
-	       "        .long 7b,16b\n"
-	       ".previous"
+	       _ASM_EXTABLE(0b,16b)
+	       _ASM_EXTABLE(1b,16b)
+	       _ASM_EXTABLE(2b,16b)
+	       _ASM_EXTABLE(21b,16b)
+	       _ASM_EXTABLE(3b,16b)
+	       _ASM_EXTABLE(31b,16b)
+	       _ASM_EXTABLE(4b,16b)
+	       _ASM_EXTABLE(41b,16b)
+	       _ASM_EXTABLE(10b,16b)
+	       _ASM_EXTABLE(51b,16b)
+	       _ASM_EXTABLE(11b,16b)
+	       _ASM_EXTABLE(61b,16b)
+	       _ASM_EXTABLE(12b,16b)
+	       _ASM_EXTABLE(71b,16b)
+	       _ASM_EXTABLE(13b,16b)
+	       _ASM_EXTABLE(81b,16b)
+	       _ASM_EXTABLE(14b,16b)
+	       _ASM_EXTABLE(91b,16b)
+	       _ASM_EXTABLE(6b,9b)
+	       _ASM_EXTABLE(7b,16b)
 	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
 	       :  "1"(to), "2"(from), "0"(size)
 	       : "eax", "edx", "memory");
@@ -592,29 +454,26 @@ static unsigned long __copy_user_intel_nocache(void *to,
 	       "9:      lea 0(%%eax,%0,4),%0\n"
 	       "16:     jmp 8b\n"
 	       ".previous\n"
-	       ".section __ex_table,\"a\"\n"
-	       "	.align 4\n"
-	       "	.long 0b,16b\n"
-	       "	.long 1b,16b\n"
-	       "	.long 2b,16b\n"
-	       "	.long 21b,16b\n"
-	       "	.long 3b,16b\n"
-	       "	.long 31b,16b\n"
-	       "	.long 4b,16b\n"
-	       "	.long 41b,16b\n"
-	       "	.long 10b,16b\n"
-	       "	.long 51b,16b\n"
-	       "	.long 11b,16b\n"
-	       "	.long 61b,16b\n"
-	       "	.long 12b,16b\n"
-	       "	.long 71b,16b\n"
-	       "	.long 13b,16b\n"
-	       "	.long 81b,16b\n"
-	       "	.long 14b,16b\n"
-	       "	.long 91b,16b\n"
-	       "	.long 6b,9b\n"
-	       "        .long 7b,16b\n"
-	       ".previous"
+	       _ASM_EXTABLE(0b,16b)
+	       _ASM_EXTABLE(1b,16b)
+	       _ASM_EXTABLE(2b,16b)
+	       _ASM_EXTABLE(21b,16b)
+	       _ASM_EXTABLE(3b,16b)
+	       _ASM_EXTABLE(31b,16b)
+	       _ASM_EXTABLE(4b,16b)
+	       _ASM_EXTABLE(41b,16b)
+	       _ASM_EXTABLE(10b,16b)
+	       _ASM_EXTABLE(51b,16b)
+	       _ASM_EXTABLE(11b,16b)
+	       _ASM_EXTABLE(61b,16b)
+	       _ASM_EXTABLE(12b,16b)
+	       _ASM_EXTABLE(71b,16b)
+	       _ASM_EXTABLE(13b,16b)
+	       _ASM_EXTABLE(81b,16b)
+	       _ASM_EXTABLE(14b,16b)
+	       _ASM_EXTABLE(91b,16b)
+	       _ASM_EXTABLE(6b,9b)
+	       _ASM_EXTABLE(7b,16b)
 	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
 	       :  "1"(to), "2"(from), "0"(size)
 	       : "eax", "edx", "memory");
@@ -661,12 +520,9 @@ do {									\
 		"3:	lea 0(%3,%0,4),%0\n"				\
 		"	jmp 2b\n"					\
 		".previous\n"						\
-		".section __ex_table,\"a\"\n"				\
-		"	.align 4\n"					\
-		"	.long 4b,5b\n"					\
-		"	.long 0b,3b\n"					\
-		"	.long 1b,2b\n"					\
-		".previous"						\
+		_ASM_EXTABLE(4b,5b)					\
+		_ASM_EXTABLE(0b,3b)					\
+		_ASM_EXTABLE(1b,2b)					\
 		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)	\
 		: "3"(size), "0"(size), "1"(to), "2"(from)		\
 		: "memory");						\
@@ -703,12 +559,9 @@ do {									\
 		"	popl %0\n"					\
 		"	jmp 2b\n"					\
 		".previous\n"						\
-		".section __ex_table,\"a\"\n"				\
-		"	.align 4\n"					\
-		"	.long 4b,5b\n"					\
-		"	.long 0b,3b\n"					\
-		"	.long 1b,6b\n"					\
-		".previous"						\
+		_ASM_EXTABLE(4b,5b)					\
+		_ASM_EXTABLE(0b,3b)					\
+		_ASM_EXTABLE(1b,6b)					\
 		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)	\
 		: "3"(size), "0"(size), "1"(to), "2"(from)		\
 		: "memory");						\
@@ -717,67 +570,12 @@ do {									\
 unsigned long __copy_to_user_ll(void __user *to, const void *from,
 				unsigned long n)
 {
-#ifndef CONFIG_X86_WP_WORKS_OK
-	if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
-			((unsigned long)to) < TASK_SIZE) {
-		/*
-		 * When we are in an atomic section (see
-		 * mm/filemap.c:file_read_actor), return the full
-		 * length to take the slow path.
-		 */
-		if (in_atomic())
-			return n;
-
-		/*
-		 * CPU does not honor the WP bit when writing
-		 * from supervisory mode, and due to preemption or SMP,
-		 * the page tables can change at any time.
-		 * Do it manually.	Manfred <manfred@colorfullife.com>
-		 */
-		while (n) {
-			unsigned long offset = ((unsigned long)to)%PAGE_SIZE;
-			unsigned long len = PAGE_SIZE - offset;
-			int retval;
-			struct page *pg;
-			void *maddr;
-
-			if (len > n)
-				len = n;
-
-survive:
-			down_read(&current->mm->mmap_sem);
-			retval = get_user_pages(current, current->mm,
-					(unsigned long)to, 1, 1, 0, &pg, NULL);
-
-			if (retval == -ENOMEM && is_global_init(current)) {
-				up_read(&current->mm->mmap_sem);
-				congestion_wait(BLK_RW_ASYNC, HZ/50);
-				goto survive;
-			}
-
-			if (retval != 1) {
-				up_read(&current->mm->mmap_sem);
-				break;
-			}
-
-			maddr = kmap_atomic(pg, KM_USER0);
-			memcpy(maddr + offset, from, len);
-			kunmap_atomic(maddr, KM_USER0);
-			set_page_dirty_lock(pg);
-			put_page(pg);
-			up_read(&current->mm->mmap_sem);
-
-			from += len;
-			to += len;
-			n -= len;
-		}
-		return n;
-	}
-#endif
+	stac();
 	if (movsl_is_ok(to, from, n))
 		__copy_user(to, from, n);
 	else
 		n = __copy_user_intel(to, from, n);
+	clac();
 	return n;
 }
 EXPORT_SYMBOL(__copy_to_user_ll);
@@ -785,10 +583,12 @@ EXPORT_SYMBOL(__copy_to_user_ll);
 unsigned long __copy_from_user_ll(void *to, const void __user *from,
 					unsigned long n)
 {
+	stac();
 	if (movsl_is_ok(to, from, n))
 		__copy_user_zeroing(to, from, n);
 	else
 		n = __copy_user_zeroing_intel(to, from, n);
+	clac();
 	return n;
 }
 EXPORT_SYMBOL(__copy_from_user_ll);
@@ -796,11 +596,13 @@ EXPORT_SYMBOL(__copy_from_user_ll);
 unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
 					 unsigned long n)
 {
+	stac();
 	if (movsl_is_ok(to, from, n))
 		__copy_user(to, from, n);
 	else
 		n = __copy_user_intel((void __user *)to,
 				      (const void *)from, n);
+	clac();
 	return n;
 }
 EXPORT_SYMBOL(__copy_from_user_ll_nozero);
@@ -808,6 +610,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero);
 unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
 					unsigned long n)
 {
+	stac();
 #ifdef CONFIG_X86_INTEL_USERCOPY
 	if (n > 64 && cpu_has_xmm2)
 		n = __copy_user_zeroing_intel_nocache(to, from, n);
@@ -816,6 +619,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
 #else
 	__copy_user_zeroing(to, from, n);
 #endif
+	clac();
 	return n;
 }
 EXPORT_SYMBOL(__copy_from_user_ll_nocache);
@@ -823,6 +627,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache);
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 					unsigned long n)
 {
+	stac();
 #ifdef CONFIG_X86_INTEL_USERCOPY
 	if (n > 64 && cpu_has_xmm2)
 		n = __copy_user_intel_nocache(to, from, n);
@@ -831,6 +636,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
 #else
 	__copy_user(to, from, n);
 #endif
+	clac();
 	return n;
 }
 EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
@@ -848,14 +654,13 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
  * Returns number of bytes that could not be copied.
  * On success, this will be zero.
  */
-unsigned long
-copy_to_user(void __user *to, const void *from, unsigned long n)
+unsigned long _copy_to_user(void __user *to, const void *from, unsigned n)
 {
 	if (access_ok(VERIFY_WRITE, to, n))
 		n = __copy_to_user(to, from, n);
 	return n;
 }
-EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(_copy_to_user);
 
 /**
  * copy_from_user: - Copy a block of data from user space.
@@ -873,8 +678,7 @@ EXPORT_SYMBOL(copy_to_user);
  * If some data could not be copied, this function will pad the copied
  * data to the requested size using zero bytes.
  */
-unsigned long
-_copy_from_user(void *to, const void __user *from, unsigned long n)
+unsigned long _copy_from_user(void *to, const void __user *from, unsigned n)
 {
 	if (access_ok(VERIFY_READ, from, n))
 		n = __copy_from_user(to, from, n);
@@ -883,9 +687,3 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
 	return n;
 }
 EXPORT_SYMBOL(_copy_from_user);
-
-void copy_from_user_overflow(void)
-{
-	WARN(1, "Buffer overflow detected!\n");
-}
-EXPORT_SYMBOL(copy_from_user_overflow);
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index b7c2849ffb6..c905e89e19f 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -9,55 +9,6 @@
 #include <asm/uaccess.h>
 
 /*
- * Copy a null terminated string from userspace.
- */
-
-#define __do_strncpy_from_user(dst,src,count,res)			   \
-do {									   \
-	long __d0, __d1, __d2;						   \
-	might_fault();							   \
-	__asm__ __volatile__(						   \
-		"	testq %1,%1\n"					   \
-		"	jz 2f\n"					   \
-		"0:	lodsb\n"					   \
-		"	stosb\n"					   \
-		"	testb %%al,%%al\n"				   \
-		"	jz 1f\n"					   \
-		"	decq %1\n"					   \
-		"	jnz 0b\n"					   \
-		"1:	subq %1,%0\n"					   \
-		"2:\n"							   \
-		".section .fixup,\"ax\"\n"				   \
-		"3:	movq %5,%0\n"					   \
-		"	jmp 2b\n"					   \
-		".previous\n"						   \
-		_ASM_EXTABLE(0b,3b)					   \
-		: "=&r"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1),	   \
-		  "=&D" (__d2)						   \
-		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
-		: "memory");						   \
-} while (0)
-
-long
-__strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	long res;
-	__do_strncpy_from_user(dst, src, count, res);
-	return res;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
-
-long
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
-	long res = -EFAULT;
-	if (access_ok(VERIFY_READ, src, 1))
-		return __strncpy_from_user(dst, src, count);
-	return res;
-}
-EXPORT_SYMBOL(strncpy_from_user);
-
-/*
  * Zero Userspace
  */
 
@@ -67,6 +18,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 	might_fault();
 	/* no memory constraint because it doesn't change any memory gcc knows
 	   about */
+	stac();
 	asm volatile(
 		"	testq  %[size8],%[size8]\n"
 		"	jz     4f\n"
@@ -89,6 +41,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
 		: [size8] "=&c"(size), [dst] "=&D" (__d0)
 		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
 		  [zero] "r" (0UL), [eight] "r" (8UL));
+	clac();
 	return size;
 }
 EXPORT_SYMBOL(__clear_user);
@@ -101,54 +54,6 @@ unsigned long clear_user(void __user *to, unsigned long n)
 }
 EXPORT_SYMBOL(clear_user);
 
-/*
- * Return the size of a string (including the ending 0)
- *
- * Return 0 on exception, a value greater than N if too long
- */
-
-long __strnlen_user(const char __user *s, long n)
-{
-	long res = 0;
-	char c;
-
-	while (1) {
-		if (res>n)
-			return n+1;
-		if (__get_user(c, s))
-			return 0;
-		if (!c)
-			return res+1;
-		res++;
-		s++;
-	}
-}
-EXPORT_SYMBOL(__strnlen_user);
-
-long strnlen_user(const char __user *s, long n)
-{
-	if (!access_ok(VERIFY_READ, s, 1))
-		return 0;
-	return __strnlen_user(s, n);
-}
-EXPORT_SYMBOL(strnlen_user);
-
-long strlen_user(const char __user *s)
-{
-	long res = 0;
-	char c;
-
-	for (;;) {
-		if (get_user(c, s))
-			return 0;
-		if (!c)
-			return res+1;
-		res++;
-		s++;
-	}
-}
-EXPORT_SYMBOL(strlen_user);
-
 unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
 {
 	if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { 
@@ -163,21 +68,22 @@ EXPORT_SYMBOL(copy_in_user);
  * Since protection fault in copy_from/to_user is not a normal situation,
  * it is not necessary to optimize tail handling.
  */
-unsigned long
+__visible unsigned long
 copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
 {
 	char c;
 	unsigned zero_len;
 
-	for (; len; --len) {
+	for (; len; --len, to++) {
 		if (__get_user_nocheck(c, from++, sizeof(char)))
 			break;
-		if (__put_user_nocheck(c, to++, sizeof(char)))
+		if (__put_user_nocheck(c, to, sizeof(char)))
 			break;
 	}
 
 	for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
 		if (__put_user_nocheck(c, to++, sizeof(char)))
 			break;
+	clac();
 	return len;
 }
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index a793da5e560..1a2be7c6895 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1,5 +1,9 @@
 # x86 Opcode Maps
 #
+# This is (mostly) based on following documentations.
+# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C
+#   (#326018-047US, June 2013)
+#
 #<Opcode maps>
 # Table: table-name
 # Referrer: escaped-name
@@ -15,11 +19,15 @@
 # EndTable
 #
 # AVX Superscripts
-#  (VEX): this opcode can accept VEX prefix.
-#  (oVEX): this opcode requires VEX prefix.
-#  (o128): this opcode only supports 128bit VEX.
-#  (o256): this opcode only supports 256bit VEX.
+#  (v): this opcode requires VEX prefix.
+#  (v1): this opcode only supports 128bit VEX.
 #
+# Last Prefix Superscripts
+#  - (66): the last prefix is 0x66
+#  - (F3): the last prefix is 0xF3
+#  - (F2): the last prefix is 0xF2
+#  - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
+#  - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
 
 Table: one byte opcode
 Referrer:
@@ -199,8 +207,8 @@ a0: MOV AL,Ob
 a1: MOV rAX,Ov
 a2: MOV Ob,AL
 a3: MOV Ov,rAX
-a4: MOVS/B Xb,Yb
-a5: MOVS/W/D/Q Xv,Yv
+a4: MOVS/B Yb,Xb
+a5: MOVS/W/D/Q Yv,Xv
 a6: CMPS/B Xb,Yb
 a7: CMPS/W/D Xv,Yv
 a8: TEST AL,Ib
@@ -210,7 +218,9 @@ ab: STOS/W/D/Q Yv,rAX
 ac: LODS/B AL,Xb
 ad: LODS/W/D/Q rAX,Xv
 ae: SCAS/B AL,Yb
-af: SCAS/W/D/Q rAX,Xv
+# Note: The May 2011 Intel manual shows Xv for the second parameter of the
+# next instruction but Yv is correct
+af: SCAS/W/D/Q rAX,Yv
 # 0xb0 - 0xbf
 b0: MOV AL/R8L,Ib
 b1: MOV CL/R9L,Ib
@@ -233,10 +243,10 @@ c0: Grp2 Eb,Ib (1A)
 c1: Grp2 Ev,Ib (1A)
 c2: RETN Iw (f64)
 c3: RETN
-c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
-c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
-c6: Grp11 Eb,Ib (1A)
-c7: Grp11 Ev,Iz (1A)
+c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
+c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
+c6: Grp11A Eb,Ib (1A)
+c7: Grp11B Ev,Iz (1A)
 c8: ENTER Iw,Ib
 c9: LEAVE (d64)
 ca: RETF Iw
@@ -282,8 +292,8 @@ ef: OUT DX,eAX
 # 0xf0 - 0xff
 f0: LOCK (Prefix)
 f1:
-f2: REPNE (Prefix)
-f3: REP/REPE (Prefix)
+f2: REPNE (Prefix) | XACQUIRE (Prefix)
+f3: REP/REPE (Prefix) | XRELEASE (Prefix)
 f4: HLT
 f5: CMC
 f6: Grp3_1 Eb (1A)
@@ -315,23 +325,29 @@ AVXcode: 1
 0a:
 0b: UD2 (1B)
 0c:
-0d: NOP Ev | GrpP
+# AMD's prefetch group. Intel supports prefetchw(/1) only.
+0d: GrpP
 0e: FEMMS
 # 3DNow! uses the last imm byte as opcode extension.
 0f: 3DNow! Pq,Qq,Ib
 # 0x0f 0x10-0x1f
-10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
-11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
-12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
-13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
-14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
-15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
-16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
-17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
+# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands
+# but it actually has operands. And also, vmovss and vmovsd only accept 128bit.
+# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form.
+# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming
+# Reference A.1
+10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
+11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
+12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
+13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
+14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
+15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
+16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
+17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
 18: Grp16 (1A)
 19:
-1a:
-1b:
+1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv
+1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv
 1c:
 1d:
 1e:
@@ -345,14 +361,14 @@ AVXcode: 1
 25:
 26:
 27:
-28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
-29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
-2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
-2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
-2c: cvttps2pi Ppi,Wps | cvttss2si  Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
-2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
-2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd  Vsd,Wsd (66),(VEX),(o128)
-2f: comiss Vss,Wss (VEX),(o128) | comisd  Vsd,Wsd (66),(VEX),(o128)
+28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
+29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
+2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
+2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
+2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
+2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
+2e: vucomiss Vss,Wss (v1) | vucomisd  Vsd,Wsd (66),(v1)
+2f: vcomiss Vss,Wss (v1) | vcomisd  Vsd,Wsd (66),(v1)
 # 0x0f 0x30-0x3f
 30: WRMSR
 31: RDTSC
@@ -388,65 +404,66 @@ AVXcode: 1
 4e: CMOVLE/NG Gv,Ev
 4f: CMOVNLE/G Gv,Ev
 # 0x0f 0x50-0x5f
-50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
-51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
-52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
-53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
-54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
-55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
-56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
-57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
-58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
-59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
-5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
-5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
-5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
-5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
-5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
-5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
+50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
+51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
+52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
+53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
+54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
+55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
+56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
+57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
+58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
+59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
+5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
+5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
+5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
+5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
+5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
+5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
 # 0x0f 0x60-0x6f
-60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
-61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
-62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
-63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
-64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
-65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
-66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
-67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
-68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
-69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
-6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
-6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
-6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
-6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
-6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
-6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
+60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
+61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
+62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
+63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
+64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
+65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
+66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
+67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
+68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
+69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
+6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
+6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
+6c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
+6d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
+6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
+6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3)
 # 0x0f 0x70-0x7f
-70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128)
+70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
 71: Grp12 (1A)
 72: Grp13 (1A)
 73: Grp14 (1A)
-74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
-75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
-76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
-77: emms/vzeroupper/vzeroall (VEX)
-78: VMREAD Ed/q,Gd/q
-79: VMWRITE Gd/q,Ed/q
+74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
+75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
+76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
+# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX.
+77: emms | vzeroupper | vzeroall
+78: VMREAD Ey,Gy
+79: VMWRITE Gy,Ey
 7a:
 7b:
-7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
-7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
-7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
-7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
+7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2)
+7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2)
+7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
+7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
 # 0x0f 0x80-0x8f
 80: JO Jz (f64)
 81: JNO Jz (f64)
-82: JB/JNAE/JC Jz (f64)
-83: JNB/JAE/JNC Jz (f64)
-84: JZ/JE Jz (f64)
-85: JNZ/JNE Jz (f64)
+82: JB/JC/JNAE Jz (f64)
+83: JAE/JNB/JNC Jz (f64)
+84: JE/JZ Jz (f64)
+85: JNE/JNZ Jz (f64)
 86: JBE/JNA Jz (f64)
-87: JNBE/JA Jz (f64)
+87: JA/JNBE Jz (f64)
 88: JS Jz (f64)
 89: JNS Jz (f64)
 8a: JP/JPE Jz (f64)
@@ -498,22 +515,22 @@ b4: LFS Gv,Mp
 b5: LGS Gv,Mp
 b6: MOVZX Gv,Eb
 b7: MOVZX Gv,Ew
-b8: JMPE | POPCNT Gv,Ev (F3)
+b8: JMPE (!F3) | POPCNT Gv,Ev (F3)
 b9: Grp10 (1A)
 ba: Grp8 Ev,Ib (1A)
 bb: BTC Ev,Gv
-bc: BSF Gv,Ev
-bd: BSR Gv,Ev
+bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3)
+bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3)
 be: MOVSX Gv,Eb
 bf: MOVSX Gv,Ew
 # 0x0f 0xc0-0xcf
 c0: XADD Eb,Gb
 c1: XADD Ev,Gv
-c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
-c3: movnti Md/q,Gd/q
-c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
-c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
-c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
+c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1)
+c3: movnti My,Gy
+c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1)
+c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1)
+c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66)
 c7: Grp9 (1A)
 c8: BSWAP RAX/EAX/R8/R8D
 c9: BSWAP RCX/ECX/R9/R9D
@@ -524,55 +541,55 @@ cd: BSWAP RBP/EBP/R13/R13D
 ce: BSWAP RSI/ESI/R14/R14D
 cf: BSWAP RDI/EDI/R15/R15D
 # 0x0f 0xd0-0xdf
-d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
-d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
-d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
-d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
-d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
-d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
-d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
-d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
-d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
-d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
-da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
-db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
-dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
-dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
-de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
-df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
+d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2)
+d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1)
+d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1)
+d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1)
+d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1)
+d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1)
+d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1)
+d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1)
+d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1)
+da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1)
+db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1)
+dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1)
+dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1)
+de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1)
+df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1)
 # 0x0f 0xe0-0xef
-e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
-e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
-e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
-e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
-e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
-e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
-e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
-e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
-e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
-e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
-ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
-eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
-ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
-ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
-ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
-ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
+e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1)
+e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1)
+e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1)
+e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1)
+e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1)
+e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1)
+e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2)
+e7: movntq Mq,Pq | vmovntdq Mx,Vx (66)
+e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1)
+e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1)
+ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1)
+eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1)
+ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1)
+ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1)
+ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1)
+ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1)
 # 0x0f 0xf0-0xff
-f0: lddqu Vdq,Mdq (F2),(VEX)
-f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
-f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
-f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
-f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
-f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
-f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
-f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
-f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
-f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
-fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
-fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
-fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
-fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
-fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
+f0: vlddqu Vx,Mx (F2)
+f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1)
+f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1)
+f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1)
+f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1)
+f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1)
+f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1)
+f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1)
+f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1)
+f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1)
+fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1)
+fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
+fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
+fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
+fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
 ff:
 EndTable
 
@@ -580,155 +597,193 @@ Table: 3-byte opcode 1 (0x0f 0x38)
 Referrer: 3-byte escape 1
 AVXcode: 2
 # 0x0f 0x38 0x00-0x0f
-00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
-01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
-02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
-03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
-04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
-05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
-06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
-07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
-08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
-09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
-0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
-0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
-0c: Vpermilps /r (66),(oVEX)
-0d: Vpermilpd /r (66),(oVEX)
-0e: vtestps /r (66),(oVEX)
-0f: vtestpd /r (66),(oVEX)
+00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1)
+01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1)
+02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1)
+03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1)
+04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1)
+05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1)
+06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1)
+07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1)
+08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1)
+09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1)
+0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1)
+0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1)
+0c: vpermilps Vx,Hx,Wx (66),(v)
+0d: vpermilpd Vx,Hx,Wx (66),(v)
+0e: vtestps Vx,Wx (66),(v)
+0f: vtestpd Vx,Wx (66),(v)
 # 0x0f 0x38 0x10-0x1f
 10: pblendvb Vdq,Wdq (66)
 11:
 12:
-13:
+13: vcvtph2ps Vx,Wx,Ib (66),(v)
 14: blendvps Vdq,Wdq (66)
 15: blendvpd Vdq,Wdq (66)
-16:
-17: ptest Vdq,Wdq (66),(VEX)
-18: vbroadcastss /r (66),(oVEX)
-19: vbroadcastsd /r (66),(oVEX),(o256)
-1a: vbroadcastf128 /r (66),(oVEX),(o256)
+16: vpermps Vqq,Hqq,Wqq (66),(v)
+17: vptest Vx,Wx (66)
+18: vbroadcastss Vx,Wd (66),(v)
+19: vbroadcastsd Vqq,Wq (66),(v)
+1a: vbroadcastf128 Vqq,Mdq (66),(v)
 1b:
-1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
-1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
-1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
+1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1)
+1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1)
+1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1)
 1f:
 # 0x0f 0x38 0x20-0x2f
-20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
-21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
-22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
-23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
-24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
-25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
+20: vpmovsxbw Vx,Ux/Mq (66),(v1)
+21: vpmovsxbd Vx,Ux/Md (66),(v1)
+22: vpmovsxbq Vx,Ux/Mw (66),(v1)
+23: vpmovsxwd Vx,Ux/Mq (66),(v1)
+24: vpmovsxwq Vx,Ux/Md (66),(v1)
+25: vpmovsxdq Vx,Ux/Mq (66),(v1)
 26:
 27:
-28: pmuldq Vdq,Wdq (66),(VEX),(o128)
-29: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
-2a: movntdqa Vdq,Mdq (66),(VEX),(o128)
-2b: packusdw Vdq,Wdq (66),(VEX),(o128)
-2c: vmaskmovps(ld) /r (66),(oVEX)
-2d: vmaskmovpd(ld) /r (66),(oVEX)
-2e: vmaskmovps(st) /r (66),(oVEX)
-2f: vmaskmovpd(st) /r (66),(oVEX)
+28: vpmuldq Vx,Hx,Wx (66),(v1)
+29: vpcmpeqq Vx,Hx,Wx (66),(v1)
+2a: vmovntdqa Vx,Mx (66),(v1)
+2b: vpackusdw Vx,Hx,Wx (66),(v1)
+2c: vmaskmovps Vx,Hx,Mx (66),(v)
+2d: vmaskmovpd Vx,Hx,Mx (66),(v)
+2e: vmaskmovps Mx,Hx,Vx (66),(v)
+2f: vmaskmovpd Mx,Hx,Vx (66),(v)
 # 0x0f 0x38 0x30-0x3f
-30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
-31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
-32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
-33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
-34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
-35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
-36:
-37: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
-38: pminsb Vdq,Wdq (66),(VEX),(o128)
-39: pminsd Vdq,Wdq (66),(VEX),(o128)
-3a: pminuw Vdq,Wdq (66),(VEX),(o128)
-3b: pminud Vdq,Wdq (66),(VEX),(o128)
-3c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
-3d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
-3e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
-3f: pmaxud Vdq,Wdq (66),(VEX),(o128)
+30: vpmovzxbw Vx,Ux/Mq (66),(v1)
+31: vpmovzxbd Vx,Ux/Md (66),(v1)
+32: vpmovzxbq Vx,Ux/Mw (66),(v1)
+33: vpmovzxwd Vx,Ux/Mq (66),(v1)
+34: vpmovzxwq Vx,Ux/Md (66),(v1)
+35: vpmovzxdq Vx,Ux/Mq (66),(v1)
+36: vpermd Vqq,Hqq,Wqq (66),(v)
+37: vpcmpgtq Vx,Hx,Wx (66),(v1)
+38: vpminsb Vx,Hx,Wx (66),(v1)
+39: vpminsd Vx,Hx,Wx (66),(v1)
+3a: vpminuw Vx,Hx,Wx (66),(v1)
+3b: vpminud Vx,Hx,Wx (66),(v1)
+3c: vpmaxsb Vx,Hx,Wx (66),(v1)
+3d: vpmaxsd Vx,Hx,Wx (66),(v1)
+3e: vpmaxuw Vx,Hx,Wx (66),(v1)
+3f: vpmaxud Vx,Hx,Wx (66),(v1)
 # 0x0f 0x38 0x40-0x8f
-40: pmulld Vdq,Wdq (66),(VEX),(o128)
-41: phminposuw Vdq,Wdq (66),(VEX),(o128)
-80: INVEPT Gd/q,Mdq (66)
-81: INVPID Gd/q,Mdq (66)
+40: vpmulld Vx,Hx,Wx (66),(v1)
+41: vphminposuw Vdq,Wdq (66),(v1)
+42:
+43:
+44:
+45: vpsrlvd/q Vx,Hx,Wx (66),(v)
+46: vpsravd Vx,Hx,Wx (66),(v)
+47: vpsllvd/q Vx,Hx,Wx (66),(v)
+# Skip 0x48-0x57
+58: vpbroadcastd Vx,Wx (66),(v)
+59: vpbroadcastq Vx,Wx (66),(v)
+5a: vbroadcasti128 Vqq,Mdq (66),(v)
+# Skip 0x5b-0x77
+78: vpbroadcastb Vx,Wx (66),(v)
+79: vpbroadcastw Vx,Wx (66),(v)
+# Skip 0x7a-0x7f
+80: INVEPT Gy,Mdq (66)
+81: INVPID Gy,Mdq (66)
+82: INVPCID Gy,Mdq (66)
+8c: vpmaskmovd/q Vx,Hx,Mx (66),(v)
+8e: vpmaskmovd/q Mx,Vx,Hx (66),(v)
 # 0x0f 0x38 0x90-0xbf (FMA)
-96: vfmaddsub132pd/ps /r (66),(VEX)
-97: vfmsubadd132pd/ps /r (66),(VEX)
-98: vfmadd132pd/ps /r (66),(VEX)
-99: vfmadd132sd/ss /r (66),(VEX),(o128)
-9a: vfmsub132pd/ps /r (66),(VEX)
-9b: vfmsub132sd/ss /r (66),(VEX),(o128)
-9c: vfnmadd132pd/ps /r (66),(VEX)
-9d: vfnmadd132sd/ss /r (66),(VEX),(o128)
-9e: vfnmsub132pd/ps /r (66),(VEX)
-9f: vfnmsub132sd/ss /r (66),(VEX),(o128)
-a6: vfmaddsub213pd/ps /r (66),(VEX)
-a7: vfmsubadd213pd/ps /r (66),(VEX)
-a8: vfmadd213pd/ps /r (66),(VEX)
-a9: vfmadd213sd/ss /r (66),(VEX),(o128)
-aa: vfmsub213pd/ps /r (66),(VEX)
-ab: vfmsub213sd/ss /r (66),(VEX),(o128)
-ac: vfnmadd213pd/ps /r (66),(VEX)
-ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
-ae: vfnmsub213pd/ps /r (66),(VEX)
-af: vfnmsub213sd/ss /r (66),(VEX),(o128)
-b6: vfmaddsub231pd/ps /r (66),(VEX)
-b7: vfmsubadd231pd/ps /r (66),(VEX)
-b8: vfmadd231pd/ps /r (66),(VEX)
-b9: vfmadd231sd/ss /r (66),(VEX),(o128)
-ba: vfmsub231pd/ps /r (66),(VEX)
-bb: vfmsub231sd/ss /r (66),(VEX),(o128)
-bc: vfnmadd231pd/ps /r (66),(VEX)
-bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
-be: vfnmsub231pd/ps /r (66),(VEX)
-bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
+90: vgatherdd/q Vx,Hx,Wx (66),(v)
+91: vgatherqd/q Vx,Hx,Wx (66),(v)
+92: vgatherdps/d Vx,Hx,Wx (66),(v)
+93: vgatherqps/d Vx,Hx,Wx (66),(v)
+94:
+95:
+96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v)
+97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v)
+98: vfmadd132ps/d Vx,Hx,Wx (66),(v)
+99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9a: vfmsub132ps/d Vx,Hx,Wx (66),(v)
+9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v)
+9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v)
+9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v)
+a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v)
+a8: vfmadd213ps/d Vx,Hx,Wx (66),(v)
+a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+aa: vfmsub213ps/d Vx,Hx,Wx (66),(v)
+ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
+ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
+af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
+b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
+b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
+b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+ba: vfmsub231ps/d Vx,Hx,Wx (66),(v)
+bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
+bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v)
+bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
+bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
 # 0x0f 0x38 0xc0-0xff
-db: aesimc Vdq,Wdq (66),(VEX),(o128)
-dc: aesenc Vdq,Wdq (66),(VEX),(o128)
-dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
-de: aesdec Vdq,Wdq (66),(VEX),(o128)
-df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
-f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
-f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
+db: VAESIMC Vdq,Wdq (66),(v1)
+dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
+dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
+de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
+df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
+f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
+f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
+f2: ANDN Gy,By,Ey (v)
+f3: Grp17 (1A)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
+f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
+f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
 EndTable
 
 Table: 3-byte opcode 2 (0x0f 0x3a)
 Referrer: 3-byte escape 2
 AVXcode: 3
 # 0x0f 0x3a 0x00-0xff
-04: vpermilps /r,Ib (66),(oVEX)
-05: vpermilpd /r,Ib (66),(oVEX)
-06: vperm2f128 /r,Ib (66),(oVEX),(o256)
-08: roundps Vdq,Wdq,Ib (66),(VEX)
-09: roundpd Vdq,Wdq,Ib (66),(VEX)
-0a: roundss Vss,Wss,Ib (66),(VEX),(o128)
-0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
-0c: blendps Vdq,Wdq,Ib (66),(VEX)
-0d: blendpd Vdq,Wdq,Ib (66),(VEX)
-0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
-0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
-14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
-15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
-16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
-17: extractps Ed,Vdq,Ib (66),(VEX),(o128)
-18: vinsertf128 /r,Ib (66),(oVEX),(o256)
-19: vextractf128 /r,Ib (66),(oVEX),(o256)
-20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
-21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
-22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
-40: dpps Vdq,Wdq,Ib (66),(VEX)
-41: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
-42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
-44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
-4a: vblendvps /r,Ib (66),(oVEX)
-4b: vblendvpd /r,Ib (66),(oVEX)
-4c: vpblendvb /r,Ib (66),(oVEX),(o128)
-60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
-61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
-62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
-63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
-df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
+00: vpermq Vqq,Wqq,Ib (66),(v)
+01: vpermpd Vqq,Wqq,Ib (66),(v)
+02: vpblendd Vx,Hx,Wx,Ib (66),(v)
+03:
+04: vpermilps Vx,Wx,Ib (66),(v)
+05: vpermilpd Vx,Wx,Ib (66),(v)
+06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
+07:
+08: vroundps Vx,Wx,Ib (66)
+09: vroundpd Vx,Wx,Ib (66)
+0a: vroundss Vss,Wss,Ib (66),(v1)
+0b: vroundsd Vsd,Wsd,Ib (66),(v1)
+0c: vblendps Vx,Hx,Wx,Ib (66)
+0d: vblendpd Vx,Hx,Wx,Ib (66)
+0e: vpblendw Vx,Hx,Wx,Ib (66),(v1)
+0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1)
+14: vpextrb Rd/Mb,Vdq,Ib (66),(v1)
+15: vpextrw Rd/Mw,Vdq,Ib (66),(v1)
+16: vpextrd/q Ey,Vdq,Ib (66),(v1)
+17: vextractps Ed,Vdq,Ib (66),(v1)
+18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v)
+19: vextractf128 Wdq,Vqq,Ib (66),(v)
+1d: vcvtps2ph Wx,Vx,Ib (66),(v)
+20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1)
+21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1)
+22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
+38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v)
+39: vextracti128 Wdq,Vqq,Ib (66),(v)
+40: vdpps Vx,Hx,Wx,Ib (66)
+41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1)
+42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1)
+44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1)
+46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v)
+4a: vblendvps Vx,Hx,Wx,Lx (66),(v)
+4b: vblendvpd Vx,Hx,Wx,Lx (66),(v)
+4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1)
+60: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
+61: vpcmpestri Vdq,Wdq,Ib (66),(v1)
+62: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
+63: vpcmpistri Vdq,Wdq,Ib (66),(v1)
+df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
+f0: RORX Gy,Ey,Ib (F2),(v)
 EndTable
 
 GrpTable: Grp1
@@ -790,7 +845,7 @@ GrpTable: Grp5
 2: CALLN Ev (f64)
 3: CALLF Ep
 4: JMPN Ev (f64)
-5: JMPF Ep
+5: JMPF Mp
 6: PUSH Ev (d64)
 7:
 EndTable
@@ -806,8 +861,8 @@ EndTable
 
 GrpTable: Grp7
 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
-2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
 3: LIDT Ms
 4: SMSW Mw/Rv
 5:
@@ -824,44 +879,51 @@ EndTable
 
 GrpTable: Grp9
 1: CMPXCHG8B/16B Mq/Mdq
-6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
-7: VMPTRST Mq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
+7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B)
 EndTable
 
 GrpTable: Grp10
 EndTable
 
-GrpTable: Grp11
-0: MOV
+# Grp11A and Grp11B are expressed as Grp11 in Intel SDM
+GrpTable: Grp11A
+0: MOV Eb,Ib
+7: XABORT Ib (000),(11B)
+EndTable
+
+GrpTable: Grp11B
+0: MOV Eb,Iz
+7: XBEGIN Jz (000),(11B)
 EndTable
 
 GrpTable: Grp12
-2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
-4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
-6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
+2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1)
+4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1)
+6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1)
 EndTable
 
 GrpTable: Grp13
-2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
-4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
-6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
+2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1)
+4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1)
+6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1)
 EndTable
 
 GrpTable: Grp14
-2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
-3: psrldq Udq,Ib (66),(11B),(VEX),(o128)
-6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
-7: pslldq Udq,Ib (66),(11B),(VEX),(o128)
+2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1)
+3: vpsrldq Hx,Ux,Ib (66),(11B),(v1)
+6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1)
+7: vpslldq Hx,Ux,Ib (66),(11B),(v1)
 EndTable
 
 GrpTable: Grp15
-0: fxsave
-1: fxstor
-2: ldmxcsr (VEX)
-3: stmxcsr (VEX)
+0: fxsave | RDFSBASE Ry (F3),(11B)
+1: fxstor | RDGSBASE Ry (F3),(11B)
+2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
+3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
 4: XSAVE
 5: XRSTOR | lfence (11B)
-6: mfence (11B)
+6: XSAVEOPT | mfence (11B)
 7: clflush | sfence (11B)
 EndTable
 
@@ -872,6 +934,12 @@ GrpTable: Grp16
 3: prefetch T2
 EndTable
 
+GrpTable: Grp17
+1: BLSR By,Ey (v)
+2: BLSMSK By,Ey (v)
+3: BLSI By,Ey (v)
+EndTable
+
 # AMD's Prefetch Group
 GrpTable: GrpP
 0: PREFETCH