aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--arch/x86/lib/.gitignore1
-rw-r--r--arch/x86/lib/Makefile41
-rw-r--r--arch/x86/lib/atomic64_32.c4
-rw-r--r--arch/x86/lib/atomic64_386_32.S194
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S215
-rw-r--r--arch/x86/lib/bitops_32.c70
-rw-r--r--arch/x86/lib/bitops_64.c175
-rw-r--r--arch/x86/lib/cache-smp.c19
-rw-r--r--arch/x86/lib/checksum_32.S74
-rw-r--r--arch/x86/lib/clear_page_64.S38
-rw-r--r--arch/x86/lib/cmdline.c84
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S65
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S57
-rw-r--r--arch/x86/lib/copy_page_64.S137
-rw-r--r--arch/x86/lib/copy_user_64.S490
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S286
-rw-r--r--arch/x86/lib/csum-copy_64.S246
-rw-r--r--arch/x86/lib/csum-partial_64.c4
-rw-r--r--arch/x86/lib/csum-wrappers_64.c14
-rw-r--r--arch/x86/lib/delay.c (renamed from arch/x86/lib/delay_32.c)81
-rw-r--r--arch/x86/lib/delay_64.c63
-rw-r--r--arch/x86/lib/getuser.S140
-rw-r--r--arch/x86/lib/getuser_32.S78
-rw-r--r--arch/x86/lib/getuser_64.S109
-rw-r--r--arch/x86/lib/hash.c92
-rw-r--r--arch/x86/lib/inat.c97
-rw-r--r--arch/x86/lib/insn.c580
-rw-r--r--arch/x86/lib/io_64.c25
-rw-r--r--arch/x86/lib/memcpy_32.c205
-rw-r--r--arch/x86/lib/memcpy_64.S283
-rw-r--r--arch/x86/lib/memmove_64.S223
-rw-r--r--arch/x86/lib/memmove_64.c21
-rw-r--r--arch/x86/lib/memset_64.S99
-rw-r--r--arch/x86/lib/misc.c21
-rw-r--r--arch/x86/lib/mmx_32.c197
-rw-r--r--arch/x86/lib/msr-on-cpu.c101
-rw-r--r--arch/x86/lib/msr-reg-export.c5
-rw-r--r--arch/x86/lib/msr-reg.S102
-rw-r--r--arch/x86/lib/msr-smp.c266
-rw-r--r--arch/x86/lib/msr.c110
-rw-r--r--arch/x86/lib/putuser.S (renamed from arch/x86/lib/putuser_32.S)81
-rw-r--r--arch/x86/lib/putuser_64.S106
-rw-r--r--arch/x86/lib/rwlock.S44
-rw-r--r--arch/x86/lib/rwlock_64.S38
-rw-r--r--arch/x86/lib/rwsem.S136
-rw-r--r--arch/x86/lib/semaphore_32.S219
-rw-r--r--arch/x86/lib/string_32.c94
-rw-r--r--arch/x86/lib/strstr_32.c10
-rw-r--r--arch/x86/lib/thunk_32.S30
-rw-r--r--arch/x86/lib/thunk_64.S65
-rw-r--r--arch/x86/lib/usercopy.c36
-rw-r--r--arch/x86/lib/usercopy_32.c537
-rw-r--r--arch/x86/lib/usercopy_64.c127
-rw-r--r--arch/x86/lib/x86-opcode-map.txt961
54 files changed, 4965 insertions, 2631 deletions
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 00000000000..8df89f0a3fe
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1 @@
+inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 25df1c1989f..4d4f96a2763 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,26 +2,45 @@
# Makefile for x86 specific library files.
#
-obj-$(CONFIG_SMP) := msr-on-cpu.o
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN $@
+ cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@
-lib-y := delay_$(BITS).o
-lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+ $(call cmd,inat_tables)
+
+$(obj)/inat.o: $(obj)/inat-tables.c
+
+clean-files := inat-tables.c
+
+obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
+
+lib-y := delay.o misc.o cmdline.o
+lib-y += thunk_$(BITS).o
+lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
+lib-$(CONFIG_SMP) += rwlock.o
+lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+
+obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
ifeq ($(CONFIG_X86_32),y)
+ obj-y += atomic64_32.o
+ lib-y += atomic64_cx8_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
- lib-y += bitops_32.o semaphore_32.o string_32.o
-
+ lib-y += string_32.o
+ifneq ($(CONFIG_X86_CMPXCHG64),y)
+ lib-y += cmpxchg8b_emu.o atomic64_386_32.o
+endif
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
- obj-y += io_64.o iomap_copy_64.o
-
- CFLAGS_csum-partial_64.o := -funroll-loops
-
+ obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
- lib-y += bitops_64.o
lib-y += memmove_64.o memset_64.o
- lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+ lib-y += copy_user_64.o copy_user_nocache_64.o
+ lib-y += cmpxchg16b_emu.o
endif
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
new file mode 100644
index 00000000000..a0b4a350daa
--- /dev/null
+++ b/arch/x86/lib/atomic64_32.c
@@ -0,0 +1,4 @@
+#define ATOMIC64_EXPORT EXPORT_SYMBOL
+
+#include <linux/export.h>
+#include <linux/atomic.h>
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 00000000000..00933d5e992
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,194 @@
+/*
+ * atomic64_t for 386/486
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+/* if you want SMP support, implement these with real spinlocks */
+.macro LOCK reg
+ pushfl_cfi
+ cli
+.endm
+
+.macro UNLOCK reg
+ popfl_cfi
+.endm
+
+#define BEGIN(op) \
+.macro endp; \
+ CFI_ENDPROC; \
+ENDPROC(atomic64_##op##_386); \
+.purgem endp; \
+.endm; \
+ENTRY(atomic64_##op##_386); \
+ CFI_STARTPROC; \
+ LOCK v;
+
+#define ENDP endp
+
+#define RET \
+ UNLOCK v; \
+ ret
+
+#define RET_ENDP \
+ RET; \
+ ENDP
+
+#define v %ecx
+BEGIN(read)
+ movl (v), %eax
+ movl 4(v), %edx
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(set)
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(xchg)
+ movl (v), %eax
+ movl 4(v), %edx
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add)
+ addl %eax, (v)
+ adcl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add_return)
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub)
+ subl %eax, (v)
+ sbbl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub_return)
+ negl %edx
+ negl %eax
+ sbbl $0, %edx
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc)
+ addl $1, (v)
+ adcl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc_return)
+ movl (v), %eax
+ movl 4(v), %edx
+ addl $1, %eax
+ adcl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec)
+ subl $1, (v)
+ sbbl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec_return)
+ movl (v), %eax
+ movl 4(v), %edx
+ subl $1, %eax
+ sbbl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(add_unless)
+ addl %eax, %ecx
+ adcl %edx, %edi
+ addl (v), %eax
+ adcl 4(v), %edx
+ cmpl %eax, %ecx
+ je 3f
+1:
+ movl %eax, (v)
+ movl %edx, 4(v)
+ movl $1, %eax
+2:
+ RET
+3:
+ cmpl %edx, %edi
+ jne 1b
+ xorl %eax, %eax
+ jmp 2b
+ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc_not_zero)
+ movl (v), %eax
+ movl 4(v), %edx
+ testl %eax, %eax
+ je 3f
+1:
+ addl $1, %eax
+ adcl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+ movl $1, %eax
+2:
+ RET
+3:
+ testl %edx, %edx
+ jne 1b
+ jmp 2b
+ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec_if_positive)
+ movl (v), %eax
+ movl 4(v), %edx
+ subl $1, %eax
+ sbbl $0, %edx
+ js 1f
+ movl %eax, (v)
+ movl %edx, 4(v)
+1:
+RET_ENDP
+#undef v
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 00000000000..f5cc9eb1d51
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,215 @@
+/*
+ * atomic64_t for 586+
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+.macro SAVE reg
+ pushl_cfi %\reg
+ CFI_REL_OFFSET \reg, 0
+.endm
+
+.macro RESTORE reg
+ popl_cfi %\reg
+ CFI_RESTORE \reg
+.endm
+
+.macro read64 reg
+ movl %ebx, %eax
+ movl %ecx, %edx
+/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
+ LOCK_PREFIX
+ cmpxchg8b (\reg)
+.endm
+
+ENTRY(atomic64_read_cx8)
+ CFI_STARTPROC
+
+ read64 %ecx
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_read_cx8)
+
+ENTRY(atomic64_set_cx8)
+ CFI_STARTPROC
+
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes
+ * are atomic on 586 and newer */
+ cmpxchg8b (%esi)
+ jne 1b
+
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_set_cx8)
+
+ENTRY(atomic64_xchg_cx8)
+ CFI_STARTPROC
+
+1:
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_xchg_cx8)
+
+.macro addsub_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+ CFI_STARTPROC
+ SAVE ebp
+ SAVE ebx
+ SAVE esi
+ SAVE edi
+
+ movl %eax, %esi
+ movl %edx, %edi
+ movl %ecx, %ebp
+
+ read64 %ecx
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l %esi, %ebx
+ \insc\()l %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%ebp)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE edi
+ RESTORE esi
+ RESTORE ebx
+ RESTORE ebp
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+addsub_return add add adc
+addsub_return sub sub sbb
+
+.macro incdec_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64 %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l $1, %ebx
+ \insc\()l $0, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE ebx
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+incdec_return inc add adc
+incdec_return dec sub sbb
+
+ENTRY(atomic64_dec_if_positive_cx8)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64 %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ subl $1, %ebx
+ sbb $0, %ecx
+ js 2f
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+2:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE ebx
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_dec_if_positive_cx8)
+
+ENTRY(atomic64_add_unless_cx8)
+ CFI_STARTPROC
+ SAVE ebp
+ SAVE ebx
+/* these just push these two parameters on the stack */
+ SAVE edi
+ SAVE ecx
+
+ movl %eax, %ebp
+ movl %edx, %edi
+
+ read64 %esi
+1:
+ cmpl %eax, 0(%esp)
+ je 4f
+2:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ addl %ebp, %ebx
+ adcl %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ movl $1, %eax
+3:
+ addl $8, %esp
+ CFI_ADJUST_CFA_OFFSET -8
+ RESTORE ebx
+ RESTORE ebp
+ ret
+4:
+ cmpl %edx, 4(%esp)
+ jne 2b
+ xorl %eax, %eax
+ jmp 3b
+ CFI_ENDPROC
+ENDPROC(atomic64_add_unless_cx8)
+
+ENTRY(atomic64_inc_not_zero_cx8)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64 %esi
+1:
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jz 3f
+ movl %eax, %ebx
+ xorl %ecx, %ecx
+ addl $1, %ebx
+ adcl %edx, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ movl $1, %eax
+3:
+ RESTORE ebx
+ ret
+ CFI_ENDPROC
+ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
deleted file mode 100644
index b6544045985..00000000000
--- a/arch/x86/lib/bitops_32.c
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/bitops.h>
-#include <linux/module.h>
-
-/**
- * find_next_bit - find the next set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-int find_next_bit(const unsigned long *addr, int size, int offset)
-{
- const unsigned long *p = addr + (offset >> 5);
- int set = 0, bit = offset & 31, res;
-
- if (bit) {
- /*
- * Look for nonzero in the first 32 bits:
- */
- __asm__("bsfl %1,%0\n\t"
- "jne 1f\n\t"
- "movl $32, %0\n"
- "1:"
- : "=r" (set)
- : "r" (*p >> bit));
- if (set < (32 - bit))
- return set + offset;
- set = 32 - bit;
- p++;
- }
- /*
- * No set bit yet, search remaining full words for a bit
- */
- res = find_first_bit (p, size - 32 * (p - addr));
- return (offset + set + res);
-}
-EXPORT_SYMBOL(find_next_bit);
-
-/**
- * find_next_zero_bit - find the first zero bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-int find_next_zero_bit(const unsigned long *addr, int size, int offset)
-{
- const unsigned long *p = addr + (offset >> 5);
- int set = 0, bit = offset & 31, res;
-
- if (bit) {
- /*
- * Look for zero in the first 32 bits.
- */
- __asm__("bsfl %1,%0\n\t"
- "jne 1f\n\t"
- "movl $32, %0\n"
- "1:"
- : "=r" (set)
- : "r" (~(*p >> bit)));
- if (set < (32 - bit))
- return set + offset;
- set = 32 - bit;
- p++;
- }
- /*
- * No zero yet, search remaining full bytes for a zero
- */
- res = find_first_zero_bit(p, size - 32 * (p - addr));
- return (offset + set + res);
-}
-EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
deleted file mode 100644
index 0e8f491e6cc..00000000000
--- a/arch/x86/lib/bitops_64.c
+++ /dev/null
@@ -1,175 +0,0 @@
-#include <linux/bitops.h>
-
-#undef find_first_zero_bit
-#undef find_next_zero_bit
-#undef find_first_bit
-#undef find_next_bit
-
-static inline long
-__find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
- long d0, d1, d2;
- long res;
-
- /*
- * We must test the size in words, not in bits, because
- * otherwise incoming sizes in the range -63..-1 will not run
- * any scasq instructions, and then the flags used by the je
- * instruction will have whatever random value was in place
- * before. Nobody should call us like that, but
- * find_next_zero_bit() does when offset and size are at the
- * same word and it fails to find a zero itself.
- */
- size += 63;
- size >>= 6;
- if (!size)
- return 0;
- asm volatile(
- " repe; scasq\n"
- " je 1f\n"
- " xorq -8(%%rdi),%%rax\n"
- " subq $8,%%rdi\n"
- " bsfq %%rax,%%rdx\n"
- "1: subq %[addr],%%rdi\n"
- " shlq $3,%%rdi\n"
- " addq %%rdi,%%rdx"
- :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
- :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
- [addr] "S" (addr) : "memory");
- /*
- * Any register would do for [addr] above, but GCC tends to
- * prefer rbx over rsi, even though rsi is readily available
- * and doesn't have to be saved.
- */
- return res;
-}
-
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-long find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
- return __find_first_zero_bit (addr, size);
-}
-
-/**
- * find_next_zero_bit - find the next zero bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-long find_next_zero_bit (const unsigned long * addr, long size, long offset)
-{
- const unsigned long * p = addr + (offset >> 6);
- unsigned long set = 0;
- unsigned long res, bit = offset&63;
-
- if (bit) {
- /*
- * Look for zero in first word
- */
- asm("bsfq %1,%0\n\t"
- "cmoveq %2,%0"
- : "=r" (set)
- : "r" (~(*p >> bit)), "r"(64L));
- if (set < (64 - bit))
- return set + offset;
- set = 64 - bit;
- p++;
- }
- /*
- * No zero yet, search remaining full words for a zero
- */
- res = __find_first_zero_bit (p, size - 64 * (p - addr));
-
- return (offset + set + res);
-}
-
-static inline long
-__find_first_bit(const unsigned long * addr, unsigned long size)
-{
- long d0, d1;
- long res;
-
- /*
- * We must test the size in words, not in bits, because
- * otherwise incoming sizes in the range -63..-1 will not run
- * any scasq instructions, and then the flags used by the jz
- * instruction will have whatever random value was in place
- * before. Nobody should call us like that, but
- * find_next_bit() does when offset and size are at the same
- * word and it fails to find a one itself.
- */
- size += 63;
- size >>= 6;
- if (!size)
- return 0;
- asm volatile(
- " repe; scasq\n"
- " jz 1f\n"
- " subq $8,%%rdi\n"
- " bsfq (%%rdi),%%rax\n"
- "1: subq %[addr],%%rdi\n"
- " shlq $3,%%rdi\n"
- " addq %%rdi,%%rax"
- :"=a" (res), "=&c" (d0), "=&D" (d1)
- :"0" (0ULL), "1" (size), "2" (addr),
- [addr] "r" (addr) : "memory");
- return res;
-}
-
-/**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first set bit, not the number of the byte
- * containing a bit.
- */
-long find_first_bit(const unsigned long * addr, unsigned long size)
-{
- return __find_first_bit(addr,size);
-}
-
-/**
- * find_next_bit - find the first set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-long find_next_bit(const unsigned long * addr, long size, long offset)
-{
- const unsigned long * p = addr + (offset >> 6);
- unsigned long set = 0, bit = offset & 63, res;
-
- if (bit) {
- /*
- * Look for nonzero in the first 64 bits:
- */
- asm("bsfq %1,%0\n\t"
- "cmoveq %2,%0\n\t"
- : "=r" (set)
- : "r" (*p >> bit), "r" (64L));
- if (set < (64 - bit))
- return set + offset;
- set = 64 - bit;
- p++;
- }
- /*
- * No set bit yet, search remaining full words for a bit
- */
- res = __find_first_bit (p, size - 64 * (p - addr));
- return (offset + set + res);
-}
-
-#include <linux/module.h>
-
-EXPORT_SYMBOL(find_next_bit);
-EXPORT_SYMBOL(find_first_bit);
-EXPORT_SYMBOL(find_first_zero_bit);
-EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 00000000000..a3c66887503
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,19 @@
+#include <linux/smp.h>
+#include <linux/module.h>
+
+static void __wbinvd(void *dummy)
+{
+ wbinvd();
+}
+
+void wbinvd_on_cpu(int cpu)
+{
+ smp_call_function_single(cpu, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_cpu);
+
+int wbinvd_on_all_cpus(void)
+{
+ return on_each_cpu(__wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb7..e78b8eee661 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -28,6 +28,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/errno.h>
+#include <asm/asm.h>
/*
* computes a partial checksum, e.g. for TCP/UDP fragments
@@ -50,11 +51,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
ENTRY(csum_partial)
CFI_STARTPROC
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
@@ -62,7 +61,7 @@ ENTRY(csum_partial)
testl $3, %esi # Check alignment.
jz 2f # Jump if alignment is ok.
testl $1, %esi # Check alignment.
- jz 10f # Jump if alignment is boundary of 2bytes.
+ jz 10f # Jump if alignment is boundary of 2 bytes.
# buf is odd
dec %ecx
@@ -132,11 +131,9 @@ ENTRY(csum_partial)
jz 8f
roll $8, %eax
8:
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebx
CFI_RESTORE ebx
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %esi
CFI_RESTORE esi
ret
CFI_ENDPROC
@@ -148,11 +145,9 @@ ENDPROC(csum_partial)
ENTRY(csum_partial)
CFI_STARTPROC
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
@@ -260,11 +255,9 @@ ENTRY(csum_partial)
jz 90f
roll $8, %eax
90:
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebx
CFI_RESTORE ebx
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %esi
CFI_RESTORE esi
ret
CFI_ENDPROC
@@ -290,15 +283,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
#define SRC(y...) \
9999: y; \
- .section __ex_table, "a"; \
- .long 9999b, 6001f ; \
- .previous
+ _ASM_EXTABLE(9999b, 6001f)
#define DST(y...) \
9999: y; \
- .section __ex_table, "a"; \
- .long 9999b, 6002f ; \
- .previous
+ _ASM_EXTABLE(9999b, 6002f)
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
@@ -309,14 +298,11 @@ ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
subl $4,%esp
CFI_ADJUST_CFA_OFFSET 4
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edi
CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
@@ -426,17 +412,13 @@ DST( movb %cl, (%edi) )
.previous
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebx
CFI_RESTORE ebx
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %esi
CFI_RESTORE esi
- popl %edi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %edi
CFI_RESTORE edi
- popl %ecx # equivalent to addl $4,%esp
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ecx # equivalent to addl $4,%esp
ret
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
@@ -459,14 +441,11 @@ ENDPROC(csum_partial_copy_generic)
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edi
CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
@@ -527,14 +506,11 @@ DST( movb %dl, (%edi) )
jmp 7b
.previous
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %esi
CFI_RESTORE esi
- popl %edi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %edi
CFI_RESTORE edi
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebx
CFI_RESTORE ebx
ret
CFI_ENDPROC
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 9a10a78bb4a..f2145cfa12a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,19 +1,28 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* Zero a page.
* rdi page
*/
- ALIGN
-clear_page_c:
+ENTRY(clear_page_c)
CFI_STARTPROC
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
CFI_ENDPROC
-ENDPROC(clear_page)
+ENDPROC(clear_page_c)
+
+ENTRY(clear_page_c_e)
+ CFI_STARTPROC
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c_e)
ENTRY(clear_page)
CFI_STARTPROC
@@ -39,21 +48,26 @@ ENTRY(clear_page)
.Lclear_page_end:
ENDPROC(clear_page)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use this when possible.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original function.
+ *
+ */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2:
+2: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
+3:
.previous
.section .altinstructions,"a"
- .align 8
- .quad clear_page
- .quad 1b
- .byte X86_FEATURE_REP_GOOD
- .byte .Lclear_page_end - clear_page
- .byte 2b - 1b
+ altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+ .Lclear_page_end-clear_page, 2b-1b
+ altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
+ .Lclear_page_end-clear_page,3b-2b
.previous
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
new file mode 100644
index 00000000000..422db000d72
--- /dev/null
+++ b/arch/x86/lib/cmdline.c
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * Misc librarized functions for cmdline poking.
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <asm/setup.h>
+
+static inline int myisspace(u8 c)
+{
+ return c <= ' '; /* Close enough approximation */
+}
+
+/**
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * @cmdline: the cmdline string
+ * @option: option string to look for
+ *
+ * Returns the position of that @option (starts counting with 1)
+ * or 0 on not found.
+ */
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+ char c;
+ int len, pos = 0, wstart = 0;
+ const char *opptr = NULL;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
+ if (!len)
+ return 0;
+
+ while (len--) {
+ c = *(char *)cmdline++;
+ pos++;
+
+ switch (state) {
+ case st_wordstart:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ wstart = pos;
+ /* fall through */
+
+ case st_wordcmp:
+ if (!*opptr)
+ if (!c || myisspace(c))
+ return wstart;
+ else
+ state = st_wordskip;
+ else if (!c)
+ return 0;
+ else if (c != *opptr++)
+ state = st_wordskip;
+ else if (!len) /* last word and is matching */
+ return wstart;
+ break;
+
+ case st_wordskip:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ state = st_wordstart;
+ break;
+ }
+ }
+
+ return 0; /* Buffer overrun */
+}
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 00000000000..1e572c507d0
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,65 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+#ifdef CONFIG_SMP
+#define SEG_PREFIX %gs:
+#else
+#define SEG_PREFIX
+#endif
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al : Operation successful
+ */
+ENTRY(this_cpu_cmpxchg16b_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
+# via the ZF. Caller will access %al to get result.
+#
+# Note that this is only useful for a cpuops operation. Meaning that we
+# do *not* have a fully atomic operation but just an operation that is
+# *atomic* on a single cpu (as provided by the this_cpu_xx class of
+# macros).
+#
+this_cpu_cmpxchg16b_emu:
+ pushf
+ cli
+
+ cmpq SEG_PREFIX(%rsi), %rax
+ jne not_same
+ cmpq SEG_PREFIX 8(%rsi), %rdx
+ jne not_same
+
+ movq %rbx, SEG_PREFIX(%rsi)
+ movq %rcx, SEG_PREFIX 8(%rsi)
+
+ popf
+ mov $1, %al
+ ret
+
+ not_same:
+ popf
+ xor %al,%al
+ ret
+
+CFI_ENDPROC
+
+ENDPROC(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
new file mode 100644
index 00000000000..828cb710dec
--- /dev/null
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -0,0 +1,57 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+
+.text
+
+/*
+ * Inputs:
+ * %esi : memory location to compare
+ * %eax : low 32 bits of old value
+ * %edx : high 32 bits of old value
+ * %ebx : low 32 bits of new value
+ * %ecx : high 32 bits of new value
+ */
+ENTRY(cmpxchg8b_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg8b (%esi)' on UP except we don't
+# set the whole ZF thing (caller will just compare
+# eax:edx with the expected value)
+#
+cmpxchg8b_emu:
+ pushfl
+ cli
+
+ cmpl (%esi), %eax
+ jne not_same
+ cmpl 4(%esi), %edx
+ jne half_same
+
+ movl %ebx, (%esi)
+ movl %ecx, 4(%esi)
+
+ popfl
+ ret
+
+ not_same:
+ movl (%esi), %eax
+ half_same:
+ movl 4(%esi), %edx
+
+ popfl
+ ret
+
+CFI_ENDPROC
+ENDPROC(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 727a5d46d2f..176cca67212 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,98 +2,93 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
ALIGN
-copy_page_c:
+copy_page_rep:
CFI_STARTPROC
- movl $4096/8,%ecx
- rep movsq
+ movl $4096/8, %ecx
+ rep movsq
ret
CFI_ENDPROC
-ENDPROC(copy_page_c)
+ENDPROC(copy_page_rep)
-/* Don't use streaming store because it's better when the target
- ends up in cache. */
-
-/* Could vary the prefetch distance based on SMP/UP */
+/*
+ * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
+ * Could vary the prefetch distance based on SMP/UP.
+*/
ENTRY(copy_page)
CFI_STARTPROC
- subq $3*8,%rsp
- CFI_ADJUST_CFA_OFFSET 3*8
- movq %rbx,(%rsp)
+ subq $2*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 2*8
+ movq %rbx, (%rsp)
CFI_REL_OFFSET rbx, 0
- movq %r12,1*8(%rsp)
+ movq %r12, 1*8(%rsp)
CFI_REL_OFFSET r12, 1*8
- movq %r13,2*8(%rsp)
- CFI_REL_OFFSET r13, 2*8
- movl $(4096/64)-5,%ecx
+ movl $(4096/64)-5, %ecx
.p2align 4
.Loop64:
- dec %rcx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
+ dec %rcx
+ movq 0x8*0(%rsi), %rax
+ movq 0x8*1(%rsi), %rbx
+ movq 0x8*2(%rsi), %rdx
+ movq 0x8*3(%rsi), %r8
+ movq 0x8*4(%rsi), %r9
+ movq 0x8*5(%rsi), %r10
+ movq 0x8*6(%rsi), %r11
+ movq 0x8*7(%rsi), %r12
prefetcht0 5*64(%rsi)
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
+ movq %rax, 0x8*0(%rdi)
+ movq %rbx, 0x8*1(%rdi)
+ movq %rdx, 0x8*2(%rdi)
+ movq %r8, 0x8*3(%rdi)
+ movq %r9, 0x8*4(%rdi)
+ movq %r10, 0x8*5(%rdi)
+ movq %r11, 0x8*6(%rdi)
+ movq %r12, 0x8*7(%rdi)
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
- jnz .Loop64
+ jnz .Loop64
- movl $5,%ecx
+ movl $5, %ecx
.p2align 4
.Loop2:
- decl %ecx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
-
- leaq 64(%rdi),%rdi
- leaq 64(%rsi),%rsi
-
+ decl %ecx
+
+ movq 0x8*0(%rsi), %rax
+ movq 0x8*1(%rsi), %rbx
+ movq 0x8*2(%rsi), %rdx
+ movq 0x8*3(%rsi), %r8
+ movq 0x8*4(%rsi), %r9
+ movq 0x8*5(%rsi), %r10
+ movq 0x8*6(%rsi), %r11
+ movq 0x8*7(%rsi), %r12
+
+ movq %rax, 0x8*0(%rdi)
+ movq %rbx, 0x8*1(%rdi)
+ movq %rdx, 0x8*2(%rdi)
+ movq %r8, 0x8*3(%rdi)
+ movq %r9, 0x8*4(%rdi)
+ movq %r10, 0x8*5(%rdi)
+ movq %r11, 0x8*6(%rdi)
+ movq %r12, 0x8*7(%rdi)
+
+ leaq 64(%rdi), %rdi
+ leaq 64(%rsi), %rsi
jnz .Loop2
- movq (%rsp),%rbx
+ movq (%rsp), %rbx
CFI_RESTORE rbx
- movq 1*8(%rsp),%r12
+ movq 1*8(%rsp), %r12
CFI_RESTORE r12
- movq 2*8(%rsp),%r13
- CFI_RESTORE r13
- addq $3*8,%rsp
- CFI_ADJUST_CFA_OFFSET -3*8
+ addq $2*8, %rsp
+ CFI_ADJUST_CFA_OFFSET -2*8
ret
.Lcopy_page_end:
CFI_ENDPROC
@@ -106,14 +101,10 @@ ENDPROC(copy_page)
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
- .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
+ .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
2:
.previous
.section .altinstructions,"a"
- .align 8
- .quad copy_page
- .quad 1b
- .byte X86_FEATURE_REP_GOOD
- .byte .Lcopy_page_end - copy_page
- .byte 2b - 1b
+ altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
+ .Lcopy_page_end-copy_page, 2b-1b
.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 70bebd31040..dee945d5559 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -1,8 +1,10 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs.
+/*
+ * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
+ * Copyright 2002 Andi Kleen, SuSE Labs.
* Subject to the GNU Public License v2.
- *
- * Functions to copy from and to user space.
- */
+ *
+ * Functions to copy from and to user space.
+ */
#include <linux/linkage.h>
#include <asm/dwarf2.h>
@@ -13,67 +15,94 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
- .macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+ .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
.byte 0xe9 /* 32bit jump */
.long \orig-1f /* by default jump to orig */
1:
.section .altinstr_replacement,"ax"
-2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt-1b /* offset */ /* or alternatively to alt */
+2: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt1-1b /* offset */ /* or alternatively to alt1 */
+3: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt2-1b /* offset */ /* or alternatively to alt2 */
.previous
+
.section .altinstructions,"a"
- .align 8
- .quad 0b
- .quad 2b
- .byte \feature /* when feature is set */
- .byte 5
- .byte 5
+ altinstruction_entry 0b,2b,\feature1,5,5
+ altinstruction_entry 0b,3b,\feature2,5,5
.previous
.endm
-/* Standard copy_to_user with segment limit checking */
-ENTRY(copy_to_user)
+ .macro ALIGN_DESTINATION
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jz 102f /* already aligned */
+ subl $8,%ecx
+ negl %ecx
+ subl %ecx,%edx
+100: movb (%rsi),%al
+101: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz 100b
+102:
+ .section .fixup,"ax"
+103: addl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ _ASM_EXTABLE(100b,103b)
+ _ASM_EXTABLE(101b,103b)
+#endif
+ .endm
+
+/* Standard copy_to_user with segment limit checking */
+ENTRY(_copy_to_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rdi,%rcx
addq %rdx,%rcx
- jc bad_to_user
- cmpq threadinfo_addr_limit(%rax),%rcx
- jae bad_to_user
- xorl %eax,%eax /* clear zero flag */
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ jc bad_to_user
+ cmpq TI_addr_limit(%rax),%rcx
+ ja bad_to_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
+ENDPROC(_copy_to_user)
-ENTRY(copy_user_generic)
- CFI_STARTPROC
- movl $1,%ecx /* set zero flag */
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-
-ENTRY(__copy_from_user_inatomic)
- CFI_STARTPROC
- xorl %ecx,%ecx /* clear zero flag */
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-
-/* Standard copy_from_user with segment limit checking */
-ENTRY(copy_from_user)
+/* Standard copy_from_user with segment limit checking */
+ENTRY(_copy_from_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rsi,%rcx
addq %rdx,%rcx
- jc bad_from_user
- cmpq threadinfo_addr_limit(%rax),%rcx
- jae bad_from_user
- movl $1,%ecx /* set zero flag */
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ jc bad_from_user
+ cmpq TI_addr_limit(%rax),%rcx
+ ja bad_from_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
-ENDPROC(copy_from_user)
-
+ENDPROC(_copy_from_user)
+
.section .fixup,"ax"
/* must zero dest */
+ENTRY(bad_from_user)
bad_from_user:
CFI_STARTPROC
movl %edx,%ecx
@@ -81,274 +110,185 @@ bad_from_user:
rep
stosb
bad_to_user:
- movl %edx,%eax
+ movl %edx,%eax
ret
CFI_ENDPROC
-END(bad_from_user)
+ENDPROC(bad_from_user)
.previous
-
-
+
/*
* copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
- *
- * Input:
+ * This version is for CPUs like P4 that don't have efficient micro
+ * code for rep movsq
+ *
+ * Input:
* rdi destination
* rsi source
* rdx count
- * ecx zero flag -- if true zero destination on error
*
- * Output:
+ * Output:
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_generic_unrolled)
CFI_STARTPROC
- pushq %rbx
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbx, 0
- pushq %rcx
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rcx, 0
- xorl %eax,%eax /*zero for the exception handler */
-
-#ifdef FIX_ALIGNMENT
- /* check for bad alignment of destination */
- movl %edi,%ecx
- andl $7,%ecx
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
-
- movq %rdx,%rcx
-
- movl $64,%ebx
- shrq $6,%rdx
- decq %rdx
- js .Lhandle_tail
-
- .p2align 4
-.Lloop:
-.Ls1: movq (%rsi),%r11
-.Ls2: movq 1*8(%rsi),%r8
-.Ls3: movq 2*8(%rsi),%r9
-.Ls4: movq 3*8(%rsi),%r10
-.Ld1: movq %r11,(%rdi)
-.Ld2: movq %r8,1*8(%rdi)
-.Ld3: movq %r9,2*8(%rdi)
-.Ld4: movq %r10,3*8(%rdi)
-
-.Ls5: movq 4*8(%rsi),%r11
-.Ls6: movq 5*8(%rsi),%r8
-.Ls7: movq 6*8(%rsi),%r9
-.Ls8: movq 7*8(%rsi),%r10
-.Ld5: movq %r11,4*8(%rdi)
-.Ld6: movq %r8,5*8(%rdi)
-.Ld7: movq %r9,6*8(%rdi)
-.Ld8: movq %r10,7*8(%rdi)
-
- decq %rdx
-
+ ASM_STAC
+ cmpl $8,%edx
+ jb 20f /* less then 8 bytes, go to byte copy loop */
+ ALIGN_DESTINATION
+ movl %edx,%ecx
+ andl $63,%edx
+ shrl $6,%ecx
+ jz 17f
+1: movq (%rsi),%r8
+2: movq 1*8(%rsi),%r9
+3: movq 2*8(%rsi),%r10
+4: movq 3*8(%rsi),%r11
+5: movq %r8,(%rdi)
+6: movq %r9,1*8(%rdi)
+7: movq %r10,2*8(%rdi)
+8: movq %r11,3*8(%rdi)
+9: movq 4*8(%rsi),%r8
+10: movq 5*8(%rsi),%r9
+11: movq 6*8(%rsi),%r10
+12: movq 7*8(%rsi),%r11
+13: movq %r8,4*8(%rdi)
+14: movq %r9,5*8(%rdi)
+15: movq %r10,6*8(%rdi)
+16: movq %r11,7*8(%rdi)
leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
-
- jns .Lloop
-
- .p2align 4
-.Lhandle_tail:
- movl %ecx,%edx
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lhandle_7
- movl $8,%ebx
- .p2align 4
-.Lloop_8:
-.Ls9: movq (%rsi),%r8
-.Ld9: movq %r8,(%rdi)
decl %ecx
- leaq 8(%rdi),%rdi
+ jnz 1b
+17: movl %edx,%ecx
+ andl $7,%edx
+ shrl $3,%ecx
+ jz 20f
+18: movq (%rsi),%r8
+19: movq %r8,(%rdi)
leaq 8(%rsi),%rsi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
- .p2align 4
-.Lloop_1:
-.Ls10: movb (%rsi),%bl
-.Ld10: movb %bl,(%rdi)
- incq %rdi
- incq %rsi
+ leaq 8(%rdi),%rdi
decl %ecx
- jnz .Lloop_1
-
- CFI_REMEMBER_STATE
-.Lende:
- popq %rcx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rcx
- popq %rbx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rbx
- ret
- CFI_RESTORE_STATE
-
-#ifdef FIX_ALIGNMENT
- /* align destination */
- .p2align 4
-.Lbad_alignment:
- movl $8,%r9d
- subl %ecx,%r9d
- movl %r9d,%ecx
- cmpq %r9,%rdx
- jz .Lhandle_7
- js .Lhandle_7
-.Lalign_1:
-.Ls11: movb (%rsi),%bl
-.Ld11: movb %bl,(%rdi)
+ jnz 18b
+20: andl %edx,%edx
+ jz 23f
+ movl %edx,%ecx
+21: movb (%rsi),%al
+22: movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
- jnz .Lalign_1
- subq %r9,%rdx
- jmp .Lafter_bad_alignment
-#endif
+ jnz 21b
+23: xor %eax,%eax
+ ASM_CLAC
+ ret
- /* table sorted by exception address */
- .section __ex_table,"a"
- .align 8
- .quad .Ls1,.Ls1e
- .quad .Ls2,.Ls2e
- .quad .Ls3,.Ls3e
- .quad .Ls4,.Ls4e
- .quad .Ld1,.Ls1e
- .quad .Ld2,.Ls2e
- .quad .Ld3,.Ls3e
- .quad .Ld4,.Ls4e
- .quad .Ls5,.Ls5e
- .quad .Ls6,.Ls6e
- .quad .Ls7,.Ls7e
- .quad .Ls8,.Ls8e
- .quad .Ld5,.Ls5e
- .quad .Ld6,.Ls6e
- .quad .Ld7,.Ls7e
- .quad .Ld8,.Ls8e
- .quad .Ls9,.Le_quad
- .quad .Ld9,.Le_quad
- .quad .Ls10,.Le_byte
- .quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
- .quad .Ls11,.Lzero_rest
- .quad .Ld11,.Lzero_rest
-#endif
- .quad .Le5,.Le_zero
+ .section .fixup,"ax"
+30: shll $6,%ecx
+ addl %ecx,%edx
+ jmp 60f
+40: leal (%rdx,%rcx,8),%edx
+ jmp 60f
+50: movl %ecx,%edx
+60: jmp copy_user_handle_tail /* ecx is zerorest also */
.previous
- /* compute 64-offset for main loop. 8 bytes accuracy with error on the
- pessimistic side. this is gross. it would be better to fix the
- interface. */
- /* eax: zero, ebx: 64 */
-.Ls1e: addl $8,%eax
-.Ls2e: addl $8,%eax
-.Ls3e: addl $8,%eax
-.Ls4e: addl $8,%eax
-.Ls5e: addl $8,%eax
-.Ls6e: addl $8,%eax
-.Ls7e: addl $8,%eax
-.Ls8e: addl $8,%eax
- addq %rbx,%rdi /* +64 */
- subq %rax,%rdi /* correct destination with computed offset */
-
- shlq $6,%rdx /* loop counter * 64 (stride length) */
- addq %rax,%rdx /* add offset to loopcnt */
- andl $63,%ecx /* remaining bytes */
- addq %rcx,%rdx /* add them */
- jmp .Lzero_rest
-
- /* exception on quad word loop in tail handling */
- /* ecx: loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
- shll $3,%ecx
- andl $7,%edx
- addl %ecx,%edx
- /* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
- cmpl $0,(%rsp)
- jz .Le_zero
- movq %rdx,%rcx
-.Le_byte:
- xorl %eax,%eax
-.Le5: rep
- stosb
- /* when there is another exception while zeroing the rest just return */
-.Le_zero:
- movq %rdx,%rax
- jmp .Lende
+ _ASM_EXTABLE(1b,30b)
+ _ASM_EXTABLE(2b,30b)
+ _ASM_EXTABLE(3b,30b)
+ _ASM_EXTABLE(4b,30b)
+ _ASM_EXTABLE(5b,30b)
+ _ASM_EXTABLE(6b,30b)
+ _ASM_EXTABLE(7b,30b)
+ _ASM_EXTABLE(8b,30b)
+ _ASM_EXTABLE(9b,30b)
+ _ASM_EXTABLE(10b,30b)
+ _ASM_EXTABLE(11b,30b)
+ _ASM_EXTABLE(12b,30b)
+ _ASM_EXTABLE(13b,30b)
+ _ASM_EXTABLE(14b,30b)
+ _ASM_EXTABLE(15b,30b)
+ _ASM_EXTABLE(16b,30b)
+ _ASM_EXTABLE(18b,40b)
+ _ASM_EXTABLE(19b,40b)
+ _ASM_EXTABLE(21b,50b)
+ _ASM_EXTABLE(22b,50b)
CFI_ENDPROC
-ENDPROC(copy_user_generic)
-
+ENDPROC(copy_user_generic_unrolled)
- /* Some CPUs run faster using the string copy instructions.
- This is also a lot simpler. Use them when possible.
- Patch in jmps to this code instead of copying it fully
- to avoid unwanted aliasing in the exception tables. */
-
- /* rdi destination
- * rsi source
- * rdx count
- * ecx zero flag
- *
- * Output:
- * eax uncopied bytes or 0 if successfull.
- *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- */
+/* Some CPUs run faster using the string copy instructions.
+ * This is also a lot simpler. Use them when possible.
+ *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
ENTRY(copy_user_generic_string)
CFI_STARTPROC
- movl %ecx,%r8d /* save zero flag */
+ ASM_STAC
+ cmpl $8,%edx
+ jb 2f /* less than 8 bytes, go to byte copy loop */
+ ALIGN_DESTINATION
movl %edx,%ecx
shrl $3,%ecx
- andl $7,%edx
- jz 10f
-1: rep
- movsq
- movl %edx,%ecx
-2: rep
+ andl $7,%edx
+1: rep
+ movsq
+2: movl %edx,%ecx
+3: rep
movsb
-9: movl %ecx,%eax
+ xorl %eax,%eax
+ ASM_CLAC
ret
- /* multiple of 8 byte */
-10: rep
- movsq
- xor %eax,%eax
- ret
+ .section .fixup,"ax"
+11: leal (%rdx,%rcx,8),%ecx
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
- /* exception handling */
-3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */
- jmp 6f
-5: movl %ecx,%eax /* exception on byte loop */
- /* eax: left over bytes */
-6: testl %r8d,%r8d /* zero flag set? */
- jz 7f
- movl %eax,%ecx /* initialize x86 loop counter */
- push %rax
- xorl %eax,%eax
-8: rep
- stosb /* zero the rest */
-11: pop %rax
-7: ret
+ _ASM_EXTABLE(1b,11b)
+ _ASM_EXTABLE(3b,12b)
CFI_ENDPROC
-END(copy_user_generic_c)
+ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+ CFI_STARTPROC
+ ASM_STAC
+ movl %edx,%ecx
+1: rep
+ movsb
+ xorl %eax,%eax
+ ASM_CLAC
+ ret
- .section __ex_table,"a"
- .quad 1b,3b
- .quad 2b,5b
- .quad 8b,11b
- .quad 10b,3b
+ .section .fixup,"ax"
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
.previous
+
+ _ASM_EXTABLE(1b,12b)
+ CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 5196762b3b0..6a4f43c2d9e 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -1,4 +1,6 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs.
+/*
+ * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
+ * Copyright 2002 Andi Kleen, SuSE Labs.
* Subject to the GNU Public License v2.
*
* Functions to copy from and to user space.
@@ -12,207 +14,123 @@
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
-#include <asm/cpufeature.h>
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination/source out of cache for more performance.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- * rcx zero flag when 1 zero on exception
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-ENTRY(__copy_user_nocache)
- CFI_STARTPROC
- pushq %rbx
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbx, 0
- pushq %rcx /* save zero flag */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rcx, 0
-
- xorl %eax,%eax /* zero for the exception handler */
+#include <asm/asm.h>
+#include <asm/smap.h>
+ .macro ALIGN_DESTINATION
#ifdef FIX_ALIGNMENT
/* check for bad alignment of destination */
movl %edi,%ecx
andl $7,%ecx
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
-
- movq %rdx,%rcx
-
- movl $64,%ebx
- shrq $6,%rdx
- decq %rdx
- js .Lhandle_tail
-
- .p2align 4
-.Lloop:
-.Ls1: movq (%rsi),%r11
-.Ls2: movq 1*8(%rsi),%r8
-.Ls3: movq 2*8(%rsi),%r9
-.Ls4: movq 3*8(%rsi),%r10
-.Ld1: movnti %r11,(%rdi)
-.Ld2: movnti %r8,1*8(%rdi)
-.Ld3: movnti %r9,2*8(%rdi)
-.Ld4: movnti %r10,3*8(%rdi)
-
-.Ls5: movq 4*8(%rsi),%r11
-.Ls6: movq 5*8(%rsi),%r8
-.Ls7: movq 6*8(%rsi),%r9
-.Ls8: movq 7*8(%rsi),%r10
-.Ld5: movnti %r11,4*8(%rdi)
-.Ld6: movnti %r8,5*8(%rdi)
-.Ld7: movnti %r9,6*8(%rdi)
-.Ld8: movnti %r10,7*8(%rdi)
+ jz 102f /* already aligned */
+ subl $8,%ecx
+ negl %ecx
+ subl %ecx,%edx
+100: movb (%rsi),%al
+101: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz 100b
+102:
+ .section .fixup,"ax"
+103: addl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
- dec %rdx
+ _ASM_EXTABLE(100b,103b)
+ _ASM_EXTABLE(101b,103b)
+#endif
+ .endm
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ */
+ENTRY(__copy_user_nocache)
+ CFI_STARTPROC
+ ASM_STAC
+ cmpl $8,%edx
+ jb 20f /* less then 8 bytes, go to byte copy loop */
+ ALIGN_DESTINATION
+ movl %edx,%ecx
+ andl $63,%edx
+ shrl $6,%ecx
+ jz 17f
+1: movq (%rsi),%r8
+2: movq 1*8(%rsi),%r9
+3: movq 2*8(%rsi),%r10
+4: movq 3*8(%rsi),%r11
+5: movnti %r8,(%rdi)
+6: movnti %r9,1*8(%rdi)
+7: movnti %r10,2*8(%rdi)
+8: movnti %r11,3*8(%rdi)
+9: movq 4*8(%rsi),%r8
+10: movq 5*8(%rsi),%r9
+11: movq 6*8(%rsi),%r10
+12: movq 7*8(%rsi),%r11
+13: movnti %r8,4*8(%rdi)
+14: movnti %r9,5*8(%rdi)
+15: movnti %r10,6*8(%rdi)
+16: movnti %r11,7*8(%rdi)
leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
-
- jns .Lloop
-
- .p2align 4
-.Lhandle_tail:
- movl %ecx,%edx
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lhandle_7
- movl $8,%ebx
- .p2align 4
-.Lloop_8:
-.Ls9: movq (%rsi),%r8
-.Ld9: movnti %r8,(%rdi)
decl %ecx
- leaq 8(%rdi),%rdi
+ jnz 1b
+17: movl %edx,%ecx
+ andl $7,%edx
+ shrl $3,%ecx
+ jz 20f
+18: movq (%rsi),%r8
+19: movnti %r8,(%rdi)
leaq 8(%rsi),%rsi
- jnz .Lloop_8
-
-.Lhandle_7:
+ leaq 8(%rdi),%rdi
+ decl %ecx
+ jnz 18b
+20: andl %edx,%edx
+ jz 23f
movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
- .p2align 4
-.Lloop_1:
-.Ls10: movb (%rsi),%bl
-.Ld10: movb %bl,(%rdi)
- incq %rdi
+21: movb (%rsi),%al
+22: movb %al,(%rdi)
incq %rsi
+ incq %rdi
decl %ecx
- jnz .Lloop_1
-
- CFI_REMEMBER_STATE
-.Lende:
- popq %rcx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE %rcx
- popq %rbx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rbx
+ jnz 21b
+23: xorl %eax,%eax
+ ASM_CLAC
sfence
ret
- CFI_RESTORE_STATE
-
-#ifdef FIX_ALIGNMENT
- /* align destination */
- .p2align 4
-.Lbad_alignment:
- movl $8,%r9d
- subl %ecx,%r9d
- movl %r9d,%ecx
- cmpq %r9,%rdx
- jz .Lhandle_7
- js .Lhandle_7
-.Lalign_1:
-.Ls11: movb (%rsi),%bl
-.Ld11: movb %bl,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz .Lalign_1
- subq %r9,%rdx
- jmp .Lafter_bad_alignment
-#endif
- /* table sorted by exception address */
- .section __ex_table,"a"
- .align 8
- .quad .Ls1,.Ls1e
- .quad .Ls2,.Ls2e
- .quad .Ls3,.Ls3e
- .quad .Ls4,.Ls4e
- .quad .Ld1,.Ls1e
- .quad .Ld2,.Ls2e
- .quad .Ld3,.Ls3e
- .quad .Ld4,.Ls4e
- .quad .Ls5,.Ls5e
- .quad .Ls6,.Ls6e
- .quad .Ls7,.Ls7e
- .quad .Ls8,.Ls8e
- .quad .Ld5,.Ls5e
- .quad .Ld6,.Ls6e
- .quad .Ld7,.Ls7e
- .quad .Ld8,.Ls8e
- .quad .Ls9,.Le_quad
- .quad .Ld9,.Le_quad
- .quad .Ls10,.Le_byte
- .quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
- .quad .Ls11,.Lzero_rest
- .quad .Ld11,.Lzero_rest
-#endif
- .quad .Le5,.Le_zero
+ .section .fixup,"ax"
+30: shll $6,%ecx
+ addl %ecx,%edx
+ jmp 60f
+40: lea (%rdx,%rcx,8),%rdx
+ jmp 60f
+50: movl %ecx,%edx
+60: sfence
+ jmp copy_user_handle_tail
.previous
- /* compute 64-offset for main loop. 8 bytes accuracy with error on the
- pessimistic side. this is gross. it would be better to fix the
- interface. */
- /* eax: zero, ebx: 64 */
-.Ls1e: addl $8,%eax
-.Ls2e: addl $8,%eax
-.Ls3e: addl $8,%eax
-.Ls4e: addl $8,%eax
-.Ls5e: addl $8,%eax
-.Ls6e: addl $8,%eax
-.Ls7e: addl $8,%eax
-.Ls8e: addl $8,%eax
- addq %rbx,%rdi /* +64 */
- subq %rax,%rdi /* correct destination with computed offset */
-
- shlq $6,%rdx /* loop counter * 64 (stride length) */
- addq %rax,%rdx /* add offset to loopcnt */
- andl $63,%ecx /* remaining bytes */
- addq %rcx,%rdx /* add them */
- jmp .Lzero_rest
-
- /* exception on quad word loop in tail handling */
- /* ecx: loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
- shll $3,%ecx
- andl $7,%edx
- addl %ecx,%edx
- /* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
- cmpl $0,(%rsp) /* zero flag set? */
- jz .Le_zero
- movq %rdx,%rcx
-.Le_byte:
- xorl %eax,%eax
-.Le5: rep
- stosb
- /* when there is another exception while zeroing the rest just return */
-.Le_zero:
- movq %rdx,%rax
- jmp .Lende
+ _ASM_EXTABLE(1b,30b)
+ _ASM_EXTABLE(2b,30b)
+ _ASM_EXTABLE(3b,30b)
+ _ASM_EXTABLE(4b,30b)
+ _ASM_EXTABLE(5b,30b)
+ _ASM_EXTABLE(6b,30b)
+ _ASM_EXTABLE(7b,30b)
+ _ASM_EXTABLE(8b,30b)
+ _ASM_EXTABLE(9b,30b)
+ _ASM_EXTABLE(10b,30b)
+ _ASM_EXTABLE(11b,30b)
+ _ASM_EXTABLE(12b,30b)
+ _ASM_EXTABLE(13b,30b)
+ _ASM_EXTABLE(14b,30b)
+ _ASM_EXTABLE(15b,30b)
+ _ASM_EXTABLE(16b,30b)
+ _ASM_EXTABLE(18b,40b)
+ _ASM_EXTABLE(19b,40b)
+ _ASM_EXTABLE(21b,50b)
+ _ASM_EXTABLE(22b,50b)
CFI_ENDPROC
ENDPROC(__copy_user_nocache)
-
-
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578e..2419d5fefae 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
/*
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- *
+ * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
+ *
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of this archive
* for more details. No warranty for anything given at all.
@@ -8,85 +8,77 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/errno.h>
+#include <asm/asm.h>
/*
* Checksum copy with exception handling.
- * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
* destination is zeroed.
- *
+ *
* Input
* rdi source
* rsi destination
* edx len (32bit)
- * ecx sum (32bit)
+ * ecx sum (32bit)
* r8 src_err_ptr (int)
* r9 dst_err_ptr (int)
*
* Output
* eax 64bit sum. undefined in case of exception.
- *
- * Wrappers need to take care of valid exception sum and zeroing.
+ *
+ * Wrappers need to take care of valid exception sum and zeroing.
* They also should align source or destination to 8 bytes.
*/
.macro source
10:
- .section __ex_table,"a"
- .align 8
- .quad 10b,.Lbad_source
- .previous
+ _ASM_EXTABLE(10b, .Lbad_source)
.endm
-
+
.macro dest
20:
- .section __ex_table,"a"
- .align 8
- .quad 20b,.Lbad_dest
- .previous
+ _ASM_EXTABLE(20b, .Lbad_dest)
.endm
-
+
.macro ignore L=.Lignore
30:
- .section __ex_table,"a"
- .align 8
- .quad 30b,\L
- .previous
+ _ASM_EXTABLE(30b, \L)
.endm
-
-
+
+
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
- cmpl $3*64,%edx
- jle .Lignore
+ cmpl $3*64, %edx
+ jle .Lignore
-.Lignore:
- subq $7*8,%rsp
+.Lignore:
+ subq $7*8, %rsp
CFI_ADJUST_CFA_OFFSET 7*8
- movq %rbx,2*8(%rsp)
+ movq %rbx, 2*8(%rsp)
CFI_REL_OFFSET rbx, 2*8
- movq %r12,3*8(%rsp)
+ movq %r12, 3*8(%rsp)
CFI_REL_OFFSET r12, 3*8
- movq %r14,4*8(%rsp)
+ movq %r14, 4*8(%rsp)
CFI_REL_OFFSET r14, 4*8
- movq %r13,5*8(%rsp)
+ movq %r13, 5*8(%rsp)
CFI_REL_OFFSET r13, 5*8
- movq %rbp,6*8(%rsp)
+ movq %rbp, 6*8(%rsp)
CFI_REL_OFFSET rbp, 6*8
- movq %r8,(%rsp)
- movq %r9,1*8(%rsp)
-
- movl %ecx,%eax
- movl %edx,%ecx
+ movq %r8, (%rsp)
+ movq %r9, 1*8(%rsp)
- xorl %r9d,%r9d
- movq %rcx,%r12
+ movl %ecx, %eax
+ movl %edx, %ecx
- shrq $6,%r12
- jz .Lhandle_tail /* < 64 */
+ xorl %r9d, %r9d
+ movq %rcx, %r12
+
+ shrq $6, %r12
+ jz .Lhandle_tail /* < 64 */
clc
-
+
/* main loop. clear in 64 byte blocks */
/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
/* r11: temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +86,156 @@ ENTRY(csum_partial_copy_generic)
.p2align 4
.Lloop:
source
- movq (%rdi),%rbx
+ movq (%rdi), %rbx
source
- movq 8(%rdi),%r8
+ movq 8(%rdi), %r8
source
- movq 16(%rdi),%r11
+ movq 16(%rdi), %r11
source
- movq 24(%rdi),%rdx
+ movq 24(%rdi), %rdx
source
- movq 32(%rdi),%r10
+ movq 32(%rdi), %r10
source
- movq 40(%rdi),%rbp
+ movq 40(%rdi), %rbp
source
- movq 48(%rdi),%r14
+ movq 48(%rdi), %r14
source
- movq 56(%rdi),%r13
-
+ movq 56(%rdi), %r13
+
ignore 2f
prefetcht0 5*64(%rdi)
-2:
- adcq %rbx,%rax
- adcq %r8,%rax
- adcq %r11,%rax
- adcq %rdx,%rax
- adcq %r10,%rax
- adcq %rbp,%rax
- adcq %r14,%rax
- adcq %r13,%rax
+2:
+ adcq %rbx, %rax
+ adcq %r8, %rax
+ adcq %r11, %rax
+ adcq %rdx, %rax
+ adcq %r10, %rax
+ adcq %rbp, %rax
+ adcq %r14, %rax
+ adcq %r13, %rax
decl %r12d
-
+
dest
- movq %rbx,(%rsi)
+ movq %rbx, (%rsi)
dest
- movq %r8,8(%rsi)
+ movq %r8, 8(%rsi)
dest
- movq %r11,16(%rsi)
+ movq %r11, 16(%rsi)
dest
- movq %rdx,24(%rsi)
+ movq %rdx, 24(%rsi)
dest
- movq %r10,32(%rsi)
+ movq %r10, 32(%rsi)
dest
- movq %rbp,40(%rsi)
+ movq %rbp, 40(%rsi)
dest
- movq %r14,48(%rsi)
+ movq %r14, 48(%rsi)
dest
- movq %r13,56(%rsi)
-
+ movq %r13, 56(%rsi)
+
3:
-
- leaq 64(%rdi),%rdi
- leaq 64(%rsi),%rsi
- jnz .Lloop
+ leaq 64(%rdi), %rdi
+ leaq 64(%rsi), %rsi
- adcq %r9,%rax
+ jnz .Lloop
- /* do last upto 56 bytes */
+ adcq %r9, %rax
+
+ /* do last up to 56 bytes */
.Lhandle_tail:
/* ecx: count */
- movl %ecx,%r10d
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lfold
+ movl %ecx, %r10d
+ andl $63, %ecx
+ shrl $3, %ecx
+ jz .Lfold
clc
.p2align 4
-.Lloop_8:
+.Lloop_8:
source
- movq (%rdi),%rbx
- adcq %rbx,%rax
+ movq (%rdi), %rbx
+ adcq %rbx, %rax
decl %ecx
dest
- movq %rbx,(%rsi)
- leaq 8(%rsi),%rsi /* preserve carry */
- leaq 8(%rdi),%rdi
+ movq %rbx, (%rsi)
+ leaq 8(%rsi), %rsi /* preserve carry */
+ leaq 8(%rdi), %rdi
jnz .Lloop_8
- adcq %r9,%rax /* add in carry */
+ adcq %r9, %rax /* add in carry */
.Lfold:
/* reduce checksum to 32bits */
- movl %eax,%ebx
- shrq $32,%rax
- addl %ebx,%eax
- adcl %r9d,%eax
+ movl %eax, %ebx
+ shrq $32, %rax
+ addl %ebx, %eax
+ adcl %r9d, %eax
- /* do last upto 6 bytes */
+ /* do last up to 6 bytes */
.Lhandle_7:
- movl %r10d,%ecx
- andl $7,%ecx
- shrl $1,%ecx
+ movl %r10d, %ecx
+ andl $7, %ecx
+ shrl $1, %ecx
jz .Lhandle_1
- movl $2,%edx
- xorl %ebx,%ebx
- clc
+ movl $2, %edx
+ xorl %ebx, %ebx
+ clc
.p2align 4
-.Lloop_1:
+.Lloop_1:
source
- movw (%rdi),%bx
- adcl %ebx,%eax
+ movw (%rdi), %bx
+ adcl %ebx, %eax
decl %ecx
dest
- movw %bx,(%rsi)
- leaq 2(%rdi),%rdi
- leaq 2(%rsi),%rsi
+ movw %bx, (%rsi)
+ leaq 2(%rdi), %rdi
+ leaq 2(%rsi), %rsi
jnz .Lloop_1
- adcl %r9d,%eax /* add in carry */
-
+ adcl %r9d, %eax /* add in carry */
+
/* handle last odd byte */
.Lhandle_1:
- testl $1,%r10d
+ testl $1, %r10d
jz .Lende
- xorl %ebx,%ebx
+ xorl %ebx, %ebx
source
- movb (%rdi),%bl
+ movb (%rdi), %bl
dest
- movb %bl,(%rsi)
- addl %ebx,%eax
- adcl %r9d,%eax /* carry */
-
+ movb %bl, (%rsi)
+ addl %ebx, %eax
+ adcl %r9d, %eax /* carry */
+
CFI_REMEMBER_STATE
.Lende:
- movq 2*8(%rsp),%rbx
+ movq 2*8(%rsp), %rbx
CFI_RESTORE rbx
- movq 3*8(%rsp),%r12
+ movq 3*8(%rsp), %r12
CFI_RESTORE r12
- movq 4*8(%rsp),%r14
+ movq 4*8(%rsp), %r14
CFI_RESTORE r14
- movq 5*8(%rsp),%r13
+ movq 5*8(%rsp), %r13
CFI_RESTORE r13
- movq 6*8(%rsp),%rbp
+ movq 6*8(%rsp), %rbp
CFI_RESTORE rbp
- addq $7*8,%rsp
+ addq $7*8, %rsp
CFI_ADJUST_CFA_OFFSET -7*8
ret
CFI_RESTORE_STATE
/* Exception handlers. Very simple, zeroing is done in the wrappers */
.Lbad_source:
- movq (%rsp),%rax
- testq %rax,%rax
+ movq (%rsp), %rax
+ testq %rax, %rax
jz .Lende
- movl $-EFAULT,(%rax)
+ movl $-EFAULT, (%rax)
jmp .Lende
-
+
.Lbad_dest:
- movq 8(%rsp),%rax
- testq %rax,%rax
- jz .Lende
- movl $-EFAULT,(%rax)
+ movq 8(%rsp), %rax
+ testq %rax, %rax
+ jz .Lende
+ movl $-EFAULT, (%rax)
jmp .Lende
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bc503f50690..9845371c5c3 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
count64--;
}
- /* last upto 7 8byte blocks */
+ /* last up to 7 8byte blocks */
count %= 8;
while (count) {
asm("addq %1,%0\n\t"
@@ -136,8 +136,6 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
(__force u32)sum);
}
-EXPORT_SYMBOL(csum_partial);
-
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 459b58a8a15..7609e0e421e 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -6,6 +6,7 @@
*/
#include <asm/checksum.h>
#include <linux/module.h>
+#include <asm/smap.h>
/**
* csum_partial_copy_from_user - Copy and checksum from user space.
@@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst,
len -= 2;
}
}
+ stac();
isum = csum_partial_copy_generic((__force const void *)src,
dst, len, isum, errp, NULL);
+ clac();
if (unlikely(*errp))
goto out_err;
@@ -82,6 +85,8 @@ __wsum
csum_partial_copy_to_user(const void *src, void __user *dst,
int len, __wsum isum, int *errp)
{
+ __wsum ret;
+
might_sleep();
if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
@@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst,
}
*errp = 0;
- return csum_partial_copy_generic(src, (void __force *)dst,
- len, isum, NULL, errp);
+ stac();
+ ret = csum_partial_copy_generic(src, (void __force *)dst,
+ len, isum, NULL, errp);
+ clac();
+ return ret;
}
EXPORT_SYMBOL(csum_partial_copy_to_user);
@@ -115,7 +123,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user);
* @src: source address
* @dst: destination address
* @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
+ * @sum: initial sum that is added into the result (32bit unfolded)
*
* Returns an 32bit unfolded checksum of the buffer.
*/
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay.c
index 4535e6d147a..39d6a3db0b9 100644
--- a/arch/x86/lib/delay_32.c
+++ b/arch/x86/lib/delay.c
@@ -3,6 +3,7 @@
*
* Copyright (C) 1993 Linus Torvalds
* Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ * Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com>
*
* The __delay function must _NOT_ be inlined as its execution time
* depends wildly on alignment on many x86 processors. The additional
@@ -15,7 +16,6 @@
#include <linux/timex.h>
#include <linux/preempt.h>
#include <linux/delay.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include <asm/delay.h>
@@ -28,29 +28,61 @@
/* simple loop based delay: */
static void delay_loop(unsigned long loops)
{
- int d0;
-
- __asm__ __volatile__(
- "\tjmp 1f\n"
- ".align 16\n"
- "1:\tjmp 2f\n"
- ".align 16\n"
- "2:\tdecl %0\n\tjns 2b"
- :"=&a" (d0)
- :"0" (loops));
+ asm volatile(
+ " test %0,%0 \n"
+ " jz 3f \n"
+ " jmp 1f \n"
+
+ ".align 16 \n"
+ "1: jmp 2f \n"
+
+ ".align 16 \n"
+ "2: dec %0 \n"
+ " jnz 2b \n"
+ "3: dec %0 \n"
+
+ : /* we don't need output */
+ :"a" (loops)
+ );
}
/* TSC based delay: */
-static void delay_tsc(unsigned long loops)
+static void delay_tsc(unsigned long __loops)
{
- unsigned long bclock, now;
+ u32 bclock, now, loops = __loops;
+ int cpu;
- preempt_disable(); /* TSC's are per-cpu */
+ preempt_disable();
+ cpu = smp_processor_id();
+ rdtsc_barrier();
rdtscl(bclock);
- do {
- rep_nop();
+ for (;;) {
+ rdtsc_barrier();
rdtscl(now);
- } while ((now-bclock) < loops);
+ if ((now - bclock) >= loops)
+ break;
+
+ /* Allow RT tasks to run */
+ preempt_enable();
+ rep_nop();
+ preempt_disable();
+
+ /*
+ * It is possible that we moved to another CPU, and
+ * since TSC's are per-cpu we need to calculate
+ * that. The delay must guarantee that we wait "at
+ * least" the amount of time. Being moved to another
+ * CPU could make the wait longer but we just need to
+ * make sure we waited long enough. Rebalance the
+ * counter for this CPU.
+ */
+ if (unlikely(cpu != smp_processor_id())) {
+ loops -= (now - bclock);
+ cpu = smp_processor_id();
+ rdtsc_barrier();
+ rdtscl(bclock);
+ }
+ }
preempt_enable();
}
@@ -65,10 +97,10 @@ void use_tsc_delay(void)
delay_fn = delay_tsc;
}
-int __devinit read_current_timer(unsigned long *timer_val)
+int read_current_timer(unsigned long *timer_val)
{
if (delay_fn == delay_tsc) {
- rdtscl(*timer_val);
+ rdtscll(*timer_val);
return 0;
}
return -1;
@@ -78,31 +110,30 @@ void __delay(unsigned long loops)
{
delay_fn(loops);
}
+EXPORT_SYMBOL(__delay);
inline void __const_udelay(unsigned long xloops)
{
int d0;
xloops *= 4;
- __asm__("mull %0"
+ asm("mull %%edx"
:"=d" (xloops), "=&a" (d0)
:"1" (xloops), "0"
- (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
+ (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
__delay(++xloops);
}
+EXPORT_SYMBOL(__const_udelay);
void __udelay(unsigned long usecs)
{
__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
+EXPORT_SYMBOL(__udelay);
void __ndelay(unsigned long nsecs)
{
__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
-
-EXPORT_SYMBOL(__delay);
-EXPORT_SYMBOL(__const_udelay);
-EXPORT_SYMBOL(__udelay);
EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
deleted file mode 100644
index bbc61051851..00000000000
--- a/arch/x86/lib/delay_64.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Precise Delay Loops for x86-64
- *
- * Copyright (C) 1993 Linus Torvalds
- * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
- *
- * The __delay function must _NOT_ be inlined as its execution time
- * depends wildly on alignment on many x86 processors.
- */
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/timex.h>
-#include <linux/preempt.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-
-#include <asm/delay.h>
-#include <asm/msr.h>
-
-#ifdef CONFIG_SMP
-#include <asm/smp.h>
-#endif
-
-int __devinit read_current_timer(unsigned long *timer_value)
-{
- rdtscll(*timer_value);
- return 0;
-}
-
-void __delay(unsigned long loops)
-{
- unsigned bclock, now;
-
- preempt_disable(); /* TSC's are pre-cpu */
- rdtscl(bclock);
- do {
- rep_nop();
- rdtscl(now);
- }
- while ((now-bclock) < loops);
- preempt_enable();
-}
-EXPORT_SYMBOL(__delay);
-
-inline void __const_udelay(unsigned long xloops)
-{
- __delay(((xloops * HZ *
- cpu_data(raw_smp_processor_id()).loops_per_jiffy) >> 32) + 1);
-}
-EXPORT_SYMBOL(__const_udelay);
-
-void __udelay(unsigned long usecs)
-{
- __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
-}
-EXPORT_SYMBOL(__udelay);
-
-void __ndelay(unsigned long nsecs)
-{
- __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
-}
-EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
new file mode 100644
index 00000000000..a4512359656
--- /dev/null
+++ b/arch/x86/lib/getuser.S
@@ -0,0 +1,140 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ * (C) Copyright 2008 Glauber Costa
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __get_user_X
+ *
+ * Inputs: %[r|e]ax contains the address.
+ *
+ * Outputs: %[r|e]ax is error code (0 or -EFAULT)
+ * %[r|e]dx contains zero-extended value
+ * %ecx contains the high half for 32-bit __get_user_8
+ *
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/page_types.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+
+ .text
+ENTRY(__get_user_1)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%_ASM_DX)
+ cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user
+ ASM_STAC
+1: movzbl (%_ASM_AX),%edx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+ CFI_ENDPROC
+ENDPROC(__get_user_1)
+
+ENTRY(__get_user_2)
+ CFI_STARTPROC
+ add $1,%_ASM_AX
+ jc bad_get_user
+ GET_THREAD_INFO(%_ASM_DX)
+ cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user
+ ASM_STAC
+2: movzwl -1(%_ASM_AX),%edx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+ CFI_ENDPROC
+ENDPROC(__get_user_2)
+
+ENTRY(__get_user_4)
+ CFI_STARTPROC
+ add $3,%_ASM_AX
+ jc bad_get_user
+ GET_THREAD_INFO(%_ASM_DX)
+ cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user
+ ASM_STAC
+3: movl -3(%_ASM_AX),%edx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+ CFI_ENDPROC
+ENDPROC(__get_user_4)
+
+ENTRY(__get_user_8)
+ CFI_STARTPROC
+#ifdef CONFIG_X86_64
+ add $7,%_ASM_AX
+ jc bad_get_user
+ GET_THREAD_INFO(%_ASM_DX)
+ cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user
+ ASM_STAC
+4: movq -7(%_ASM_AX),%rdx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+#else
+ add $7,%_ASM_AX
+ jc bad_get_user_8
+ GET_THREAD_INFO(%_ASM_DX)
+ cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user_8
+ ASM_STAC
+4: movl -7(%_ASM_AX),%edx
+5: movl -3(%_ASM_AX),%ecx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+#endif
+ CFI_ENDPROC
+ENDPROC(__get_user_8)
+
+
+bad_get_user:
+ CFI_STARTPROC
+ xor %edx,%edx
+ mov $(-EFAULT),%_ASM_AX
+ ASM_CLAC
+ ret
+ CFI_ENDPROC
+END(bad_get_user)
+
+#ifdef CONFIG_X86_32
+bad_get_user_8:
+ CFI_STARTPROC
+ xor %edx,%edx
+ xor %ecx,%ecx
+ mov $(-EFAULT),%_ASM_AX
+ ASM_CLAC
+ ret
+ CFI_ENDPROC
+END(bad_get_user_8)
+#endif
+
+ _ASM_EXTABLE(1b,bad_get_user)
+ _ASM_EXTABLE(2b,bad_get_user)
+ _ASM_EXTABLE(3b,bad_get_user)
+#ifdef CONFIG_X86_64
+ _ASM_EXTABLE(4b,bad_get_user)
+#else
+ _ASM_EXTABLE(4b,bad_get_user_8)
+ _ASM_EXTABLE(5b,bad_get_user_8)
+#endif
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
deleted file mode 100644
index 6d84b53f12a..00000000000
--- a/arch/x86/lib/getuser_32.S
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * __get_user functions.
- *
- * (C) Copyright 1998 Linus Torvalds
- *
- * These functions have a non-standard call interface
- * to make them more efficient, especially as they
- * return an error value in addition to the "real"
- * return value.
- */
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/thread_info.h>
-
-
-/*
- * __get_user_X
- *
- * Inputs: %eax contains the address
- *
- * Outputs: %eax is error code (0 or -EFAULT)
- * %edx contains zero-extended value
- *
- * These functions should not modify any other registers,
- * as they get called from within inline assembly.
- */
-
-.text
-ENTRY(__get_user_1)
- CFI_STARTPROC
- GET_THREAD_INFO(%edx)
- cmpl TI_addr_limit(%edx),%eax
- jae bad_get_user
-1: movzbl (%eax),%edx
- xorl %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__get_user_1)
-
-ENTRY(__get_user_2)
- CFI_STARTPROC
- addl $1,%eax
- jc bad_get_user
- GET_THREAD_INFO(%edx)
- cmpl TI_addr_limit(%edx),%eax
- jae bad_get_user
-2: movzwl -1(%eax),%edx
- xorl %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__get_user_2)
-
-ENTRY(__get_user_4)
- CFI_STARTPROC
- addl $3,%eax
- jc bad_get_user
- GET_THREAD_INFO(%edx)
- cmpl TI_addr_limit(%edx),%eax
- jae bad_get_user
-3: movl -3(%eax),%edx
- xorl %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__get_user_4)
-
-bad_get_user:
- CFI_STARTPROC
- xorl %edx,%edx
- movl $-14,%eax
- ret
- CFI_ENDPROC
-END(bad_get_user)
-
-.section __ex_table,"a"
- .long 1b,bad_get_user
- .long 2b,bad_get_user
- .long 3b,bad_get_user
-.previous
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S
deleted file mode 100644
index 5448876261f..00000000000
--- a/arch/x86/lib/getuser_64.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * __get_user functions.
- *
- * (C) Copyright 1998 Linus Torvalds
- * (C) Copyright 2005 Andi Kleen
- *
- * These functions have a non-standard call interface
- * to make them more efficient, especially as they
- * return an error value in addition to the "real"
- * return value.
- */
-
-/*
- * __get_user_X
- *
- * Inputs: %rcx contains the address.
- * The register is modified, but all changes are undone
- * before returning because the C code doesn't know about it.
- *
- * Outputs: %rax is error code (0 or -EFAULT)
- * %rdx contains zero-extended value
- *
- * %r8 is destroyed.
- *
- * These functions should not modify any other registers,
- * as they get called from within inline assembly.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/page.h>
-#include <asm/errno.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-
- .text
-ENTRY(__get_user_1)
- CFI_STARTPROC
- GET_THREAD_INFO(%r8)
- cmpq threadinfo_addr_limit(%r8),%rcx
- jae bad_get_user
-1: movzb (%rcx),%edx
- xorl %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__get_user_1)
-
-ENTRY(__get_user_2)
- CFI_STARTPROC
- GET_THREAD_INFO(%r8)
- addq $1,%rcx
- jc 20f
- cmpq threadinfo_addr_limit(%r8),%rcx
- jae 20f
- decq %rcx
-2: movzwl (%rcx),%edx
- xorl %eax,%eax
- ret
-20: decq %rcx
- jmp bad_get_user
- CFI_ENDPROC
-ENDPROC(__get_user_2)
-
-ENTRY(__get_user_4)
- CFI_STARTPROC
- GET_THREAD_INFO(%r8)
- addq $3,%rcx
- jc 30f
- cmpq threadinfo_addr_limit(%r8),%rcx
- jae 30f
- subq $3,%rcx
-3: movl (%rcx),%edx
- xorl %eax,%eax
- ret
-30: subq $3,%rcx
- jmp bad_get_user
- CFI_ENDPROC
-ENDPROC(__get_user_4)
-
-ENTRY(__get_user_8)
- CFI_STARTPROC
- GET_THREAD_INFO(%r8)
- addq $7,%rcx
- jc 40f
- cmpq threadinfo_addr_limit(%r8),%rcx
- jae 40f
- subq $7,%rcx
-4: movq (%rcx),%rdx
- xorl %eax,%eax
- ret
-40: subq $7,%rcx
- jmp bad_get_user
- CFI_ENDPROC
-ENDPROC(__get_user_8)
-
-bad_get_user:
- CFI_STARTPROC
- xorl %edx,%edx
- movq $(-EFAULT),%rax
- ret
- CFI_ENDPROC
-END(bad_get_user)
-
-.section __ex_table,"a"
- .quad 1b,bad_get_user
- .quad 2b,bad_get_user
- .quad 3b,bad_get_user
- .quad 4b,bad_get_user
-.previous
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
new file mode 100644
index 00000000000..ff4fa51a5b1
--- /dev/null
+++ b/arch/x86/lib/hash.c
@@ -0,0 +1,92 @@
+/*
+ * Some portions derived from code covered by the following notice:
+ *
+ * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/hash.h>
+#include <linux/init.h>
+
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/hash.h>
+
+static inline u32 crc32_u32(u32 crc, u32 val)
+{
+#ifdef CONFIG_AS_CRC32
+ asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
+#else
+ asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
+#endif
+ return crc;
+}
+
+static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
+{
+ const u32 *p32 = (const u32 *) data;
+ u32 i, tmp = 0;
+
+ for (i = 0; i < len / 4; i++)
+ seed = crc32_u32(seed, *p32++);
+
+ switch (len & 3) {
+ case 3:
+ tmp |= *((const u8 *) p32 + 2) << 16;
+ /* fallthrough */
+ case 2:
+ tmp |= *((const u8 *) p32 + 1) << 8;
+ /* fallthrough */
+ case 1:
+ tmp |= *((const u8 *) p32);
+ seed = crc32_u32(seed, tmp);
+ break;
+ }
+
+ return seed;
+}
+
+static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
+{
+ const u32 *p32 = (const u32 *) data;
+ u32 i;
+
+ for (i = 0; i < len; i++)
+ seed = crc32_u32(seed, *p32++);
+
+ return seed;
+}
+
+void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
+{
+ if (cpu_has_xmm4_2) {
+ ops->hash = intel_crc4_2_hash;
+ ops->hash2 = intel_crc4_2_hash2;
+ }
+}
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 00000000000..c1f01a8e9f6
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,97 @@
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/insn.h>
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+ return inat_primary_table[opcode];
+}
+
+int inat_get_last_prefix_id(insn_byte_t last_pfx)
+{
+ insn_attr_t lpfx_attr;
+
+ lpfx_attr = inat_get_opcode_attribute(last_pfx);
+ return inat_last_prefix_id(lpfx_attr);
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
+ insn_attr_t esc_attr)
+{
+ const insn_attr_t *table;
+ int n;
+
+ n = inat_escape_id(esc_attr);
+
+ table = inat_escape_tables[n][0];
+ if (!table)
+ return 0;
+ if (inat_has_variant(table[opcode]) && lpfx_id) {
+ table = inat_escape_tables[n][lpfx_id];
+ if (!table)
+ return 0;
+ }
+ return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
+ insn_attr_t grp_attr)
+{
+ const insn_attr_t *table;
+ int n;
+
+ n = inat_group_id(grp_attr);
+
+ table = inat_group_tables[n][0];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
+ table = inat_group_tables[n][lpfx_id];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ }
+ return table[X86_MODRM_REG(modrm)] |
+ inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+ insn_byte_t vex_p)
+{
+ const insn_attr_t *table;
+ if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+ return 0;
+ /* At first, this checks the master table */
+ table = inat_avx_tables[vex_m][0];
+ if (!table)
+ return 0;
+ if (!inat_is_group(table[opcode]) && vex_p) {
+ /* If this is not a group, get attribute directly */
+ table = inat_avx_tables[vex_m][vex_p];
+ if (!table)
+ return 0;
+ }
+ return table[opcode];
+}
+
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644
index 00000000000..54fcffed28e
--- /dev/null
+++ b/arch/x86/lib/insn.c
@@ -0,0 +1,580 @@
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004, 2009
+ */
+
+#ifdef __KERNEL__
+#include <linux/string.h>
+#else
+#include <string.h>
+#endif
+#include <asm/inat.h>
+#include <asm/insn.h>
+
+/* Verify next sizeof(t) bytes can be on the same instruction */
+#define validate_next(t, insn, n) \
+ ((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE)
+
+#define __get_next(t, insn) \
+ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define __peek_nbyte_next(t, insn, n) \
+ ({ t r = *(t*)((insn)->next_byte + n); r; })
+
+#define get_next(t, insn) \
+ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
+
+#define peek_nbyte_next(t, insn, n) \
+ ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); })
+
+#define peek_next(t, insn) peek_nbyte_next(t, insn, 0)
+
+/**
+ * insn_init() - initialize struct insn
+ * @insn: &struct insn to be initialized
+ * @kaddr: address (in kernel memory) of instruction (or copy thereof)
+ * @x86_64: !0 for 64-bit kernel or 64-bit app
+ */
+void insn_init(struct insn *insn, const void *kaddr, int x86_64)
+{
+ memset(insn, 0, sizeof(*insn));
+ insn->kaddr = kaddr;
+ insn->next_byte = kaddr;
+ insn->x86_64 = x86_64 ? 1 : 0;
+ insn->opnd_bytes = 4;
+ if (x86_64)
+ insn->addr_bytes = 8;
+ else
+ insn->addr_bytes = 4;
+}
+
+/**
+ * insn_get_prefixes - scan x86 instruction prefix bytes
+ * @insn: &struct insn containing instruction
+ *
+ * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
+ * to point to the (first) opcode. No effect if @insn->prefixes.got
+ * is already set.
+ */
+void insn_get_prefixes(struct insn *insn)
+{
+ struct insn_field *prefixes = &insn->prefixes;
+ insn_attr_t attr;
+ insn_byte_t b, lb;
+ int i, nb;
+
+ if (prefixes->got)
+ return;
+
+ nb = 0;
+ lb = 0;
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ while (inat_is_legacy_prefix(attr)) {
+ /* Skip if same prefix */
+ for (i = 0; i < nb; i++)
+ if (prefixes->bytes[i] == b)
+ goto found;
+ if (nb == 4)
+ /* Invalid instruction */
+ break;
+ prefixes->bytes[nb++] = b;
+ if (inat_is_address_size_prefix(attr)) {
+ /* address size switches 2/4 or 4/8 */
+ if (insn->x86_64)
+ insn->addr_bytes ^= 12;
+ else
+ insn->addr_bytes ^= 6;
+ } else if (inat_is_operand_size_prefix(attr)) {
+ /* oprand size switches 2/4 */
+ insn->opnd_bytes ^= 6;
+ }
+found:
+ prefixes->nbytes++;
+ insn->next_byte++;
+ lb = b;
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ }
+ /* Set the last prefix */
+ if (lb && lb != insn->prefixes.bytes[3]) {
+ if (unlikely(insn->prefixes.bytes[3])) {
+ /* Swap the last prefix */
+ b = insn->prefixes.bytes[3];
+ for (i = 0; i < nb; i++)
+ if (prefixes->bytes[i] == lb)
+ prefixes->bytes[i] = b;
+ }
+ insn->prefixes.bytes[3] = lb;
+ }
+
+ /* Decode REX prefix */
+ if (insn->x86_64) {
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ if (inat_is_rex_prefix(attr)) {
+ insn->rex_prefix.value = b;
+ insn->rex_prefix.nbytes = 1;
+ insn->next_byte++;
+ if (X86_REX_W(b))
+ /* REX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ }
+ }
+ insn->rex_prefix.got = 1;
+
+ /* Decode VEX prefix */
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ if (inat_is_vex_prefix(attr)) {
+ insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
+ if (!insn->x86_64) {
+ /*
+ * In 32-bits mode, if the [7:6] bits (mod bits of
+ * ModRM) on the second byte are not 11b, it is
+ * LDS or LES.
+ */
+ if (X86_MODRM_MOD(b2) != 3)
+ goto vex_end;
+ }
+ insn->vex_prefix.bytes[0] = b;
+ insn->vex_prefix.bytes[1] = b2;
+ if (inat_is_vex3_prefix(attr)) {
+ b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+ insn->vex_prefix.bytes[2] = b2;
+ insn->vex_prefix.nbytes = 3;
+ insn->next_byte += 3;
+ if (insn->x86_64 && X86_VEX_W(b2))
+ /* VEX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ } else {
+ insn->vex_prefix.nbytes = 2;
+ insn->next_byte += 2;
+ }
+ }
+vex_end:
+ insn->vex_prefix.got = 1;
+
+ prefixes->got = 1;
+
+err_out:
+ return;
+}
+
+/**
+ * insn_get_opcode - collect opcode(s)
+ * @insn: &struct insn containing instruction
+ *
+ * Populates @insn->opcode, updates @insn->next_byte to point past the
+ * opcode byte(s), and set @insn->attr (except for groups).
+ * If necessary, first collects any preceding (prefix) bytes.
+ * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
+ * is already 1.
+ */
+void insn_get_opcode(struct insn *insn)
+{
+ struct insn_field *opcode = &insn->opcode;
+ insn_byte_t op;
+ int pfx_id;
+ if (opcode->got)
+ return;
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+
+ /* Get first opcode */
+ op = get_next(insn_byte_t, insn);
+ opcode->bytes[0] = op;
+ opcode->nbytes = 1;
+
+ /* Check if there is VEX prefix or not */
+ if (insn_is_avx(insn)) {
+ insn_byte_t m, p;
+ m = insn_vex_m_bits(insn);
+ p = insn_vex_p_bits(insn);
+ insn->attr = inat_get_avx_attribute(op, m, p);
+ if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
+ insn->attr = 0; /* This instruction is bad */
+ goto end; /* VEX has only 1 byte for opcode */
+ }
+
+ insn->attr = inat_get_opcode_attribute(op);
+ while (inat_is_escape(insn->attr)) {
+ /* Get escaped opcode */
+ op = get_next(insn_byte_t, insn);
+ opcode->bytes[opcode->nbytes++] = op;
+ pfx_id = insn_last_prefix_id(insn);
+ insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
+ }
+ if (inat_must_vex(insn->attr))
+ insn->attr = 0; /* This instruction is bad */
+end:
+ opcode->got = 1;
+
+err_out:
+ return;
+}
+
+/**
+ * insn_get_modrm - collect ModRM byte, if any
+ * @insn: &struct insn containing instruction
+ *
+ * Populates @insn->modrm and updates @insn->next_byte to point past the
+ * ModRM byte, if any. If necessary, first collects the preceding bytes
+ * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
+ */
+void insn_get_modrm(struct insn *insn)
+{
+ struct insn_field *modrm = &insn->modrm;
+ insn_byte_t pfx_id, mod;
+ if (modrm->got)
+ return;
+ if (!insn->opcode.got)
+ insn_get_opcode(insn);
+
+ if (inat_has_modrm(insn->attr)) {
+ mod = get_next(insn_byte_t, insn);
+ modrm->value = mod;
+ modrm->nbytes = 1;
+ if (inat_is_group(insn->attr)) {
+ pfx_id = insn_last_prefix_id(insn);
+ insn->attr = inat_get_group_attribute(mod, pfx_id,
+ insn->attr);
+ if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
+ insn->attr = 0; /* This is bad */
+ }
+ }
+
+ if (insn->x86_64 && inat_is_force64(insn->attr))
+ insn->opnd_bytes = 8;
+ modrm->got = 1;
+
+err_out:
+ return;
+}
+
+
+/**
+ * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte. No effect if @insn->x86_64 is 0.
+ */
+int insn_rip_relative(struct insn *insn)
+{
+ struct insn_field *modrm = &insn->modrm;
+
+ if (!insn->x86_64)
+ return 0;
+ if (!modrm->got)
+ insn_get_modrm(insn);
+ /*
+ * For rip-relative instructions, the mod field (top 2 bits)
+ * is zero and the r/m field (bottom 3 bits) is 0x5.
+ */
+ return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+}
+
+/**
+ * insn_get_sib() - Get the SIB byte of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.
+ */
+void insn_get_sib(struct insn *insn)
+{
+ insn_byte_t modrm;
+
+ if (insn->sib.got)
+ return;
+ if (!insn->modrm.got)
+ insn_get_modrm(insn);
+ if (insn->modrm.nbytes) {
+ modrm = (insn_byte_t)insn->modrm.value;
+ if (insn->addr_bytes != 2 &&
+ X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
+ insn->sib.value = get_next(insn_byte_t, insn);
+ insn->sib.nbytes = 1;
+ }
+ }
+ insn->sib.got = 1;
+
+err_out:
+ return;
+}
+
+
+/**
+ * insn_get_displacement() - Get the displacement of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * SIB byte.
+ * Displacement value is sign-expanded.
+ */
+void insn_get_displacement(struct insn *insn)
+{
+ insn_byte_t mod, rm, base;
+
+ if (insn->displacement.got)
+ return;
+ if (!insn->sib.got)
+ insn_get_sib(insn);
+ if (insn->modrm.nbytes) {
+ /*
+ * Interpreting the modrm byte:
+ * mod = 00 - no displacement fields (exceptions below)
+ * mod = 01 - 1-byte displacement field
+ * mod = 10 - displacement field is 4 bytes, or 2 bytes if
+ * address size = 2 (0x67 prefix in 32-bit mode)
+ * mod = 11 - no memory operand
+ *
+ * If address size = 2...
+ * mod = 00, r/m = 110 - displacement field is 2 bytes
+ *
+ * If address size != 2...
+ * mod != 11, r/m = 100 - SIB byte exists
+ * mod = 00, SIB base = 101 - displacement field is 4 bytes
+ * mod = 00, r/m = 101 - rip-relative addressing, displacement
+ * field is 4 bytes
+ */
+ mod = X86_MODRM_MOD(insn->modrm.value);
+ rm = X86_MODRM_RM(insn->modrm.value);
+ base = X86_SIB_BASE(insn->sib.value);
+ if (mod == 3)
+ goto out;
+ if (mod == 1) {
+ insn->displacement.value = get_next(char, insn);
+ insn->displacement.nbytes = 1;
+ } else if (insn->addr_bytes == 2) {
+ if ((mod == 0 && rm == 6) || mod == 2) {
+ insn->displacement.value =
+ get_next(short, insn);
+ insn->displacement.nbytes = 2;
+ }
+ } else {
+ if ((mod == 0 && rm == 5) || mod == 2 ||
+ (mod == 0 && base == 5)) {
+ insn->displacement.value = get_next(int, insn);
+ insn->displacement.nbytes = 4;
+ }
+ }
+ }
+out:
+ insn->displacement.got = 1;
+
+err_out:
+ return;
+}
+
+/* Decode moffset16/32/64. Return 0 if failed */
+static int __get_moffset(struct insn *insn)
+{
+ switch (insn->addr_bytes) {
+ case 2:
+ insn->moffset1.value = get_next(short, insn);
+ insn->moffset1.nbytes = 2;
+ break;
+ case 4:
+ insn->moffset1.value = get_next(int, insn);
+ insn->moffset1.nbytes = 4;
+ break;
+ case 8:
+ insn->moffset1.value = get_next(int, insn);
+ insn->moffset1.nbytes = 4;
+ insn->moffset2.value = get_next(int, insn);
+ insn->moffset2.nbytes = 4;
+ break;
+ default: /* opnd_bytes must be modified manually */
+ goto err_out;
+ }
+ insn->moffset1.got = insn->moffset2.got = 1;
+
+ return 1;
+
+err_out:
+ return 0;
+}
+
+/* Decode imm v32(Iz). Return 0 if failed */
+static int __get_immv32(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate.value = get_next(short, insn);
+ insn->immediate.nbytes = 2;
+ break;
+ case 4:
+ case 8:
+ insn->immediate.value = get_next(int, insn);
+ insn->immediate.nbytes = 4;
+ break;
+ default: /* opnd_bytes must be modified manually */
+ goto err_out;
+ }
+
+ return 1;
+
+err_out:
+ return 0;
+}
+
+/* Decode imm v64(Iv/Ov), Return 0 if failed */
+static int __get_immv(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate1.value = get_next(short, insn);
+ insn->immediate1.nbytes = 2;
+ break;
+ case 4:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ break;
+ case 8:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ insn->immediate2.value = get_next(int, insn);
+ insn->immediate2.nbytes = 4;
+ break;
+ default: /* opnd_bytes must be modified manually */
+ goto err_out;
+ }
+ insn->immediate1.got = insn->immediate2.got = 1;
+
+ return 1;
+err_out:
+ return 0;
+}
+
+/* Decode ptr16:16/32(Ap) */
+static int __get_immptr(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate1.value = get_next(short, insn);
+ insn->immediate1.nbytes = 2;
+ break;
+ case 4:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ break;
+ case 8:
+ /* ptr16:64 is not exist (no segment) */
+ return 0;
+ default: /* opnd_bytes must be modified manually */
+ goto err_out;
+ }
+ insn->immediate2.value = get_next(unsigned short, insn);
+ insn->immediate2.nbytes = 2;
+ insn->immediate1.got = insn->immediate2.got = 1;
+
+ return 1;
+err_out:
+ return 0;
+}
+
+/**
+ * insn_get_immediate() - Get the immediates of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * displacement bytes.
+ * Basically, most of immediates are sign-expanded. Unsigned-value can be
+ * get by bit masking with ((1 << (nbytes * 8)) - 1)
+ */
+void insn_get_immediate(struct insn *insn)
+{
+ if (insn->immediate.got)
+ return;
+ if (!insn->displacement.got)
+ insn_get_displacement(insn);
+
+ if (inat_has_moffset(insn->attr)) {
+ if (!__get_moffset(insn))
+ goto err_out;
+ goto done;
+ }
+
+ if (!inat_has_immediate(insn->attr))
+ /* no immediates */
+ goto done;
+
+ switch (inat_immediate_size(insn->attr)) {
+ case INAT_IMM_BYTE:
+ insn->immediate.value = get_next(char, insn);
+ insn->immediate.nbytes = 1;
+ break;
+ case INAT_IMM_WORD:
+ insn->immediate.value = get_next(short, insn);
+ insn->immediate.nbytes = 2;
+ break;
+ case INAT_IMM_DWORD:
+ insn->immediate.value = get_next(int, insn);
+ insn->immediate.nbytes = 4;
+ break;
+ case INAT_IMM_QWORD:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ insn->immediate2.value = get_next(int, insn);
+ insn->immediate2.nbytes = 4;
+ break;
+ case INAT_IMM_PTR:
+ if (!__get_immptr(insn))
+ goto err_out;
+ break;
+ case INAT_IMM_VWORD32:
+ if (!__get_immv32(insn))
+ goto err_out;
+ break;
+ case INAT_IMM_VWORD:
+ if (!__get_immv(insn))
+ goto err_out;
+ break;
+ default:
+ /* Here, insn must have an immediate, but failed */
+ goto err_out;
+ }
+ if (inat_has_second_immediate(insn->attr)) {
+ insn->immediate2.value = get_next(char, insn);
+ insn->immediate2.nbytes = 1;
+ }
+done:
+ insn->immediate.got = 1;
+
+err_out:
+ return;
+}
+
+/**
+ * insn_get_length() - Get the length of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * immediates bytes.
+ */
+void insn_get_length(struct insn *insn)
+{
+ if (insn->length)
+ return;
+ if (!insn->immediate.got)
+ insn_get_immediate(insn);
+ insn->length = (unsigned char)((unsigned long)insn->next_byte
+ - (unsigned long)insn->kaddr);
+}
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
deleted file mode 100644
index 3f1eb59b5f0..00000000000
--- a/arch/x86/lib/io_64.c
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <linux/string.h>
-#include <linux/module.h>
-#include <asm/io.h>
-
-void __memcpy_toio(unsigned long dst, const void *src, unsigned len)
-{
- __inline_memcpy((void *)dst, src, len);
-}
-EXPORT_SYMBOL(__memcpy_toio);
-
-void __memcpy_fromio(void *dst, unsigned long src, unsigned len)
-{
- __inline_memcpy(dst, (const void *)src, len);
-}
-EXPORT_SYMBOL(__memcpy_fromio);
-
-void memset_io(volatile void __iomem *a, int b, size_t c)
-{
- /*
- * TODO: memset can mangle the IO patterns quite a bit.
- * perhaps it would be better to use a dumb one:
- */
- memset((void *)a, b, c);
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 37756b6fb32..a404b4b7553 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -4,7 +4,7 @@
#undef memcpy
#undef memset
-void *memcpy(void *to, const void *from, size_t n)
+__visible void *memcpy(void *to, const void *from, size_t n)
{
#ifdef CONFIG_X86_USE_3DNOW
return __memcpy3d(to, from, n);
@@ -14,30 +14,195 @@ void *memcpy(void *to, const void *from, size_t n)
}
EXPORT_SYMBOL(memcpy);
-void *memset(void *s, int c, size_t count)
+__visible void *memset(void *s, int c, size_t count)
{
return __memset(s, c, count);
}
EXPORT_SYMBOL(memset);
-void *memmove(void *dest, const void *src, size_t n)
+__visible void *memmove(void *dest, const void *src, size_t n)
{
- int d0, d1, d2;
-
- if (dest < src) {
- memcpy(dest,src,n);
- } else {
- __asm__ __volatile__(
- "std\n\t"
- "rep\n\t"
- "movsb\n\t"
- "cld"
- : "=&c" (d0), "=&S" (d1), "=&D" (d2)
- :"0" (n),
- "1" (n-1+src),
- "2" (n-1+dest)
- :"memory");
- }
- return dest;
+ int d0,d1,d2,d3,d4,d5;
+ char *ret = dest;
+
+ __asm__ __volatile__(
+ /* Handle more 16 bytes in loop */
+ "cmp $0x10, %0\n\t"
+ "jb 1f\n\t"
+
+ /* Decide forward/backward copy mode */
+ "cmp %2, %1\n\t"
+ "jb 2f\n\t"
+
+ /*
+ * movs instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ "cmp $680, %0\n\t"
+ "jb 3f\n\t"
+ /*
+ * movs instruction is only good for aligned case.
+ */
+ "mov %1, %3\n\t"
+ "xor %2, %3\n\t"
+ "and $0xff, %3\n\t"
+ "jz 4f\n\t"
+ "3:\n\t"
+ "sub $0x10, %0\n\t"
+
+ /*
+ * We gobble 16 bytes forward in each loop.
+ */
+ "3:\n\t"
+ "sub $0x10, %0\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov 1*4(%1), %4\n\t"
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, 1*4(%2)\n\t"
+ "mov 2*4(%1), %3\n\t"
+ "mov 3*4(%1), %4\n\t"
+ "mov %3, 2*4(%2)\n\t"
+ "mov %4, 3*4(%2)\n\t"
+ "lea 0x10(%1), %1\n\t"
+ "lea 0x10(%2), %2\n\t"
+ "jae 3b\n\t"
+ "add $0x10, %0\n\t"
+ "jmp 1f\n\t"
+
+ /*
+ * Handle data forward by movs.
+ */
+ ".p2align 4\n\t"
+ "4:\n\t"
+ "mov -4(%1, %0), %3\n\t"
+ "lea -4(%2, %0), %4\n\t"
+ "shr $2, %0\n\t"
+ "rep movsl\n\t"
+ "mov %3, (%4)\n\t"
+ "jmp 11f\n\t"
+ /*
+ * Handle data backward by movs.
+ */
+ ".p2align 4\n\t"
+ "6:\n\t"
+ "mov (%1), %3\n\t"
+ "mov %2, %4\n\t"
+ "lea -4(%1, %0), %1\n\t"
+ "lea -4(%2, %0), %2\n\t"
+ "shr $2, %0\n\t"
+ "std\n\t"
+ "rep movsl\n\t"
+ "mov %3,(%4)\n\t"
+ "cld\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ ".p2align 4\n\t"
+ "2:\n\t"
+ "cmp $680, %0\n\t"
+ "jb 5f\n\t"
+ "mov %1, %3\n\t"
+ "xor %2, %3\n\t"
+ "and $0xff, %3\n\t"
+ "jz 6b\n\t"
+
+ /*
+ * Calculate copy position to tail.
+ */
+ "5:\n\t"
+ "add %0, %1\n\t"
+ "add %0, %2\n\t"
+ "sub $0x10, %0\n\t"
+
+ /*
+ * We gobble 16 bytes backward in each loop.
+ */
+ "7:\n\t"
+ "sub $0x10, %0\n\t"
+
+ "mov -1*4(%1), %3\n\t"
+ "mov -2*4(%1), %4\n\t"
+ "mov %3, -1*4(%2)\n\t"
+ "mov %4, -2*4(%2)\n\t"
+ "mov -3*4(%1), %3\n\t"
+ "mov -4*4(%1), %4\n\t"
+ "mov %3, -3*4(%2)\n\t"
+ "mov %4, -4*4(%2)\n\t"
+ "lea -0x10(%1), %1\n\t"
+ "lea -0x10(%2), %2\n\t"
+ "jae 7b\n\t"
+ /*
+ * Calculate copy position to head.
+ */
+ "add $0x10, %0\n\t"
+ "sub %0, %1\n\t"
+ "sub %0, %2\n\t"
+
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ ".p2align 4\n\t"
+ "1:\n\t"
+ "cmp $8, %0\n\t"
+ "jb 8f\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov 1*4(%1), %4\n\t"
+ "mov -2*4(%1, %0), %5\n\t"
+ "mov -1*4(%1, %0), %1\n\t"
+
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, 1*4(%2)\n\t"
+ "mov %5, -2*4(%2, %0)\n\t"
+ "mov %1, -1*4(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ ".p2align 4\n\t"
+ "8:\n\t"
+ "cmp $4, %0\n\t"
+ "jb 9f\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov -1*4(%1, %0), %4\n\t"
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, -1*4(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ ".p2align 4\n\t"
+ "9:\n\t"
+ "cmp $2, %0\n\t"
+ "jb 10f\n\t"
+ "movw 0*2(%1), %%dx\n\t"
+ "movw -1*2(%1, %0), %%bx\n\t"
+ "movw %%dx, 0*2(%2)\n\t"
+ "movw %%bx, -1*2(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data for 1 byte.
+ */
+ ".p2align 4\n\t"
+ "10:\n\t"
+ "cmp $1, %0\n\t"
+ "jb 11f\n\t"
+ "movb (%1), %%cl\n\t"
+ "movb %%cl, (%2)\n\t"
+ ".p2align 4\n\t"
+ "11:"
+ : "=&c" (d0), "=&S" (d1), "=&D" (d2),
+ "=r" (d3),"=r" (d4), "=r"(d5)
+ :"0" (n),
+ "1" (src),
+ "2" (dest)
+ :"memory");
+
+ return ret;
+
}
EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3..56313a32618 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,131 +1,206 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* memcpy - Copy a memory block.
*
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
* Output:
* rax original destination
- */
+ */
- ALIGN
-memcpy_c:
- CFI_STARTPROC
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $3,%ecx
- andl $7,%edx
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c:
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ shrq $3, %rcx
+ andl $7, %edx
rep movsq
- movl %edx,%ecx
+ movl %edx, %ecx
rep movsb
ret
- CFI_ENDPROC
-ENDPROC(memcpy_c)
+.Lmemcpy_e:
+ .previous
+
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ rep movsb
+ ret
+.Lmemcpy_e_e:
+ .previous
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
- pushq %rbx
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbx, 0
- movq %rdi,%rax
-
- movl %edx,%ecx
- shrl $6,%ecx
- jz .Lhandle_tail
-
+ movq %rdi, %rax
+
+ cmpq $0x20, %rdx
+ jb .Lhandle_tail
+
+ /*
+ * We check whether memory false dependence could occur,
+ * then jump to corresponding copy mode.
+ */
+ cmp %dil, %sil
+ jl .Lcopy_backward
+ subq $0x20, %rdx
+.Lcopy_forward_loop:
+ subq $0x20, %rdx
+
+ /*
+ * Move in blocks of 4x8 bytes:
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq 2*8(%rsi), %r10
+ movq 3*8(%rsi), %r11
+ leaq 4*8(%rsi), %rsi
+
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, 2*8(%rdi)
+ movq %r11, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae .Lcopy_forward_loop
+ addl $0x20, %edx
+ jmp .Lhandle_tail
+
+.Lcopy_backward:
+ /*
+ * Calculate copy position to tail.
+ */
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * At most 3 ALU operations in one cycle,
+ * so append NOPS in the same 16 bytes trunk.
+ */
.p2align 4
-.Lloop_64:
- decl %ecx
-
- movq (%rsi),%r11
- movq 8(%rsi),%r8
-
- movq %r11,(%rdi)
- movq %r8,1*8(%rdi)
-
- movq 2*8(%rsi),%r9
- movq 3*8(%rsi),%r10
-
- movq %r9,2*8(%rdi)
- movq %r10,3*8(%rdi)
-
- movq 4*8(%rsi),%r11
- movq 5*8(%rsi),%r8
-
- movq %r11,4*8(%rdi)
- movq %r8,5*8(%rdi)
-
- movq 6*8(%rsi),%r9
- movq 7*8(%rsi),%r10
-
- movq %r9,6*8(%rdi)
- movq %r10,7*8(%rdi)
-
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- jnz .Lloop_64
-
+.Lcopy_backward_loop:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r8
+ movq -2*8(%rsi), %r9
+ movq -3*8(%rsi), %r10
+ movq -4*8(%rsi), %r11
+ leaq -4*8(%rsi), %rsi
+ movq %r8, -1*8(%rdi)
+ movq %r9, -2*8(%rdi)
+ movq %r10, -3*8(%rdi)
+ movq %r11, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae .Lcopy_backward_loop
+
+ /*
+ * Calculate copy position to head.
+ */
+ addl $0x20, %edx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
.Lhandle_tail:
- movl %edx,%ecx
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lhandle_7
+ cmpl $16, %edx
+ jb .Lless_16bytes
+
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq -2*8(%rsi, %rdx), %r10
+ movq -1*8(%rsi, %rdx), %r11
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, -2*8(%rdi, %rdx)
+ movq %r11, -1*8(%rdi, %rdx)
+ retq
.p2align 4
-.Lloop_8:
- decl %ecx
- movq (%rsi),%r8
- movq %r8,(%rdi)
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
+.Lless_16bytes:
+ cmpl $8, %edx
+ jb .Lless_8bytes
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq -1*8(%rsi, %rdx), %r9
+ movq %r8, 0*8(%rdi)
+ movq %r9, -1*8(%rdi, %rdx)
+ retq
.p2align 4
-.Lloop_1:
- movb (%rsi),%r8b
- movb %r8b,(%rdi)
- incq %rdi
- incq %rsi
- decl %ecx
- jnz .Lloop_1
-
-.Lende:
- popq %rbx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rbx
- ret
-.Lfinal:
+.Lless_8bytes:
+ cmpl $4, %edx
+ jb .Lless_3bytes
+
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %ecx
+ movl -4(%rsi, %rdx), %r8d
+ movl %ecx, (%rdi)
+ movl %r8d, -4(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_3bytes:
+ subl $1, %edx
+ jb .Lend
+ /*
+ * Move data from 1 bytes to 3 bytes.
+ */
+ movzbl (%rsi), %ecx
+ jz .Lstore_1byte
+ movzbq 1(%rsi), %r8
+ movzbq (%rsi, %rdx), %r9
+ movb %r8b, 1(%rdi)
+ movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+ movb %cl, (%rdi)
+
+.Lend:
+ retq
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
-2:
- .previous
- .section .altinstructions,"a"
- .align 8
- .quad memcpy
- .quad 1b
- .byte X86_FEATURE_REP_GOOD
- /* Replace only beginning, memcpy is used to apply alternatives, so it
- * is silly to overwrite itself with nops - reboot is only outcome... */
- .byte 2b - 1b
- .byte 2b - 1b
+ /*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use fast string copy
+ * memcpy_c() when possible. This is faster and code is simpler than
+ * original memcpy().
+ * Otherwise, original memcpy() is used.
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ *
+ * Replace only beginning, memcpy is used to apply alternatives,
+ * so it is silly to overwrite itself with nops - reboot is the
+ * only outcome...
+ */
+ .section .altinstructions, "a"
+ altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+ altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 00000000000..65268a6104f
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,223 @@
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#define _STRING_C
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+
+#undef memmove
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+ENTRY(memmove)
+ CFI_STARTPROC
+
+ /* Handle more 32 bytes in loop */
+ mov %rdi, %rax
+ cmp $0x20, %rdx
+ jb 1f
+
+ /* Decide forward/backward copy mode */
+ cmp %rdi, %rsi
+ jge .Lmemmove_begin_forward
+ mov %rsi, %r8
+ add %rdx, %r8
+ cmp %rdi, %r8
+ jg 2f
+
+.Lmemmove_begin_forward:
+ /*
+ * movsq instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ cmp $680, %rdx
+ jb 3f
+ /*
+ * movsq instruction is only good for aligned case.
+ */
+
+ cmpb %dil, %sil
+ je 4f
+3:
+ sub $0x20, %rdx
+ /*
+ * We gobble 32 bytes forward in each loop.
+ */
+5:
+ sub $0x20, %rdx
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r10
+ movq 2*8(%rsi), %r9
+ movq 3*8(%rsi), %r8
+ leaq 4*8(%rsi), %rsi
+
+ movq %r11, 0*8(%rdi)
+ movq %r10, 1*8(%rdi)
+ movq %r9, 2*8(%rdi)
+ movq %r8, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae 5b
+ addq $0x20, %rdx
+ jmp 1f
+ /*
+ * Handle data forward by movsq.
+ */
+ .p2align 4
+4:
+ movq %rdx, %rcx
+ movq -8(%rsi, %rdx), %r11
+ lea -8(%rdi, %rdx), %r10
+ shrq $3, %rcx
+ rep movsq
+ movq %r11, (%r10)
+ jmp 13f
+.Lmemmove_end_forward:
+
+ /*
+ * Handle data backward by movsq.
+ */
+ .p2align 4
+7:
+ movq %rdx, %rcx
+ movq (%rsi), %r11
+ movq %rdi, %r10
+ leaq -8(%rsi, %rdx), %rsi
+ leaq -8(%rdi, %rdx), %rdi
+ shrq $3, %rcx
+ std
+ rep movsq
+ cld
+ movq %r11, (%r10)
+ jmp 13f
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ .p2align 4
+2:
+ cmp $680, %rdx
+ jb 6f
+ cmp %dil, %sil
+ je 7b
+6:
+ /*
+ * Calculate copy position to tail.
+ */
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * We gobble 32 bytes backward in each loop.
+ */
+8:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r11
+ movq -2*8(%rsi), %r10
+ movq -3*8(%rsi), %r9
+ movq -4*8(%rsi), %r8
+ leaq -4*8(%rsi), %rsi
+
+ movq %r11, -1*8(%rdi)
+ movq %r10, -2*8(%rdi)
+ movq %r9, -3*8(%rdi)
+ movq %r8, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae 8b
+ /*
+ * Calculate copy position to head.
+ */
+ addq $0x20, %rdx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
+1:
+ cmpq $16, %rdx
+ jb 9f
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r10
+ movq -2*8(%rsi, %rdx), %r9
+ movq -1*8(%rsi, %rdx), %r8
+ movq %r11, 0*8(%rdi)
+ movq %r10, 1*8(%rdi)
+ movq %r9, -2*8(%rdi, %rdx)
+ movq %r8, -1*8(%rdi, %rdx)
+ jmp 13f
+ .p2align 4
+9:
+ cmpq $8, %rdx
+ jb 10f
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r11
+ movq -1*8(%rsi, %rdx), %r10
+ movq %r11, 0*8(%rdi)
+ movq %r10, -1*8(%rdi, %rdx)
+ jmp 13f
+10:
+ cmpq $4, %rdx
+ jb 11f
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %r11d
+ movl -4(%rsi, %rdx), %r10d
+ movl %r11d, (%rdi)
+ movl %r10d, -4(%rdi, %rdx)
+ jmp 13f
+11:
+ cmp $2, %rdx
+ jb 12f
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ movw (%rsi), %r11w
+ movw -2(%rsi, %rdx), %r10w
+ movw %r11w, (%rdi)
+ movw %r10w, -2(%rdi, %rdx)
+ jmp 13f
+12:
+ cmp $1, %rdx
+ jb 13f
+ /*
+ * Move data for 1 byte.
+ */
+ movb (%rsi), %r11b
+ movb %r11b, (%rdi)
+13:
+ retq
+ CFI_ENDPROC
+
+ .section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+ /* Forward moving data. */
+ movq %rdx, %rcx
+ rep movsb
+ retq
+.Lmemmove_end_forward_efs:
+ .previous
+
+ .section .altinstructions,"a"
+ altinstruction_entry .Lmemmove_begin_forward, \
+ .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
+ .Lmemmove_end_forward-.Lmemmove_begin_forward, \
+ .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+ .previous
+ENDPROC(memmove)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 80175e47b19..00000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Normally compiler builtins are used, but sometimes the compiler calls out
- of line code. Based on asm-i386/string.h.
- */
-#define _STRING_C
-#include <linux/string.h>
-#include <linux/module.h>
-
-#undef memmove
-void *memmove(void * dest,const void *src,size_t count)
-{
- if (dest < src) {
- return memcpy(dest,src,count);
- } else {
- char *p = dest + count;
- const char *s = src + count;
- while (count--)
- *--p = *--s;
- }
- return dest;
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2c5948116bd..2dcb3808cbd 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
/*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
*
* rdi destination
* rsi value (char)
@@ -12,36 +16,55 @@
*
* rax original destination
*/
- ALIGN
-memset_c:
- CFI_STARTPROC
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemset_c:
movq %rdi,%r9
- movl %edx,%r8d
- andl $7,%r8d
- movl %edx,%ecx
- shrl $3,%ecx
+ movq %rdx,%rcx
+ andl $7,%edx
+ shrq $3,%rcx
/* expand byte value */
movzbl %sil,%esi
movabs $0x0101010101010101,%rax
- mulq %rsi /* with rax, clobbers rdx */
+ imulq %rsi,%rax
rep stosq
- movl %r8d,%ecx
+ movl %edx,%ecx
rep stosb
movq %r9,%rax
ret
- CFI_ENDPROC
-ENDPROC(memset_c)
+.Lmemset_e:
+ .previous
+
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+ movq %rdi,%r9
+ movb %sil,%al
+ movq %rdx,%rcx
+ rep stosb
+ movq %r9,%rax
+ ret
+.Lmemset_e_e:
+ .previous
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
movq %rdi,%r10
- movq %rdx,%r11
/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
- mul %rcx /* with rax, clobbers rdx */
+ imulq %rcx,%rax
/* align dst */
movl %edi,%r9d
@@ -50,13 +73,13 @@ ENTRY(__memset)
CFI_REMEMBER_STATE
.Lafter_bad_alignment:
- movl %r11d,%ecx
- shrl $6,%ecx
+ movq %rdx,%rcx
+ shrq $6,%rcx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
- decl %ecx
+ decq %rcx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
@@ -72,7 +95,7 @@ ENTRY(__memset)
to predict jump tables. */
.p2align 4
.Lhandle_tail:
- movl %r11d,%ecx
+ movl %edx,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
@@ -84,12 +107,11 @@ ENTRY(__memset)
jnz .Lloop_8
.Lhandle_7:
- movl %r11d,%ecx
- andl $7,%ecx
+ andl $7,%edx
jz .Lende
.p2align 4
.Lloop_1:
- decl %ecx
+ decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1
@@ -100,34 +122,33 @@ ENTRY(__memset)
CFI_RESTORE_STATE
.Lbad_alignment:
- cmpq $7,%r11
+ cmpq $7,%rdx
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
- subq %r8,%r11
+ subq %r8,%rdx
jmp .Lafter_bad_alignment
.Lfinal:
CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (memset_c - memset) - (2f - 1b) /* offset */
-2:
- .previous
+ /* Some CPUs support enhanced REP MOVSB/STOSB feature.
+ * It is recommended to use this when possible.
+ *
+ * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+ * instructions.
+ *
+ * Otherwise, use original memset function.
+ *
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ */
.section .altinstructions,"a"
- .align 8
- .quad memset
- .quad 1b
- .byte X86_FEATURE_REP_GOOD
- .byte .Lfinal - memset
- .byte 2b - 1b
+ altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+ .Lfinal-memset,.Lmemset_e-.Lmemset_c
+ altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+ .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
.previous
diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c
new file mode 100644
index 00000000000..76b373af03f
--- /dev/null
+++ b/arch/x86/lib/misc.c
@@ -0,0 +1,21 @@
+/*
+ * Count the digits of @val including a possible sign.
+ *
+ * (Typed on and submitted from hpa's mobile phone.)
+ */
+int num_digits(int val)
+{
+ int m = 10;
+ int d = 1;
+
+ if (val < 0) {
+ d++;
+ val = -val;
+ }
+
+ while (val >= m) {
+ m *= 10;
+ d++;
+ }
+ return d;
+}
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index cc9b4a4450f..c9f2d9ba8dd 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -1,32 +1,30 @@
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/sched.h>
-#include <linux/hardirq.h>
-#include <linux/module.h>
-
-#include <asm/asm.h>
-#include <asm/i387.h>
-
-
/*
* MMX 3DNow! library helper functions
*
* To do:
- * We can use MMX just for prefetch in IRQ's. This may be a win.
+ * We can use MMX just for prefetch in IRQ's. This may be a win.
* (reported so on K6-III)
* We should use a better code neutral filler for the short jump
* leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
* We also want to clobber the filler register so we don't get any
- * register forwarding stalls on the filler.
+ * register forwarding stalls on the filler.
*
* Add *user handling. Checksums are not a win with MMX on any CPU
* tested so far for any MMX solution figured.
*
- * 22/09/2000 - Arjan van de Ven
- * Improved for non-egineering-sample Athlons
+ * 22/09/2000 - Arjan van de Ven
+ * Improved for non-egineering-sample Athlons
*
*/
-
+#include <linux/hardirq.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include <asm/i387.h>
+#include <asm/asm.h>
+
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
void *p;
@@ -51,12 +49,10 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from) );
-
-
- for(; i>5; i--)
- {
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from));
+
+ for ( ; i > 5; i--) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
@@ -79,14 +75,14 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
}
- for(; i>0; i--)
- {
+ for ( ; i > 0; i--) {
__asm__ __volatile__ (
" movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
@@ -104,17 +100,20 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
" movq %%mm1, 40(%1)\n"
" movq %%mm2, 48(%1)\n"
" movq %%mm3, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
}
/*
- * Now do the tail of the block
+ * Now do the tail of the block:
*/
- __memcpy(to, from, len&63);
+ __memcpy(to, from, len & 63);
kernel_fpu_end();
+
return p;
}
+EXPORT_SYMBOL(_mmx_memcpy);
#ifdef CONFIG_MK7
@@ -128,13 +127,12 @@ static void fast_clear_page(void *page)
int i;
kernel_fpu_begin();
-
+
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);
- for(i=0;i<4096/64;i++)
- {
+ for (i = 0; i < 4096/64; i++) {
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
" movntq %%mm0, 8(%0)\n"
@@ -145,14 +143,15 @@ static void fast_clear_page(void *page)
" movntq %%mm0, 48(%0)\n"
" movntq %%mm0, 56(%0)\n"
: : "r" (page) : "memory");
- page+=64;
+ page += 64;
}
- /* since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again.
+
+ /*
+ * Since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again:
*/
- __asm__ __volatile__ (
- " sfence \n" : :
- );
+ __asm__ __volatile__("sfence\n"::);
+
kernel_fpu_end();
}
@@ -162,10 +161,11 @@ static void fast_copy_page(void *to, void *from)
kernel_fpu_begin();
- /* maybe the prefetch stuff can go before the expensive fnsave...
+ /*
+ * maybe the prefetch stuff can go before the expensive fnsave...
* but that is for later. -AV
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"1: prefetch (%0)\n"
" prefetch 64(%0)\n"
" prefetch 128(%0)\n"
@@ -176,11 +176,9 @@ static void fast_copy_page(void *to, void *from)
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from) );
+ _ASM_EXTABLE(1b, 3b) : : "r" (from));
- for(i=0; i<(4096-320)/64; i++)
- {
+ for (i = 0; i < (4096-320)/64; i++) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
@@ -203,13 +201,13 @@ static void fast_copy_page(void *to, void *from)
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
}
- for(i=(4096-320)/64; i<4096/64; i++)
- {
+
+ for (i = (4096-320)/64; i < 4096/64; i++) {
__asm__ __volatile__ (
"2: movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
@@ -227,37 +225,34 @@ static void fast_copy_page(void *to, void *from)
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ : : "r" (from), "r" (to) : "memory");
+ from += 64;
+ to += 64;
}
- /* since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again.
+ /*
+ * Since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again:
*/
- __asm__ __volatile__ (
- " sfence \n" : :
- );
+ __asm__ __volatile__("sfence \n"::);
kernel_fpu_end();
}
-#else
+#else /* CONFIG_MK7 */
/*
* Generic MMX implementation without K7 specific streaming
*/
-
static void fast_clear_page(void *page)
{
int i;
-
+
kernel_fpu_begin();
-
+
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);
- for(i=0;i<4096/128;i++)
- {
+ for (i = 0; i < 4096/128; i++) {
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
" movq %%mm0, 8(%0)\n"
@@ -275,8 +270,8 @@ static void fast_clear_page(void *page)
" movq %%mm0, 104(%0)\n"
" movq %%mm0, 112(%0)\n"
" movq %%mm0, 120(%0)\n"
- : : "r" (page) : "memory");
- page+=128;
+ : : "r" (page) : "memory");
+ page += 128;
}
kernel_fpu_end();
@@ -285,8 +280,7 @@ static void fast_clear_page(void *page)
static void fast_copy_page(void *to, void *from)
{
int i;
-
-
+
kernel_fpu_begin();
__asm__ __volatile__ (
@@ -300,11 +294,9 @@ static void fast_copy_page(void *to, void *from)
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from) );
+ _ASM_EXTABLE(1b, 3b) : : "r" (from));
- for(i=0; i<4096/64; i++)
- {
+ for (i = 0; i < 4096/64; i++) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
@@ -327,60 +319,59 @@ static void fast_copy_page(void *to, void *from)
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
}
kernel_fpu_end();
}
-
-#endif
+#endif /* !CONFIG_MK7 */
/*
- * Favour MMX for page clear and copy.
+ * Favour MMX for page clear and copy:
*/
-
-static void slow_zero_page(void * page)
+static void slow_zero_page(void *page)
{
int d0, d1;
- __asm__ __volatile__( \
- "cld\n\t" \
- "rep ; stosl" \
- : "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
- :"memory");
+
+ __asm__ __volatile__(
+ "cld\n\t"
+ "rep ; stosl"
+
+ : "=&c" (d0), "=&D" (d1)
+ :"a" (0), "1" (page), "0" (1024)
+ :"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void *page)
{
- if(unlikely(in_interrupt()))
+ if (unlikely(in_interrupt()))
slow_zero_page(page);
else
fast_clear_page(page);
}
+EXPORT_SYMBOL(mmx_clear_page);
static void slow_copy_page(void *to, void *from)
{
int d0, d1, d2;
- __asm__ __volatile__( \
- "cld\n\t" \
- "rep ; movsl" \
- : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
- : "0" (1024),"1" ((long) to),"2" ((long) from) \
+
+ __asm__ __volatile__(
+ "cld\n\t"
+ "rep ; movsl"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (1024), "1" ((long) to), "2" ((long) from)
: "memory");
}
-
void mmx_copy_page(void *to, void *from)
{
- if(unlikely(in_interrupt()))
+ if (unlikely(in_interrupt()))
slow_copy_page(to, from);
else
fast_copy_page(to, from);
}
-
-EXPORT_SYMBOL(_mmx_memcpy);
-EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
deleted file mode 100644
index 57d043fa893..00000000000
--- a/arch/x86/lib/msr-on-cpu.c
+++ /dev/null
@@ -1,101 +0,0 @@
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/smp.h>
-#include <asm/msr.h>
-
-struct msr_info {
- u32 msr_no;
- u32 l, h;
- int err;
-};
-
-static void __rdmsr_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rdmsr(rv->msr_no, rv->l, rv->h);
-}
-
-static void __rdmsr_safe_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
-}
-
-static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
-{
- int err = 0;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- if (safe) {
- smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1);
- err = rv.err;
- } else {
- smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1);
- }
- *l = rv.l;
- *h = rv.h;
-
- return err;
-}
-
-static void __wrmsr_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- wrmsr(rv->msr_no, rv->l, rv->h);
-}
-
-static void __wrmsr_safe_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
-}
-
-static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
-{
- int err = 0;
- struct msr_info rv;
-
- rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- if (safe) {
- smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1);
- err = rv.err;
- } else {
- smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1);
- }
-
- return err;
-}
-
-void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
-}
-
-void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
-int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
-}
-
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
-}
-
-EXPORT_SYMBOL(rdmsr_on_cpu);
-EXPORT_SYMBOL(wrmsr_on_cpu);
-EXPORT_SYMBOL(rdmsr_safe_on_cpu);
-EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c
new file mode 100644
index 00000000000..8d6ef78b5d0
--- /dev/null
+++ b/arch/x86/lib/msr-reg-export.c
@@ -0,0 +1,5 @@
+#include <linux/module.h>
+#include <asm/msr.h>
+
+EXPORT_SYMBOL(rdmsr_safe_regs);
+EXPORT_SYMBOL(wrmsr_safe_regs);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
new file mode 100644
index 00000000000..f6d13eefad1
--- /dev/null
+++ b/arch/x86/lib/msr-reg.S
@@ -0,0 +1,102 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/asm.h>
+#include <asm/msr.h>
+
+#ifdef CONFIG_X86_64
+/*
+ * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
+ *
+ * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
+ *
+ */
+.macro op_safe_regs op
+ENTRY(\op\()_safe_regs)
+ CFI_STARTPROC
+ pushq_cfi %rbx
+ pushq_cfi %rbp
+ movq %rdi, %r10 /* Save pointer */
+ xorl %r11d, %r11d /* Return value */
+ movl (%rdi), %eax
+ movl 4(%rdi), %ecx
+ movl 8(%rdi), %edx
+ movl 12(%rdi), %ebx
+ movl 20(%rdi), %ebp
+ movl 24(%rdi), %esi
+ movl 28(%rdi), %edi
+ CFI_REMEMBER_STATE
+1: \op
+2: movl %eax, (%r10)
+ movl %r11d, %eax /* Return value */
+ movl %ecx, 4(%r10)
+ movl %edx, 8(%r10)
+ movl %ebx, 12(%r10)
+ movl %ebp, 20(%r10)
+ movl %esi, 24(%r10)
+ movl %edi, 28(%r10)
+ popq_cfi %rbp
+ popq_cfi %rbx
+ ret
+3:
+ CFI_RESTORE_STATE
+ movl $-EIO, %r11d
+ jmp 2b
+
+ _ASM_EXTABLE(1b, 3b)
+ CFI_ENDPROC
+ENDPROC(\op\()_safe_regs)
+.endm
+
+#else /* X86_32 */
+
+.macro op_safe_regs op
+ENTRY(\op\()_safe_regs)
+ CFI_STARTPROC
+ pushl_cfi %ebx
+ pushl_cfi %ebp
+ pushl_cfi %esi
+ pushl_cfi %edi
+ pushl_cfi $0 /* Return value */
+ pushl_cfi %eax
+ movl 4(%eax), %ecx
+ movl 8(%eax), %edx
+ movl 12(%eax), %ebx
+ movl 20(%eax), %ebp
+ movl 24(%eax), %esi
+ movl 28(%eax), %edi
+ movl (%eax), %eax
+ CFI_REMEMBER_STATE
+1: \op
+2: pushl_cfi %eax
+ movl 4(%esp), %eax
+ popl_cfi (%eax)
+ addl $4, %esp
+ CFI_ADJUST_CFA_OFFSET -4
+ movl %ecx, 4(%eax)
+ movl %edx, 8(%eax)
+ movl %ebx, 12(%eax)
+ movl %ebp, 20(%eax)
+ movl %esi, 24(%eax)
+ movl %edi, 28(%eax)
+ popl_cfi %eax
+ popl_cfi %edi
+ popl_cfi %esi
+ popl_cfi %ebp
+ popl_cfi %ebx
+ ret
+3:
+ CFI_RESTORE_STATE
+ movl $-EIO, 4(%esp)
+ jmp 2b
+
+ _ASM_EXTABLE(1b, 3b)
+ CFI_ENDPROC
+ENDPROC(\op\()_safe_regs)
+.endm
+
+#endif
+
+op_safe_regs rdmsr
+op_safe_regs wrmsr
+
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
new file mode 100644
index 00000000000..518532e6a3f
--- /dev/null
+++ b/arch/x86/lib/msr-smp.c
@@ -0,0 +1,266 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/msr.h>
+
+static void __rdmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
+ else
+ reg = &rv->reg;
+
+ rdmsr(rv->msr_no, reg->l, reg->h);
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
+ else
+ reg = &rv->reg;
+
+ wrmsr(rv->msr_no, reg->l, reg->h);
+}
+
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsr_on_cpu);
+
+int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ *q = rv.reg.q;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsrl_on_cpu);
+
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+EXPORT_SYMBOL(wrmsr_on_cpu);
+
+int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.q = q;
+
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+EXPORT_SYMBOL(wrmsrl_on_cpu);
+
+static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
+ struct msr *msrs,
+ void (*msr_func) (void *info))
+{
+ struct msr_info rv;
+ int this_cpu;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msrs = msrs;
+ rv.msr_no = msr_no;
+
+ this_cpu = get_cpu();
+
+ if (cpumask_test_cpu(this_cpu, mask))
+ msr_func(&rv);
+
+ smp_call_function_many(mask, msr_func, &rv, 1);
+ put_cpu();
+}
+
+/* rdmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
+{
+ __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
+}
+EXPORT_SYMBOL(rdmsr_on_cpus);
+
+/*
+ * wrmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
+{
+ __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
+}
+EXPORT_SYMBOL(wrmsr_on_cpus);
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
+}
+
+static void __wrmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
+}
+
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsr_safe_on_cpu);
+
+int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.q = q;
+
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsrl_safe_on_cpu);
+
+int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+ *q = rv.reg.q;
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsrl_safe_on_cpu);
+
+/*
+ * These variants are significantly slower, but allows control over
+ * the entire 32-bit GPR set.
+ */
+static void __rdmsr_safe_regs_on_cpu(void *info)
+{
+ struct msr_regs_info *rv = info;
+
+ rv->err = rdmsr_safe_regs(rv->regs);
+}
+
+static void __wrmsr_safe_regs_on_cpu(void *info)
+{
+ struct msr_regs_info *rv = info;
+
+ rv->err = wrmsr_safe_regs(rv->regs);
+}
+
+int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+{
+ int err;
+ struct msr_regs_info rv;
+
+ rv.regs = regs;
+ rv.err = -EIO;
+ err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
+
+int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+{
+ int err;
+ struct msr_regs_info rv;
+
+ rv.regs = regs;
+ rv.err = -EIO;
+ err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 00000000000..43623739c7c
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,110 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <asm/msr.h>
+
+struct msr *msrs_alloc(void)
+{
+ struct msr *msrs = NULL;
+
+ msrs = alloc_percpu(struct msr);
+ if (!msrs) {
+ pr_warn("%s: error allocating msrs\n", __func__);
+ return NULL;
+ }
+
+ return msrs;
+}
+EXPORT_SYMBOL(msrs_alloc);
+
+void msrs_free(struct msr *msrs)
+{
+ free_percpu(msrs);
+}
+EXPORT_SYMBOL(msrs_free);
+
+/**
+ * Read an MSR with error handling
+ *
+ * @msr: MSR to read
+ * @m: value to read into
+ *
+ * It returns read data only on success, otherwise it doesn't change the output
+ * argument @m.
+ *
+ */
+int msr_read(u32 msr, struct msr *m)
+{
+ int err;
+ u64 val;
+
+ err = rdmsrl_safe(msr, &val);
+ if (!err)
+ m->q = val;
+
+ return err;
+}
+
+/**
+ * Write an MSR with error handling
+ *
+ * @msr: MSR to write
+ * @m: value to write
+ */
+int msr_write(u32 msr, struct msr *m)
+{
+ return wrmsrl_safe(msr, m->q);
+}
+
+static inline int __flip_bit(u32 msr, u8 bit, bool set)
+{
+ struct msr m, m1;
+ int err = -EINVAL;
+
+ if (bit > 63)
+ return err;
+
+ err = msr_read(msr, &m);
+ if (err)
+ return err;
+
+ m1 = m;
+ if (set)
+ m1.q |= BIT_64(bit);
+ else
+ m1.q &= ~BIT_64(bit);
+
+ if (m1.q == m.q)
+ return 0;
+
+ err = msr_write(msr, &m1);
+ if (err)
+ return err;
+
+ return 1;
+}
+
+/**
+ * Set @bit in a MSR @msr.
+ *
+ * Retval:
+ * < 0: An error was encountered.
+ * = 0: Bit was already set.
+ * > 0: Hardware accepted the MSR write.
+ */
+int msr_set_bit(u32 msr, u8 bit)
+{
+ return __flip_bit(msr, bit, true);
+}
+
+/**
+ * Clear @bit in a MSR @msr.
+ *
+ * Retval:
+ * < 0: An error was encountered.
+ * = 0: Bit was already cleared.
+ * > 0: Hardware accepted the MSR write.
+ */
+int msr_clear_bit(u32 msr, u8 bit)
+{
+ return __flip_bit(msr, bit, false);
+}
diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser.S
index f58fba109d1..fc6ba17a7ee 100644
--- a/arch/x86/lib/putuser_32.S
+++ b/arch/x86/lib/putuser.S
@@ -2,6 +2,8 @@
* __put_user functions.
*
* (C) Copyright 2005 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ * (C) Copyright 2008 Glauber Costa
*
* These functions have a non-standard call interface
* to make them more efficient, especially as they
@@ -11,6 +13,9 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/thread_info.h>
+#include <asm/errno.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
/*
@@ -26,73 +31,71 @@
*/
#define ENTER CFI_STARTPROC ; \
- pushl %ebx ; \
- CFI_ADJUST_CFA_OFFSET 4 ; \
- CFI_REL_OFFSET ebx, 0 ; \
- GET_THREAD_INFO(%ebx)
-#define EXIT popl %ebx ; \
- CFI_ADJUST_CFA_OFFSET -4 ; \
- CFI_RESTORE ebx ; \
- ret ; \
+ GET_THREAD_INFO(%_ASM_BX)
+#define EXIT ASM_CLAC ; \
+ ret ; \
CFI_ENDPROC
.text
ENTRY(__put_user_1)
ENTER
- cmpl TI_addr_limit(%ebx),%ecx
+ cmp TI_addr_limit(%_ASM_BX),%_ASM_CX
jae bad_put_user
-1: movb %al,(%ecx)
- xorl %eax,%eax
+ ASM_STAC
+1: movb %al,(%_ASM_CX)
+ xor %eax,%eax
EXIT
ENDPROC(__put_user_1)
ENTRY(__put_user_2)
ENTER
- movl TI_addr_limit(%ebx),%ebx
- subl $1,%ebx
- cmpl %ebx,%ecx
+ mov TI_addr_limit(%_ASM_BX),%_ASM_BX
+ sub $1,%_ASM_BX
+ cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
-2: movw %ax,(%ecx)
- xorl %eax,%eax
+ ASM_STAC
+2: movw %ax,(%_ASM_CX)
+ xor %eax,%eax
EXIT
ENDPROC(__put_user_2)
ENTRY(__put_user_4)
ENTER
- movl TI_addr_limit(%ebx),%ebx
- subl $3,%ebx
- cmpl %ebx,%ecx
+ mov TI_addr_limit(%_ASM_BX),%_ASM_BX
+ sub $3,%_ASM_BX
+ cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
-3: movl %eax,(%ecx)
- xorl %eax,%eax
+ ASM_STAC
+3: movl %eax,(%_ASM_CX)
+ xor %eax,%eax
EXIT
ENDPROC(__put_user_4)
ENTRY(__put_user_8)
ENTER
- movl TI_addr_limit(%ebx),%ebx
- subl $7,%ebx
- cmpl %ebx,%ecx
+ mov TI_addr_limit(%_ASM_BX),%_ASM_BX
+ sub $7,%_ASM_BX
+ cmp %_ASM_BX,%_ASM_CX
jae bad_put_user
-4: movl %eax,(%ecx)
-5: movl %edx,4(%ecx)
- xorl %eax,%eax
+ ASM_STAC
+4: mov %_ASM_AX,(%_ASM_CX)
+#ifdef CONFIG_X86_32
+5: movl %edx,4(%_ASM_CX)
+#endif
+ xor %eax,%eax
EXIT
ENDPROC(__put_user_8)
bad_put_user:
- CFI_STARTPROC simple
- CFI_DEF_CFA esp, 2*4
- CFI_OFFSET eip, -1*4
- CFI_OFFSET ebx, -2*4
- movl $-14,%eax
+ CFI_STARTPROC
+ movl $-EFAULT,%eax
EXIT
END(bad_put_user)
-.section __ex_table,"a"
- .long 1b,bad_put_user
- .long 2b,bad_put_user
- .long 3b,bad_put_user
- .long 4b,bad_put_user
- .long 5b,bad_put_user
-.previous
+ _ASM_EXTABLE(1b,bad_put_user)
+ _ASM_EXTABLE(2b,bad_put_user)
+ _ASM_EXTABLE(3b,bad_put_user)
+ _ASM_EXTABLE(4b,bad_put_user)
+#ifdef CONFIG_X86_32
+ _ASM_EXTABLE(5b,bad_put_user)
+#endif
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
deleted file mode 100644
index 4989f5a8fa9..00000000000
--- a/arch/x86/lib/putuser_64.S
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * __put_user functions.
- *
- * (C) Copyright 1998 Linus Torvalds
- * (C) Copyright 2005 Andi Kleen
- *
- * These functions have a non-standard call interface
- * to make them more efficient, especially as they
- * return an error value in addition to the "real"
- * return value.
- */
-
-/*
- * __put_user_X
- *
- * Inputs: %rcx contains the address
- * %rdx contains new value
- *
- * Outputs: %rax is error code (0 or -EFAULT)
- *
- * %r8 is destroyed.
- *
- * These functions should not modify any other registers,
- * as they get called from within inline assembly.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/page.h>
-#include <asm/errno.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-
- .text
-ENTRY(__put_user_1)
- CFI_STARTPROC
- GET_THREAD_INFO(%r8)
- cmpq threadinfo_addr_limit(%r8),%rcx
- jae bad_put_user
-1: movb %dl,(%rcx)
- xorl %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__put_user_1)
-
-ENTRY(__put_user_2)
- CFI_STARTPROC
- GET_THREAD_INFO(%r8)
- addq $1,%rcx
- jc 20f
- cmpq threadinfo_addr_limit(%r8),%rcx
- jae 20f
- decq %rcx
-2: movw %dx,(%rcx)
- xorl %eax,%eax
- ret
-20: decq %rcx
- jmp bad_put_user
- CFI_ENDPROC
-ENDPROC(__put_user_2)
-
-ENTRY(__put_user_4)
- CFI_STARTPROC
- GET_THREAD_INFO(%r8)
- addq $3,%rcx
- jc 30f
- cmpq threadinfo_addr_limit(%r8),%rcx
- jae 30f
- subq $3,%rcx
-3: movl %edx,(%rcx)
- xorl %eax,%eax
- ret
-30: subq $3,%rcx
- jmp bad_put_user
- CFI_ENDPROC
-ENDPROC(__put_user_4)
-
-ENTRY(__put_user_8)
- CFI_STARTPROC
- GET_THREAD_INFO(%r8)
- addq $7,%rcx
- jc 40f
- cmpq threadinfo_addr_limit(%r8),%rcx
- jae 40f
- subq $7,%rcx
-4: movq %rdx,(%rcx)
- xorl %eax,%eax
- ret
-40: subq $7,%rcx
- jmp bad_put_user
- CFI_ENDPROC
-ENDPROC(__put_user_8)
-
-bad_put_user:
- CFI_STARTPROC
- movq $(-EFAULT),%rax
- ret
- CFI_ENDPROC
-END(bad_put_user)
-
-.section __ex_table,"a"
- .quad 1b,bad_put_user
- .quad 2b,bad_put_user
- .quad 3b,bad_put_user
- .quad 4b,bad_put_user
-.previous
diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S
new file mode 100644
index 00000000000..1cad22139c8
--- /dev/null
+++ b/arch/x86/lib/rwlock.S
@@ -0,0 +1,44 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/rwlock.h>
+
+#ifdef CONFIG_X86_32
+# define __lock_ptr eax
+#else
+# define __lock_ptr rdi
+#endif
+
+ENTRY(__write_lock_failed)
+ CFI_STARTPROC
+ FRAME
+0: LOCK_PREFIX
+ WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
+1: rep; nop
+ cmpl $WRITE_LOCK_CMP, (%__lock_ptr)
+ jne 1b
+ LOCK_PREFIX
+ WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
+ jnz 0b
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+END(__write_lock_failed)
+
+ENTRY(__read_lock_failed)
+ CFI_STARTPROC
+ FRAME
+0: LOCK_PREFIX
+ READ_LOCK_SIZE(inc) (%__lock_ptr)
+1: rep; nop
+ READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
+ js 1b
+ LOCK_PREFIX
+ READ_LOCK_SIZE(dec) (%__lock_ptr)
+ js 0b
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
deleted file mode 100644
index 05ea55f7140..00000000000
--- a/arch/x86/lib/rwlock_64.S
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Slow paths of read/write spinlocks. */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
-
-/* rdi: pointer to rwlock_t */
-ENTRY(__write_lock_failed)
- CFI_STARTPROC
- LOCK_PREFIX
- addl $RW_LOCK_BIAS,(%rdi)
-1: rep
- nop
- cmpl $RW_LOCK_BIAS,(%rdi)
- jne 1b
- LOCK_PREFIX
- subl $RW_LOCK_BIAS,(%rdi)
- jnz __write_lock_failed
- ret
- CFI_ENDPROC
-END(__write_lock_failed)
-
-/* rdi: pointer to rwlock_t */
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- LOCK_PREFIX
- incl (%rdi)
-1: rep
- nop
- cmpl $1,(%rdi)
- js 1b
- LOCK_PREFIX
- decl (%rdi)
- js __read_lock_failed
- ret
- CFI_ENDPROC
-END(__read_lock_failed)
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
new file mode 100644
index 00000000000..5dff5f04246
--- /dev/null
+++ b/arch/x86/lib/rwsem.S
@@ -0,0 +1,136 @@
+/*
+ * x86 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
+#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
+
+#ifdef CONFIG_X86_32
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allow us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax whish is either a return
+ * value or just clobbered..
+ */
+
+#define save_common_regs \
+ pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+
+#define restore_common_regs \
+ popl_cfi %ecx; CFI_RESTORE ecx
+
+ /* Avoid uglifying the argument copying x86-64 needs to do. */
+ .macro movq src, dst
+ .endm
+
+#else
+
+/*
+ * x86-64 rwsem wrappers
+ *
+ * This interfaces the inline asm code to the slow-path
+ * C routines. We need to save the call-clobbered regs
+ * that the asm does not mark as clobbered, and move the
+ * argument from %rax to %rdi.
+ *
+ * NOTE! We don't need to save %rax, because the functions
+ * will always return the semaphore pointer in %rax (which
+ * is also the input argument to these helpers)
+ *
+ * The following can clobber %rdx because the asm clobbers it:
+ * call_rwsem_down_write_failed
+ * call_rwsem_wake
+ * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
+ */
+
+#define save_common_regs \
+ pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
+ pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
+ pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
+ pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
+ pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
+ pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
+ pushq_cfi %r11; CFI_REL_OFFSET r11, 0
+
+#define restore_common_regs \
+ popq_cfi %r11; CFI_RESTORE r11; \
+ popq_cfi %r10; CFI_RESTORE r10; \
+ popq_cfi %r9; CFI_RESTORE r9; \
+ popq_cfi %r8; CFI_RESTORE r8; \
+ popq_cfi %rcx; CFI_RESTORE rcx; \
+ popq_cfi %rsi; CFI_RESTORE rsi; \
+ popq_cfi %rdi; CFI_RESTORE rdi
+
+#endif
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed)
+ CFI_STARTPROC
+ save_common_regs
+ __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+ CFI_REL_OFFSET __ASM_REG(dx), 0
+ movq %rax,%rdi
+ call rwsem_down_read_failed
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+ CFI_RESTORE __ASM_REG(dx)
+ restore_common_regs
+ ret
+ CFI_ENDPROC
+ENDPROC(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_write_failed)
+ CFI_STARTPROC
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_down_write_failed
+ restore_common_regs
+ ret
+ CFI_ENDPROC
+ENDPROC(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_wake)
+ CFI_STARTPROC
+ /* do nothing if still outstanding active readers */
+ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
+ jnz 1f
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_wake
+ restore_common_regs
+1: ret
+ CFI_ENDPROC
+ENDPROC(call_rwsem_wake)
+
+ENTRY(call_rwsem_downgrade_wake)
+ CFI_STARTPROC
+ save_common_regs
+ __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+ CFI_REL_OFFSET __ASM_REG(dx), 0
+ movq %rax,%rdi
+ call rwsem_downgrade_wake
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+ CFI_RESTORE __ASM_REG(dx)
+ restore_common_regs
+ ret
+ CFI_ENDPROC
+ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
deleted file mode 100644
index 3899bd37fdf..00000000000
--- a/arch/x86/lib/semaphore_32.S
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
- .section .sched.text, "ax"
-ENTRY(__down_failed)
- CFI_STARTPROC
- FRAME
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call __down
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__down_failed)
-
-ENTRY(__down_failed_interruptible)
- CFI_STARTPROC
- FRAME
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call __down_interruptible
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__down_failed_interruptible)
-
-ENTRY(__down_failed_trylock)
- CFI_STARTPROC
- FRAME
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call __down_trylock
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__down_failed_trylock)
-
-ENTRY(__up_wakeup)
- CFI_STARTPROC
- FRAME
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call __up
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__up_wakeup)
-
-/*
- * rw spinlock fallbacks
- */
-#ifdef CONFIG_SMP
-ENTRY(__write_lock_failed)
- CFI_STARTPROC simple
- FRAME
-2: LOCK_PREFIX
- addl $ RW_LOCK_BIAS,(%eax)
-1: rep; nop
- cmpl $ RW_LOCK_BIAS,(%eax)
- jne 1b
- LOCK_PREFIX
- subl $ RW_LOCK_BIAS,(%eax)
- jnz 2b
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__write_lock_failed)
-
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- FRAME
-2: LOCK_PREFIX
- incl (%eax)
-1: rep; nop
- cmpl $1,(%eax)
- js 1b
- LOCK_PREFIX
- decl (%eax)
- js 2b
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__read_lock_failed)
-
-#endif
-
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_down_read_failed)
- CFI_STARTPROC
- push %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- push %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- call rwsem_down_read_failed
- pop %edx
- CFI_ADJUST_CFA_OFFSET -4
- pop %ecx
- CFI_ADJUST_CFA_OFFSET -4
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_down_read_failed)
-
-ENTRY(call_rwsem_down_write_failed)
- CFI_STARTPROC
- push %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- calll rwsem_down_write_failed
- pop %ecx
- CFI_ADJUST_CFA_OFFSET -4
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_down_write_failed)
-
-ENTRY(call_rwsem_wake)
- CFI_STARTPROC
- decw %dx /* do nothing if still outstanding active readers */
- jnz 1f
- push %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call rwsem_wake
- pop %ecx
- CFI_ADJUST_CFA_OFFSET -4
-1: ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_wake)
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_downgrade_wake)
- CFI_STARTPROC
- push %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- push %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- call rwsem_downgrade_wake
- pop %edx
- CFI_ADJUST_CFA_OFFSET -4
- pop %ecx
- CFI_ADJUST_CFA_OFFSET -4
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_downgrade_wake)
-
-#endif
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index c2c0504a307..bd59090825d 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -14,25 +14,25 @@
#include <linux/module.h>
#ifdef __HAVE_ARCH_STRCPY
-char *strcpy(char * dest,const char *src)
+char *strcpy(char *dest, const char *src)
{
int d0, d1, d2;
- asm volatile( "1:\tlodsb\n\t"
+ asm volatile("1:\tlodsb\n\t"
"stosb\n\t"
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2)
- :"0" (src),"1" (dest) : "memory");
+ : "0" (src), "1" (dest) : "memory");
return dest;
}
EXPORT_SYMBOL(strcpy);
#endif
#ifdef __HAVE_ARCH_STRNCPY
-char *strncpy(char * dest,const char *src,size_t count)
+char *strncpy(char *dest, const char *src, size_t count)
{
int d0, d1, d2, d3;
- asm volatile( "1:\tdecl %2\n\t"
+ asm volatile("1:\tdecl %2\n\t"
"js 2f\n\t"
"lodsb\n\t"
"stosb\n\t"
@@ -42,17 +42,17 @@ char *strncpy(char * dest,const char *src,size_t count)
"stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
- :"0" (src),"1" (dest),"2" (count) : "memory");
+ : "0" (src), "1" (dest), "2" (count) : "memory");
return dest;
}
EXPORT_SYMBOL(strncpy);
#endif
#ifdef __HAVE_ARCH_STRCAT
-char *strcat(char * dest,const char * src)
+char *strcat(char *dest, const char *src)
{
int d0, d1, d2, d3;
- asm volatile( "repne\n\t"
+ asm volatile("repne\n\t"
"scasb\n\t"
"decl %1\n"
"1:\tlodsb\n\t"
@@ -60,17 +60,17 @@ char *strcat(char * dest,const char * src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
return dest;
}
EXPORT_SYMBOL(strcat);
#endif
#ifdef __HAVE_ARCH_STRNCAT
-char *strncat(char * dest,const char * src,size_t count)
+char *strncat(char *dest, const char *src, size_t count)
{
int d0, d1, d2, d3;
- asm volatile( "repne\n\t"
+ asm volatile("repne\n\t"
"scasb\n\t"
"decl %1\n\t"
"movl %8,%3\n"
@@ -83,7 +83,7 @@ char *strncat(char * dest,const char * src,size_t count)
"2:\txorl %2,%2\n\t"
"stosb"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu), "g" (count)
: "memory");
return dest;
}
@@ -91,11 +91,11 @@ EXPORT_SYMBOL(strncat);
#endif
#ifdef __HAVE_ARCH_STRCMP
-int strcmp(const char * cs,const char * ct)
+int strcmp(const char *cs, const char *ct)
{
int d0, d1;
int res;
- asm volatile( "1:\tlodsb\n\t"
+ asm volatile("1:\tlodsb\n\t"
"scasb\n\t"
"jne 2f\n\t"
"testb %%al,%%al\n\t"
@@ -105,20 +105,20 @@ int strcmp(const char * cs,const char * ct)
"2:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"3:"
- :"=a" (res), "=&S" (d0), "=&D" (d1)
- :"1" (cs),"2" (ct)
- :"memory");
+ : "=a" (res), "=&S" (d0), "=&D" (d1)
+ : "1" (cs), "2" (ct)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strcmp);
#endif
#ifdef __HAVE_ARCH_STRNCMP
-int strncmp(const char * cs,const char * ct,size_t count)
+int strncmp(const char *cs, const char *ct, size_t count)
{
int res;
int d0, d1, d2;
- asm volatile( "1:\tdecl %3\n\t"
+ asm volatile("1:\tdecl %3\n\t"
"js 2f\n\t"
"lodsb\n\t"
"scasb\n\t"
@@ -130,20 +130,20 @@ int strncmp(const char * cs,const char * ct,size_t count)
"3:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"4:"
- :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
- :"1" (cs),"2" (ct),"3" (count)
- :"memory");
+ : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+ : "1" (cs), "2" (ct), "3" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strncmp);
#endif
#ifdef __HAVE_ARCH_STRCHR
-char *strchr(const char * s, int c)
+char *strchr(const char *s, int c)
{
int d0;
- char * res;
- asm volatile( "movb %%al,%%ah\n"
+ char *res;
+ asm volatile("movb %%al,%%ah\n"
"1:\tlodsb\n\t"
"cmpb %%ah,%%al\n\t"
"je 2f\n\t"
@@ -152,53 +152,51 @@ char *strchr(const char * s, int c)
"movl $1,%1\n"
"2:\tmovl %1,%0\n\t"
"decl %0"
- :"=a" (res), "=&S" (d0)
- :"1" (s),"0" (c)
- :"memory");
+ : "=a" (res), "=&S" (d0)
+ : "1" (s), "0" (c)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strchr);
#endif
#ifdef __HAVE_ARCH_STRLEN
-size_t strlen(const char * s)
+size_t strlen(const char *s)
{
int d0;
- int res;
- asm volatile( "repne\n\t"
- "scasb\n\t"
- "notl %0\n\t"
- "decl %0"
- :"=c" (res), "=&D" (d0)
- :"1" (s),"a" (0), "0" (0xffffffffu)
- :"memory");
- return res;
+ size_t res;
+ asm volatile("repne\n\t"
+ "scasb"
+ : "=c" (res), "=&D" (d0)
+ : "1" (s), "a" (0), "0" (0xffffffffu)
+ : "memory");
+ return ~res - 1;
}
EXPORT_SYMBOL(strlen);
#endif
#ifdef __HAVE_ARCH_MEMCHR
-void *memchr(const void *cs,int c,size_t count)
+void *memchr(const void *cs, int c, size_t count)
{
int d0;
void *res;
if (!count)
return NULL;
- asm volatile( "repne\n\t"
+ asm volatile("repne\n\t"
"scasb\n\t"
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
- :"=D" (res), "=&c" (d0)
- :"a" (c),"0" (cs),"1" (count)
- :"memory");
+ : "=D" (res), "=&c" (d0)
+ : "a" (c), "0" (cs), "1" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(memchr);
#endif
#ifdef __HAVE_ARCH_MEMSCAN
-void *memscan(void * addr, int c, size_t size)
+void *memscan(void *addr, int c, size_t size)
{
if (!size)
return addr;
@@ -219,7 +217,7 @@ size_t strnlen(const char *s, size_t count)
{
int d0;
int res;
- asm volatile( "movl %2,%0\n\t"
+ asm volatile("movl %2,%0\n\t"
"jmp 2f\n"
"1:\tcmpb $0,(%0)\n\t"
"je 3f\n\t"
@@ -228,9 +226,9 @@ size_t strnlen(const char *s, size_t count)
"cmpl $-1,%1\n\t"
"jne 1b\n"
"3:\tsubl %2,%0"
- :"=a" (res), "=&d" (d0)
- :"c" (s),"1" (count)
- :"memory");
+ : "=a" (res), "=&d" (d0)
+ : "c" (s), "1" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strnlen);
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index a3dafbf59da..8e2d55f754b 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -1,9 +1,9 @@
#include <linux/string.h>
-char * strstr(const char * cs,const char * ct)
+char *strstr(const char *cs, const char *ct)
{
int d0, d1;
-register char * __res;
+register char *__res;
__asm__ __volatile__(
"movl %6,%%edi\n\t"
"repne\n\t"
@@ -23,9 +23,9 @@ __asm__ __volatile__(
"jne 1b\n\t"
"xorl %%eax,%%eax\n\t"
"2:"
- :"=a" (__res), "=&c" (d0), "=&S" (d1)
- :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
- :"dx", "di");
+ : "=a" (__res), "=&c" (d0), "=&S" (d1)
+ : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
+ : "dx", "di");
return __res;
}
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
new file mode 100644
index 00000000000..28f85c91671
--- /dev/null
+++ b/arch/x86/lib/thunk_32.S
@@ -0,0 +1,30 @@
+/*
+ * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
+ * Copyright 2008 by Steven Rostedt, Red Hat, Inc
+ * (inspired by Andi Kleen's thunk_64.S)
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+ #include <linux/linkage.h>
+ #include <asm/asm.h>
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /* put return address in eax (arg1) */
+ .macro thunk_ra name,func
+ .globl \name
+\name:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ /* Place EIP in the arg1 */
+ movl 3*4(%esp), %eax
+ call \func
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+ _ASM_NOKPROBE(\name)
+ .endm
+
+ thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+ thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 8b92d428ab0..92d9feaff42 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -2,70 +2,47 @@
* Save registers before calling assembly functions. This avoids
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm.h>
- #include <linux/linkage.h>
- #include <asm/dwarf2.h>
- #include <asm/calling.h>
- #include <asm/rwlock.h>
-
- /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
- .macro thunk name,func
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro THUNK name, func, put_ret_addr_in_rdi=0
.globl \name
-\name:
+\name:
CFI_STARTPROC
- SAVE_ARGS
- call \func
- jmp restore
- CFI_ENDPROC
- .endm
- /* rdi: arg1 ... normal C conventions. rax is passed from C. */
- .macro thunk_retrax name,func
- .globl \name
-\name:
- CFI_STARTPROC
+ /* this one pushes 9 elems, the next one would be %rIP */
SAVE_ARGS
+
+ .if \put_ret_addr_in_rdi
+ movq_cfi_restore 9*8, rdi
+ .endif
+
call \func
- jmp restore_norax
+ jmp restore
CFI_ENDPROC
+ _ASM_NOKPROBE(\name)
.endm
-
-
- .section .sched.text, "ax"
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
- thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
- thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
- thunk rwsem_wake_thunk,rwsem_wake
- thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
-#endif
-
- thunk __down_failed,__down
- thunk_retrax __down_failed_interruptible,__down_interruptible
- thunk_retrax __down_failed_trylock,__down_trylock
- thunk __up_wakeup,__up
#ifdef CONFIG_TRACE_IRQFLAGS
- thunk trace_hardirqs_on_thunk,trace_hardirqs_on
- thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+ THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- thunk lockdep_sys_exit_thunk,lockdep_sys_exit
+ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
-
+
/* SAVE_ARGS below is used only for the .cfi directives it contains. */
CFI_STARTPROC
SAVE_ARGS
restore:
RESTORE_ARGS
- ret
- CFI_ENDPROC
-
- CFI_STARTPROC
- SAVE_ARGS
-restore_norax:
- RESTORE_ARGS 1
ret
CFI_ENDPROC
+ _ASM_NOKPROBE(restore)
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
new file mode 100644
index 00000000000..ddf9ecb53cc
--- /dev/null
+++ b/arch/x86/lib/usercopy.c
@@ -0,0 +1,36 @@
+/*
+ * User address space access functions.
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+#include <asm/word-at-a-time.h>
+#include <linux/sched.h>
+
+/*
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
+ */
+unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+ unsigned long ret;
+
+ if (__range_not_ok(from, n, TASK_SIZE))
+ return 0;
+
+ /*
+ * Even though this function is typically called from NMI/IRQ context
+ * disable pagefaults so that its behaviour is consistent even when
+ * called form other contexts.
+ */
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(to, from, n);
+ pagefault_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index e849b9998b0..e2f5e21c03b 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -1,4 +1,4 @@
-/*
+/*
* User address space access functions.
* The non inlined parts of asm-i386/uaccess.h are here.
*
@@ -13,6 +13,14 @@
#include <linux/interrupt.h>
#include <asm/uaccess.h>
#include <asm/mmx.h>
+#include <asm/asm.h>
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+/*
+ * Alignment at which movsl is preferred for bulk memory copies.
+ */
+struct movsl_mask movsl_mask __read_mostly;
+#endif
static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
{
@@ -22,95 +30,8 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
#endif
return 1;
}
-#define movsl_is_ok(a1,a2,n) \
- __movsl_is_ok((unsigned long)(a1),(unsigned long)(a2),(n))
-
-/*
- * Copy a null terminated string from userspace.
- */
-
-#define __do_strncpy_from_user(dst,src,count,res) \
-do { \
- int __d0, __d1, __d2; \
- might_sleep(); \
- __asm__ __volatile__( \
- " testl %1,%1\n" \
- " jz 2f\n" \
- "0: lodsb\n" \
- " stosb\n" \
- " testb %%al,%%al\n" \
- " jz 1f\n" \
- " decl %1\n" \
- " jnz 0b\n" \
- "1: subl %1,%0\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: movl %5,%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(0b,3b) \
- : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
- "=&D" (__d2) \
- : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
- : "memory"); \
-} while (0)
-
-/**
- * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking.
- * @dst: Destination address, in kernel space. This buffer must be at
- * least @count bytes long.
- * @src: Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- * Caller must check the specified block with access_ok() before calling
- * this function.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-long
-__strncpy_from_user(char *dst, const char __user *src, long count)
-{
- long res;
- __do_strncpy_from_user(dst, src, count, res);
- return res;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
-
-/**
- * strncpy_from_user: - Copy a NUL terminated string from userspace.
- * @dst: Destination address, in kernel space. This buffer must be at
- * least @count bytes long.
- * @src: Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-long
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
- long res = -EFAULT;
- if (access_ok(VERIFY_READ, src, 1))
- __do_strncpy_from_user(dst, src, count, res);
- return res;
-}
-EXPORT_SYMBOL(strncpy_from_user);
+#define movsl_is_ok(a1, a2, n) \
+ __movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n))
/*
* Zero Userspace
@@ -119,12 +40,13 @@ EXPORT_SYMBOL(strncpy_from_user);
#define __do_clear_user(addr,size) \
do { \
int __d0; \
- might_sleep(); \
- __asm__ __volatile__( \
+ might_fault(); \
+ __asm__ __volatile__( \
+ ASM_STAC "\n" \
"0: rep; stosl\n" \
" movl %2,%0\n" \
"1: rep; stosb\n" \
- "2:\n" \
+ "2: " ASM_CLAC "\n" \
".section .fixup,\"ax\"\n" \
"3: lea 0(%2,%0,4),%0\n" \
" jmp 2b\n" \
@@ -148,7 +70,7 @@ do { \
unsigned long
clear_user(void __user *to, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_WRITE, to, n))
__do_clear_user(to, n);
return n;
@@ -174,50 +96,6 @@ __clear_user(void __user *to, unsigned long n)
}
EXPORT_SYMBOL(__clear_user);
-/**
- * strnlen_user: - Get the size of a string in user space.
- * @s: The string to measure.
- * @n: The maximum valid length
- *
- * Get the size of a NUL-terminated string in user space.
- *
- * Returns the size of the string INCLUDING the terminating NUL.
- * On exception, returns 0.
- * If the string is too long, returns a value greater than @n.
- */
-long strnlen_user(const char __user *s, long n)
-{
- unsigned long mask = -__addr_ok(s);
- unsigned long res, tmp;
-
- might_sleep();
-
- __asm__ __volatile__(
- " testl %0, %0\n"
- " jz 3f\n"
- " andl %0,%%ecx\n"
- "0: repne; scasb\n"
- " setne %%al\n"
- " subl %%ecx,%0\n"
- " addl %0,%%eax\n"
- "1:\n"
- ".section .fixup,\"ax\"\n"
- "2: xorl %%eax,%%eax\n"
- " jmp 1b\n"
- "3: movb $1,%%al\n"
- " jmp 1b\n"
- ".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 4\n"
- " .long 0b,2b\n"
- ".previous"
- :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp)
- :"0" (n), "1" (s), "2" (0), "3" (mask)
- :"cc");
- return res & mask;
-}
-EXPORT_SYMBOL(strnlen_user);
-
#ifdef CONFIG_X86_INTEL_USERCOPY
static unsigned long
__copy_user_intel(void __user *to, const void *from, unsigned long size)
@@ -279,47 +157,44 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
"101: lea 0(%%eax,%0,4),%0\n"
" jmp 100b\n"
".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 4\n"
- " .long 1b,100b\n"
- " .long 2b,100b\n"
- " .long 3b,100b\n"
- " .long 4b,100b\n"
- " .long 5b,100b\n"
- " .long 6b,100b\n"
- " .long 7b,100b\n"
- " .long 8b,100b\n"
- " .long 9b,100b\n"
- " .long 10b,100b\n"
- " .long 11b,100b\n"
- " .long 12b,100b\n"
- " .long 13b,100b\n"
- " .long 14b,100b\n"
- " .long 15b,100b\n"
- " .long 16b,100b\n"
- " .long 17b,100b\n"
- " .long 18b,100b\n"
- " .long 19b,100b\n"
- " .long 20b,100b\n"
- " .long 21b,100b\n"
- " .long 22b,100b\n"
- " .long 23b,100b\n"
- " .long 24b,100b\n"
- " .long 25b,100b\n"
- " .long 26b,100b\n"
- " .long 27b,100b\n"
- " .long 28b,100b\n"
- " .long 29b,100b\n"
- " .long 30b,100b\n"
- " .long 31b,100b\n"
- " .long 32b,100b\n"
- " .long 33b,100b\n"
- " .long 34b,100b\n"
- " .long 35b,100b\n"
- " .long 36b,100b\n"
- " .long 37b,100b\n"
- " .long 99b,101b\n"
- ".previous"
+ _ASM_EXTABLE(1b,100b)
+ _ASM_EXTABLE(2b,100b)
+ _ASM_EXTABLE(3b,100b)
+ _ASM_EXTABLE(4b,100b)
+ _ASM_EXTABLE(5b,100b)
+ _ASM_EXTABLE(6b,100b)
+ _ASM_EXTABLE(7b,100b)
+ _ASM_EXTABLE(8b,100b)
+ _ASM_EXTABLE(9b,100b)
+ _ASM_EXTABLE(10b,100b)
+ _ASM_EXTABLE(11b,100b)
+ _ASM_EXTABLE(12b,100b)
+ _ASM_EXTABLE(13b,100b)
+ _ASM_EXTABLE(14b,100b)
+ _ASM_EXTABLE(15b,100b)
+ _ASM_EXTABLE(16b,100b)
+ _ASM_EXTABLE(17b,100b)
+ _ASM_EXTABLE(18b,100b)
+ _ASM_EXTABLE(19b,100b)
+ _ASM_EXTABLE(20b,100b)
+ _ASM_EXTABLE(21b,100b)
+ _ASM_EXTABLE(22b,100b)
+ _ASM_EXTABLE(23b,100b)
+ _ASM_EXTABLE(24b,100b)
+ _ASM_EXTABLE(25b,100b)
+ _ASM_EXTABLE(26b,100b)
+ _ASM_EXTABLE(27b,100b)
+ _ASM_EXTABLE(28b,100b)
+ _ASM_EXTABLE(29b,100b)
+ _ASM_EXTABLE(30b,100b)
+ _ASM_EXTABLE(31b,100b)
+ _ASM_EXTABLE(32b,100b)
+ _ASM_EXTABLE(33b,100b)
+ _ASM_EXTABLE(34b,100b)
+ _ASM_EXTABLE(35b,100b)
+ _ASM_EXTABLE(36b,100b)
+ _ASM_EXTABLE(37b,100b)
+ _ASM_EXTABLE(99b,101b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -333,17 +208,17 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
- " cmpl $67, %0\n"
- " jbe 2f\n"
+ " cmpl $67, %0\n"
+ " jbe 2f\n"
"1: movl 64(%4), %%eax\n"
- " .align 2,0x90\n"
- "2: movl 0(%4), %%eax\n"
- "21: movl 4(%4), %%edx\n"
- " movl %%eax, 0(%3)\n"
- " movl %%edx, 4(%3)\n"
- "3: movl 8(%4), %%eax\n"
- "31: movl 12(%4),%%edx\n"
- " movl %%eax, 8(%3)\n"
+ " .align 2,0x90\n"
+ "2: movl 0(%4), %%eax\n"
+ "21: movl 4(%4), %%edx\n"
+ " movl %%eax, 0(%3)\n"
+ " movl %%edx, 4(%3)\n"
+ "3: movl 8(%4), %%eax\n"
+ "31: movl 12(%4),%%edx\n"
+ " movl %%eax, 8(%3)\n"
" movl %%edx, 12(%3)\n"
"4: movl 16(%4), %%eax\n"
"41: movl 20(%4), %%edx\n"
@@ -369,52 +244,49 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
"91: movl 60(%4), %%edx\n"
" movl %%eax, 56(%3)\n"
" movl %%edx, 60(%3)\n"
- " addl $-64, %0\n"
- " addl $64, %4\n"
- " addl $64, %3\n"
- " cmpl $63, %0\n"
- " ja 0b\n"
- "5: movl %0, %%eax\n"
- " shrl $2, %0\n"
- " andl $3, %%eax\n"
- " cld\n"
- "6: rep; movsl\n"
+ " addl $-64, %0\n"
+ " addl $64, %4\n"
+ " addl $64, %3\n"
+ " cmpl $63, %0\n"
+ " ja 0b\n"
+ "5: movl %0, %%eax\n"
+ " shrl $2, %0\n"
+ " andl $3, %%eax\n"
+ " cld\n"
+ "6: rep; movsl\n"
" movl %%eax,%0\n"
- "7: rep; movsb\n"
- "8:\n"
+ "7: rep; movsb\n"
+ "8:\n"
".section .fixup,\"ax\"\n"
- "9: lea 0(%%eax,%0,4),%0\n"
- "16: pushl %0\n"
- " pushl %%eax\n"
+ "9: lea 0(%%eax,%0,4),%0\n"
+ "16: pushl %0\n"
+ " pushl %%eax\n"
" xorl %%eax,%%eax\n"
- " rep; stosb\n"
- " popl %%eax\n"
- " popl %0\n"
- " jmp 8b\n"
- ".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 4\n"
- " .long 0b,16b\n"
- " .long 1b,16b\n"
- " .long 2b,16b\n"
- " .long 21b,16b\n"
- " .long 3b,16b\n"
- " .long 31b,16b\n"
- " .long 4b,16b\n"
- " .long 41b,16b\n"
- " .long 10b,16b\n"
- " .long 51b,16b\n"
- " .long 11b,16b\n"
- " .long 61b,16b\n"
- " .long 12b,16b\n"
- " .long 71b,16b\n"
- " .long 13b,16b\n"
- " .long 81b,16b\n"
- " .long 14b,16b\n"
- " .long 91b,16b\n"
- " .long 6b,9b\n"
- " .long 7b,16b\n"
- ".previous"
+ " rep; stosb\n"
+ " popl %%eax\n"
+ " popl %0\n"
+ " jmp 8b\n"
+ ".previous\n"
+ _ASM_EXTABLE(0b,16b)
+ _ASM_EXTABLE(1b,16b)
+ _ASM_EXTABLE(2b,16b)
+ _ASM_EXTABLE(21b,16b)
+ _ASM_EXTABLE(3b,16b)
+ _ASM_EXTABLE(31b,16b)
+ _ASM_EXTABLE(4b,16b)
+ _ASM_EXTABLE(41b,16b)
+ _ASM_EXTABLE(10b,16b)
+ _ASM_EXTABLE(51b,16b)
+ _ASM_EXTABLE(11b,16b)
+ _ASM_EXTABLE(61b,16b)
+ _ASM_EXTABLE(12b,16b)
+ _ASM_EXTABLE(71b,16b)
+ _ASM_EXTABLE(13b,16b)
+ _ASM_EXTABLE(81b,16b)
+ _ASM_EXTABLE(14b,16b)
+ _ASM_EXTABLE(91b,16b)
+ _ASM_EXTABLE(6b,9b)
+ _ASM_EXTABLE(7b,16b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -429,7 +301,7 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
static unsigned long __copy_user_zeroing_intel_nocache(void *to,
const void __user *from, unsigned long size)
{
- int d0, d1;
+ int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
@@ -494,29 +366,26 @@ static unsigned long __copy_user_zeroing_intel_nocache(void *to,
" popl %0\n"
" jmp 8b\n"
".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 4\n"
- " .long 0b,16b\n"
- " .long 1b,16b\n"
- " .long 2b,16b\n"
- " .long 21b,16b\n"
- " .long 3b,16b\n"
- " .long 31b,16b\n"
- " .long 4b,16b\n"
- " .long 41b,16b\n"
- " .long 10b,16b\n"
- " .long 51b,16b\n"
- " .long 11b,16b\n"
- " .long 61b,16b\n"
- " .long 12b,16b\n"
- " .long 71b,16b\n"
- " .long 13b,16b\n"
- " .long 81b,16b\n"
- " .long 14b,16b\n"
- " .long 91b,16b\n"
- " .long 6b,9b\n"
- " .long 7b,16b\n"
- ".previous"
+ _ASM_EXTABLE(0b,16b)
+ _ASM_EXTABLE(1b,16b)
+ _ASM_EXTABLE(2b,16b)
+ _ASM_EXTABLE(21b,16b)
+ _ASM_EXTABLE(3b,16b)
+ _ASM_EXTABLE(31b,16b)
+ _ASM_EXTABLE(4b,16b)
+ _ASM_EXTABLE(41b,16b)
+ _ASM_EXTABLE(10b,16b)
+ _ASM_EXTABLE(51b,16b)
+ _ASM_EXTABLE(11b,16b)
+ _ASM_EXTABLE(61b,16b)
+ _ASM_EXTABLE(12b,16b)
+ _ASM_EXTABLE(71b,16b)
+ _ASM_EXTABLE(13b,16b)
+ _ASM_EXTABLE(81b,16b)
+ _ASM_EXTABLE(14b,16b)
+ _ASM_EXTABLE(91b,16b)
+ _ASM_EXTABLE(6b,9b)
+ _ASM_EXTABLE(7b,16b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -526,7 +395,7 @@ static unsigned long __copy_user_zeroing_intel_nocache(void *to,
static unsigned long __copy_user_intel_nocache(void *to,
const void __user *from, unsigned long size)
{
- int d0, d1;
+ int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
@@ -585,29 +454,26 @@ static unsigned long __copy_user_intel_nocache(void *to,
"9: lea 0(%%eax,%0,4),%0\n"
"16: jmp 8b\n"
".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 4\n"
- " .long 0b,16b\n"
- " .long 1b,16b\n"
- " .long 2b,16b\n"
- " .long 21b,16b\n"
- " .long 3b,16b\n"
- " .long 31b,16b\n"
- " .long 4b,16b\n"
- " .long 41b,16b\n"
- " .long 10b,16b\n"
- " .long 51b,16b\n"
- " .long 11b,16b\n"
- " .long 61b,16b\n"
- " .long 12b,16b\n"
- " .long 71b,16b\n"
- " .long 13b,16b\n"
- " .long 81b,16b\n"
- " .long 14b,16b\n"
- " .long 91b,16b\n"
- " .long 6b,9b\n"
- " .long 7b,16b\n"
- ".previous"
+ _ASM_EXTABLE(0b,16b)
+ _ASM_EXTABLE(1b,16b)
+ _ASM_EXTABLE(2b,16b)
+ _ASM_EXTABLE(21b,16b)
+ _ASM_EXTABLE(3b,16b)
+ _ASM_EXTABLE(31b,16b)
+ _ASM_EXTABLE(4b,16b)
+ _ASM_EXTABLE(41b,16b)
+ _ASM_EXTABLE(10b,16b)
+ _ASM_EXTABLE(51b,16b)
+ _ASM_EXTABLE(11b,16b)
+ _ASM_EXTABLE(61b,16b)
+ _ASM_EXTABLE(12b,16b)
+ _ASM_EXTABLE(71b,16b)
+ _ASM_EXTABLE(13b,16b)
+ _ASM_EXTABLE(81b,16b)
+ _ASM_EXTABLE(14b,16b)
+ _ASM_EXTABLE(91b,16b)
+ _ASM_EXTABLE(6b,9b)
+ _ASM_EXTABLE(7b,16b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -629,7 +495,7 @@ unsigned long __copy_user_zeroing_intel_nocache(void *to,
#endif /* CONFIG_X86_INTEL_USERCOPY */
/* Generic arbitrary sized copy. */
-#define __copy_user(to,from,size) \
+#define __copy_user(to, from, size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
@@ -654,18 +520,15 @@ do { \
"3: lea 0(%3,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
- ".section __ex_table,\"a\"\n" \
- " .align 4\n" \
- " .long 4b,5b\n" \
- " .long 0b,3b\n" \
- " .long 1b,2b\n" \
- ".previous" \
+ _ASM_EXTABLE(4b,5b) \
+ _ASM_EXTABLE(0b,3b) \
+ _ASM_EXTABLE(1b,2b) \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
} while (0)
-#define __copy_user_zeroing(to,from,size) \
+#define __copy_user_zeroing(to, from, size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
@@ -696,12 +559,9 @@ do { \
" popl %0\n" \
" jmp 2b\n" \
".previous\n" \
- ".section __ex_table,\"a\"\n" \
- " .align 4\n" \
- " .long 4b,5b\n" \
- " .long 0b,3b\n" \
- " .long 1b,6b\n" \
- ".previous" \
+ _ASM_EXTABLE(4b,5b) \
+ _ASM_EXTABLE(0b,3b) \
+ _ASM_EXTABLE(1b,6b) \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
@@ -710,67 +570,12 @@ do { \
unsigned long __copy_to_user_ll(void __user *to, const void *from,
unsigned long n)
{
-#ifndef CONFIG_X86_WP_WORKS_OK
- if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
- ((unsigned long )to) < TASK_SIZE) {
- /*
- * When we are in an atomic section (see
- * mm/filemap.c:file_read_actor), return the full
- * length to take the slow path.
- */
- if (in_atomic())
- return n;
-
- /*
- * CPU does not honor the WP bit when writing
- * from supervisory mode, and due to preemption or SMP,
- * the page tables can change at any time.
- * Do it manually. Manfred <manfred@colorfullife.com>
- */
- while (n) {
- unsigned long offset = ((unsigned long)to)%PAGE_SIZE;
- unsigned long len = PAGE_SIZE - offset;
- int retval;
- struct page *pg;
- void *maddr;
-
- if (len > n)
- len = n;
-
-survive:
- down_read(&current->mm->mmap_sem);
- retval = get_user_pages(current, current->mm,
- (unsigned long )to, 1, 1, 0, &pg, NULL);
-
- if (retval == -ENOMEM && is_global_init(current)) {
- up_read(&current->mm->mmap_sem);
- congestion_wait(WRITE, HZ/50);
- goto survive;
- }
-
- if (retval != 1) {
- up_read(&current->mm->mmap_sem);
- break;
- }
-
- maddr = kmap_atomic(pg, KM_USER0);
- memcpy(maddr + offset, from, len);
- kunmap_atomic(maddr, KM_USER0);
- set_page_dirty_lock(pg);
- put_page(pg);
- up_read(&current->mm->mmap_sem);
-
- from += len;
- to += len;
- n -= len;
- }
- return n;
- }
-#endif
+ stac();
if (movsl_is_ok(to, from, n))
__copy_user(to, from, n);
else
n = __copy_user_intel(to, from, n);
+ clac();
return n;
}
EXPORT_SYMBOL(__copy_to_user_ll);
@@ -778,10 +583,12 @@ EXPORT_SYMBOL(__copy_to_user_ll);
unsigned long __copy_from_user_ll(void *to, const void __user *from,
unsigned long n)
{
+ stac();
if (movsl_is_ok(to, from, n))
__copy_user_zeroing(to, from, n);
else
n = __copy_user_zeroing_intel(to, from, n);
+ clac();
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll);
@@ -789,11 +596,13 @@ EXPORT_SYMBOL(__copy_from_user_ll);
unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
unsigned long n)
{
+ stac();
if (movsl_is_ok(to, from, n))
__copy_user(to, from, n);
else
n = __copy_user_intel((void __user *)to,
(const void *)from, n);
+ clac();
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nozero);
@@ -801,14 +610,16 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero);
unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
unsigned long n)
{
+ stac();
#ifdef CONFIG_X86_INTEL_USERCOPY
- if ( n > 64 && cpu_has_xmm2)
- n = __copy_user_zeroing_intel_nocache(to, from, n);
+ if (n > 64 && cpu_has_xmm2)
+ n = __copy_user_zeroing_intel_nocache(to, from, n);
else
__copy_user_zeroing(to, from, n);
#else
- __copy_user_zeroing(to, from, n);
+ __copy_user_zeroing(to, from, n);
#endif
+ clac();
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nocache);
@@ -816,14 +627,16 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache);
unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
unsigned long n)
{
+ stac();
#ifdef CONFIG_X86_INTEL_USERCOPY
- if ( n > 64 && cpu_has_xmm2)
- n = __copy_user_intel_nocache(to, from, n);
+ if (n > 64 && cpu_has_xmm2)
+ n = __copy_user_intel_nocache(to, from, n);
else
__copy_user(to, from, n);
#else
- __copy_user(to, from, n);
+ __copy_user(to, from, n);
#endif
+ clac();
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
@@ -841,14 +654,13 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
* Returns number of bytes that could not be copied.
* On success, this will be zero.
*/
-unsigned long
-copy_to_user(void __user *to, const void *from, unsigned long n)
+unsigned long _copy_to_user(void __user *to, const void *from, unsigned n)
{
if (access_ok(VERIFY_WRITE, to, n))
n = __copy_to_user(to, from, n);
return n;
}
-EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(_copy_to_user);
/**
* copy_from_user: - Copy a block of data from user space.
@@ -866,8 +678,7 @@ EXPORT_SYMBOL(copy_to_user);
* If some data could not be copied, this function will pad the copied
* data to the requested size using zero bytes.
*/
-unsigned long
-copy_from_user(void *to, const void __user *from, unsigned long n)
+unsigned long _copy_from_user(void *to, const void __user *from, unsigned n)
{
if (access_ok(VERIFY_READ, from, n))
n = __copy_from_user(to, from, n);
@@ -875,4 +686,4 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
memset(to, 0, n);
return n;
}
-EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(_copy_from_user);
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0c89d1bb028..c905e89e19f 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -9,64 +9,16 @@
#include <asm/uaccess.h>
/*
- * Copy a null terminated string from userspace.
- */
-
-#define __do_strncpy_from_user(dst,src,count,res) \
-do { \
- long __d0, __d1, __d2; \
- might_sleep(); \
- __asm__ __volatile__( \
- " testq %1,%1\n" \
- " jz 2f\n" \
- "0: lodsb\n" \
- " stosb\n" \
- " testb %%al,%%al\n" \
- " jz 1f\n" \
- " decq %1\n" \
- " jnz 0b\n" \
- "1: subq %1,%0\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: movq %5,%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(0b,3b) \
- : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
- "=&D" (__d2) \
- : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
- : "memory"); \
-} while (0)
-
-long
-__strncpy_from_user(char *dst, const char __user *src, long count)
-{
- long res;
- __do_strncpy_from_user(dst, src, count, res);
- return res;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
-
-long
-strncpy_from_user(char *dst, const char __user *src, long count)
-{
- long res = -EFAULT;
- if (access_ok(VERIFY_READ, src, 1))
- return __strncpy_from_user(dst, src, count);
- return res;
-}
-EXPORT_SYMBOL(strncpy_from_user);
-
-/*
* Zero Userspace
*/
unsigned long __clear_user(void __user *addr, unsigned long size)
{
long __d0;
- might_sleep();
+ might_fault();
/* no memory constraint because it doesn't change any memory gcc knows
about */
+ stac();
asm volatile(
" testq %[size8],%[size8]\n"
" jz 4f\n"
@@ -86,9 +38,10 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
".previous\n"
_ASM_EXTABLE(0b,3b)
_ASM_EXTABLE(1b,2b)
- : [size8] "=c"(size), [dst] "=&D" (__d0)
+ : [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
[zero] "r" (0UL), [eight] "r" (8UL));
+ clac();
return size;
}
EXPORT_SYMBOL(__clear_user);
@@ -101,54 +54,6 @@ unsigned long clear_user(void __user *to, unsigned long n)
}
EXPORT_SYMBOL(clear_user);
-/*
- * Return the size of a string (including the ending 0)
- *
- * Return 0 on exception, a value greater than N if too long
- */
-
-long __strnlen_user(const char __user *s, long n)
-{
- long res = 0;
- char c;
-
- while (1) {
- if (res>n)
- return n+1;
- if (__get_user(c, s))
- return 0;
- if (!c)
- return res+1;
- res++;
- s++;
- }
-}
-EXPORT_SYMBOL(__strnlen_user);
-
-long strnlen_user(const char __user *s, long n)
-{
- if (!access_ok(VERIFY_READ, s, n))
- return 0;
- return __strnlen_user(s, n);
-}
-EXPORT_SYMBOL(strnlen_user);
-
-long strlen_user(const char __user *s)
-{
- long res = 0;
- char c;
-
- for (;;) {
- if (get_user(c, s))
- return 0;
- if (!c)
- return res+1;
- res++;
- s++;
- }
-}
-EXPORT_SYMBOL(strlen_user);
-
unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
{
if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
@@ -158,3 +63,27 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le
}
EXPORT_SYMBOL(copy_in_user);
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ */
+__visible unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
+{
+ char c;
+ unsigned zero_len;
+
+ for (; len; --len, to++) {
+ if (__get_user_nocheck(c, from++, sizeof(char)))
+ break;
+ if (__put_user_nocheck(c, to, sizeof(char)))
+ break;
+ }
+
+ for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
+ if (__put_user_nocheck(c, to++, sizeof(char)))
+ break;
+ clac();
+ return len;
+}
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644
index 00000000000..1a2be7c6895
--- /dev/null
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -0,0 +1,961 @@
+# x86 Opcode Maps
+#
+# This is (mostly) based on following documentations.
+# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C
+# (#326018-047US, June 2013)
+#
+#<Opcode maps>
+# Table: table-name
+# Referrer: escaped-name
+# AVXcode: avx-code
+# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
+# (or)
+# opcode: escape # escaped-name
+# EndTable
+#
+#<group maps>
+# GrpTable: GrpXXX
+# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
+# EndTable
+#
+# AVX Superscripts
+# (v): this opcode requires VEX prefix.
+# (v1): this opcode only supports 128bit VEX.
+#
+# Last Prefix Superscripts
+# - (66): the last prefix is 0x66
+# - (F3): the last prefix is 0xF3
+# - (F2): the last prefix is 0xF2
+# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
+# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
+
+Table: one byte opcode
+Referrer:
+AVXcode:
+# 0x00 - 0x0f
+00: ADD Eb,Gb
+01: ADD Ev,Gv
+02: ADD Gb,Eb
+03: ADD Gv,Ev
+04: ADD AL,Ib
+05: ADD rAX,Iz
+06: PUSH ES (i64)
+07: POP ES (i64)
+08: OR Eb,Gb
+09: OR Ev,Gv
+0a: OR Gb,Eb
+0b: OR Gv,Ev
+0c: OR AL,Ib
+0d: OR rAX,Iz
+0e: PUSH CS (i64)
+0f: escape # 2-byte escape
+# 0x10 - 0x1f
+10: ADC Eb,Gb
+11: ADC Ev,Gv
+12: ADC Gb,Eb
+13: ADC Gv,Ev
+14: ADC AL,Ib
+15: ADC rAX,Iz
+16: PUSH SS (i64)
+17: POP SS (i64)
+18: SBB Eb,Gb
+19: SBB Ev,Gv
+1a: SBB Gb,Eb
+1b: SBB Gv,Ev
+1c: SBB AL,Ib
+1d: SBB rAX,Iz
+1e: PUSH DS (i64)
+1f: POP DS (i64)
+# 0x20 - 0x2f
+20: AND Eb,Gb
+21: AND Ev,Gv
+22: AND Gb,Eb
+23: AND Gv,Ev
+24: AND AL,Ib
+25: AND rAx,Iz
+26: SEG=ES (Prefix)
+27: DAA (i64)
+28: SUB Eb,Gb
+29: SUB Ev,Gv
+2a: SUB Gb,Eb
+2b: SUB Gv,Ev
+2c: SUB AL,Ib
+2d: SUB rAX,Iz
+2e: SEG=CS (Prefix)
+2f: DAS (i64)
+# 0x30 - 0x3f
+30: XOR Eb,Gb
+31: XOR Ev,Gv
+32: XOR Gb,Eb
+33: XOR Gv,Ev
+34: XOR AL,Ib
+35: XOR rAX,Iz
+36: SEG=SS (Prefix)
+37: AAA (i64)
+38: CMP Eb,Gb
+39: CMP Ev,Gv
+3a: CMP Gb,Eb
+3b: CMP Gv,Ev
+3c: CMP AL,Ib
+3d: CMP rAX,Iz
+3e: SEG=DS (Prefix)
+3f: AAS (i64)
+# 0x40 - 0x4f
+40: INC eAX (i64) | REX (o64)
+41: INC eCX (i64) | REX.B (o64)
+42: INC eDX (i64) | REX.X (o64)
+43: INC eBX (i64) | REX.XB (o64)
+44: INC eSP (i64) | REX.R (o64)
+45: INC eBP (i64) | REX.RB (o64)
+46: INC eSI (i64) | REX.RX (o64)
+47: INC eDI (i64) | REX.RXB (o64)
+48: DEC eAX (i64) | REX.W (o64)
+49: DEC eCX (i64) | REX.WB (o64)
+4a: DEC eDX (i64) | REX.WX (o64)
+4b: DEC eBX (i64) | REX.WXB (o64)
+4c: DEC eSP (i64) | REX.WR (o64)
+4d: DEC eBP (i64) | REX.WRB (o64)
+4e: DEC eSI (i64) | REX.WRX (o64)
+4f: DEC eDI (i64) | REX.WRXB (o64)
+# 0x50 - 0x5f
+50: PUSH rAX/r8 (d64)
+51: PUSH rCX/r9 (d64)
+52: PUSH rDX/r10 (d64)
+53: PUSH rBX/r11 (d64)
+54: PUSH rSP/r12 (d64)
+55: PUSH rBP/r13 (d64)
+56: PUSH rSI/r14 (d64)
+57: PUSH rDI/r15 (d64)
+58: POP rAX/r8 (d64)
+59: POP rCX/r9 (d64)
+5a: POP rDX/r10 (d64)
+5b: POP rBX/r11 (d64)
+5c: POP rSP/r12 (d64)
+5d: POP rBP/r13 (d64)
+5e: POP rSI/r14 (d64)
+5f: POP rDI/r15 (d64)
+# 0x60 - 0x6f
+60: PUSHA/PUSHAD (i64)
+61: POPA/POPAD (i64)
+62: BOUND Gv,Ma (i64)
+63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
+64: SEG=FS (Prefix)
+65: SEG=GS (Prefix)
+66: Operand-Size (Prefix)
+67: Address-Size (Prefix)
+68: PUSH Iz (d64)
+69: IMUL Gv,Ev,Iz
+6a: PUSH Ib (d64)
+6b: IMUL Gv,Ev,Ib
+6c: INS/INSB Yb,DX
+6d: INS/INSW/INSD Yz,DX
+6e: OUTS/OUTSB DX,Xb
+6f: OUTS/OUTSW/OUTSD DX,Xz
+# 0x70 - 0x7f
+70: JO Jb
+71: JNO Jb
+72: JB/JNAE/JC Jb
+73: JNB/JAE/JNC Jb
+74: JZ/JE Jb
+75: JNZ/JNE Jb
+76: JBE/JNA Jb
+77: JNBE/JA Jb
+78: JS Jb
+79: JNS Jb
+7a: JP/JPE Jb
+7b: JNP/JPO Jb
+7c: JL/JNGE Jb
+7d: JNL/JGE Jb
+7e: JLE/JNG Jb
+7f: JNLE/JG Jb
+# 0x80 - 0x8f
+80: Grp1 Eb,Ib (1A)
+81: Grp1 Ev,Iz (1A)
+82: Grp1 Eb,Ib (1A),(i64)
+83: Grp1 Ev,Ib (1A)
+84: TEST Eb,Gb
+85: TEST Ev,Gv
+86: XCHG Eb,Gb
+87: XCHG Ev,Gv
+88: MOV Eb,Gb
+89: MOV Ev,Gv
+8a: MOV Gb,Eb
+8b: MOV Gv,Ev
+8c: MOV Ev,Sw
+8d: LEA Gv,M
+8e: MOV Sw,Ew
+8f: Grp1A (1A) | POP Ev (d64)
+# 0x90 - 0x9f
+90: NOP | PAUSE (F3) | XCHG r8,rAX
+91: XCHG rCX/r9,rAX
+92: XCHG rDX/r10,rAX
+93: XCHG rBX/r11,rAX
+94: XCHG rSP/r12,rAX
+95: XCHG rBP/r13,rAX
+96: XCHG rSI/r14,rAX
+97: XCHG rDI/r15,rAX
+98: CBW/CWDE/CDQE
+99: CWD/CDQ/CQO
+9a: CALLF Ap (i64)
+9b: FWAIT/WAIT
+9c: PUSHF/D/Q Fv (d64)
+9d: POPF/D/Q Fv (d64)
+9e: SAHF
+9f: LAHF
+# 0xa0 - 0xaf
+a0: MOV AL,Ob
+a1: MOV rAX,Ov
+a2: MOV Ob,AL
+a3: MOV Ov,rAX
+a4: MOVS/B Yb,Xb
+a5: MOVS/W/D/Q Yv,Xv
+a6: CMPS/B Xb,Yb
+a7: CMPS/W/D Xv,Yv
+a8: TEST AL,Ib
+a9: TEST rAX,Iz
+aa: STOS/B Yb,AL
+ab: STOS/W/D/Q Yv,rAX
+ac: LODS/B AL,Xb
+ad: LODS/W/D/Q rAX,Xv
+ae: SCAS/B AL,Yb
+# Note: The May 2011 Intel manual shows Xv for the second parameter of the
+# next instruction but Yv is correct
+af: SCAS/W/D/Q rAX,Yv
+# 0xb0 - 0xbf
+b0: MOV AL/R8L,Ib
+b1: MOV CL/R9L,Ib
+b2: MOV DL/R10L,Ib
+b3: MOV BL/R11L,Ib
+b4: MOV AH/R12L,Ib
+b5: MOV CH/R13L,Ib
+b6: MOV DH/R14L,Ib
+b7: MOV BH/R15L,Ib
+b8: MOV rAX/r8,Iv
+b9: MOV rCX/r9,Iv
+ba: MOV rDX/r10,Iv
+bb: MOV rBX/r11,Iv
+bc: MOV rSP/r12,Iv
+bd: MOV rBP/r13,Iv
+be: MOV rSI/r14,Iv
+bf: MOV rDI/r15,Iv
+# 0xc0 - 0xcf
+c0: Grp2 Eb,Ib (1A)
+c1: Grp2 Ev,Ib (1A)
+c2: RETN Iw (f64)
+c3: RETN
+c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
+c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
+c6: Grp11A Eb,Ib (1A)
+c7: Grp11B Ev,Iz (1A)
+c8: ENTER Iw,Ib
+c9: LEAVE (d64)
+ca: RETF Iw
+cb: RETF
+cc: INT3
+cd: INT Ib
+ce: INTO (i64)
+cf: IRET/D/Q
+# 0xd0 - 0xdf
+d0: Grp2 Eb,1 (1A)
+d1: Grp2 Ev,1 (1A)
+d2: Grp2 Eb,CL (1A)
+d3: Grp2 Ev,CL (1A)
+d4: AAM Ib (i64)
+d5: AAD Ib (i64)
+d6:
+d7: XLAT/XLATB
+d8: ESC
+d9: ESC
+da: ESC
+db: ESC
+dc: ESC
+dd: ESC
+de: ESC
+df: ESC
+# 0xe0 - 0xef
+e0: LOOPNE/LOOPNZ Jb (f64)
+e1: LOOPE/LOOPZ Jb (f64)
+e2: LOOP Jb (f64)
+e3: JrCXZ Jb (f64)
+e4: IN AL,Ib
+e5: IN eAX,Ib
+e6: OUT Ib,AL
+e7: OUT Ib,eAX
+e8: CALL Jz (f64)
+e9: JMP-near Jz (f64)
+ea: JMP-far Ap (i64)
+eb: JMP-short Jb (f64)
+ec: IN AL,DX
+ed: IN eAX,DX
+ee: OUT DX,AL
+ef: OUT DX,eAX
+# 0xf0 - 0xff
+f0: LOCK (Prefix)
+f1:
+f2: REPNE (Prefix) | XACQUIRE (Prefix)
+f3: REP/REPE (Prefix) | XRELEASE (Prefix)
+f4: HLT
+f5: CMC
+f6: Grp3_1 Eb (1A)
+f7: Grp3_2 Ev (1A)
+f8: CLC
+f9: STC
+fa: CLI
+fb: STI
+fc: CLD
+fd: STD
+fe: Grp4 (1A)
+ff: Grp5 (1A)
+EndTable
+
+Table: 2-byte opcode (0x0f)
+Referrer: 2-byte escape
+AVXcode: 1
+# 0x0f 0x00-0x0f
+00: Grp6 (1A)
+01: Grp7 (1A)
+02: LAR Gv,Ew
+03: LSL Gv,Ew
+04:
+05: SYSCALL (o64)
+06: CLTS
+07: SYSRET (o64)
+08: INVD
+09: WBINVD
+0a:
+0b: UD2 (1B)
+0c:
+# AMD's prefetch group. Intel supports prefetchw(/1) only.
+0d: GrpP
+0e: FEMMS
+# 3DNow! uses the last imm byte as opcode extension.
+0f: 3DNow! Pq,Qq,Ib
+# 0x0f 0x10-0x1f
+# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands
+# but it actually has operands. And also, vmovss and vmovsd only accept 128bit.
+# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form.
+# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming
+# Reference A.1
+10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
+11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
+12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
+13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
+14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
+15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
+16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
+17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
+18: Grp16 (1A)
+19:
+1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv
+1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv
+1c:
+1d:
+1e:
+1f: NOP Ev
+# 0x0f 0x20-0x2f
+20: MOV Rd,Cd
+21: MOV Rd,Dd
+22: MOV Cd,Rd
+23: MOV Dd,Rd
+24:
+25:
+26:
+27:
+28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
+29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
+2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
+2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
+2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
+2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
+2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
+2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
+# 0x0f 0x30-0x3f
+30: WRMSR
+31: RDTSC
+32: RDMSR
+33: RDPMC
+34: SYSENTER
+35: SYSEXIT
+36:
+37: GETSEC
+38: escape # 3-byte escape 1
+39:
+3a: escape # 3-byte escape 2
+3b:
+3c:
+3d:
+3e:
+3f:
+# 0x0f 0x40-0x4f
+40: CMOVO Gv,Ev
+41: CMOVNO Gv,Ev
+42: CMOVB/C/NAE Gv,Ev
+43: CMOVAE/NB/NC Gv,Ev
+44: CMOVE/Z Gv,Ev
+45: CMOVNE/NZ Gv,Ev
+46: CMOVBE/NA Gv,Ev
+47: CMOVA/NBE Gv,Ev
+48: CMOVS Gv,Ev
+49: CMOVNS Gv,Ev
+4a: CMOVP/PE Gv,Ev
+4b: CMOVNP/PO Gv,Ev
+4c: CMOVL/NGE Gv,Ev
+4d: CMOVNL/GE Gv,Ev
+4e: CMOVLE/NG Gv,Ev
+4f: CMOVNLE/G Gv,Ev
+# 0x0f 0x50-0x5f
+50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
+51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
+52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
+53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
+54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
+55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
+56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
+57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
+58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
+59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
+5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
+5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
+5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
+5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
+5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
+5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
+# 0x0f 0x60-0x6f
+60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
+61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
+62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
+63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
+64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
+65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
+66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
+67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
+68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
+69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
+6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
+6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
+6c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
+6d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
+6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
+6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3)
+# 0x0f 0x70-0x7f
+70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
+71: Grp12 (1A)
+72: Grp13 (1A)
+73: Grp14 (1A)
+74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
+75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
+76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
+# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX.
+77: emms | vzeroupper | vzeroall
+78: VMREAD Ey,Gy
+79: VMWRITE Gy,Ey
+7a:
+7b:
+7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2)
+7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2)
+7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
+7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
+# 0x0f 0x80-0x8f
+80: JO Jz (f64)
+81: JNO Jz (f64)
+82: JB/JC/JNAE Jz (f64)
+83: JAE/JNB/JNC Jz (f64)
+84: JE/JZ Jz (f64)
+85: JNE/JNZ Jz (f64)
+86: JBE/JNA Jz (f64)
+87: JA/JNBE Jz (f64)
+88: JS Jz (f64)
+89: JNS Jz (f64)
+8a: JP/JPE Jz (f64)
+8b: JNP/JPO Jz (f64)
+8c: JL/JNGE Jz (f64)
+8d: JNL/JGE Jz (f64)
+8e: JLE/JNG Jz (f64)
+8f: JNLE/JG Jz (f64)
+# 0x0f 0x90-0x9f
+90: SETO Eb
+91: SETNO Eb
+92: SETB/C/NAE Eb
+93: SETAE/NB/NC Eb
+94: SETE/Z Eb
+95: SETNE/NZ Eb
+96: SETBE/NA Eb
+97: SETA/NBE Eb
+98: SETS Eb
+99: SETNS Eb
+9a: SETP/PE Eb
+9b: SETNP/PO Eb
+9c: SETL/NGE Eb
+9d: SETNL/GE Eb
+9e: SETLE/NG Eb
+9f: SETNLE/G Eb
+# 0x0f 0xa0-0xaf
+a0: PUSH FS (d64)
+a1: POP FS (d64)
+a2: CPUID
+a3: BT Ev,Gv
+a4: SHLD Ev,Gv,Ib
+a5: SHLD Ev,Gv,CL
+a6: GrpPDLK
+a7: GrpRNG
+a8: PUSH GS (d64)
+a9: POP GS (d64)
+aa: RSM
+ab: BTS Ev,Gv
+ac: SHRD Ev,Gv,Ib
+ad: SHRD Ev,Gv,CL
+ae: Grp15 (1A),(1C)
+af: IMUL Gv,Ev
+# 0x0f 0xb0-0xbf
+b0: CMPXCHG Eb,Gb
+b1: CMPXCHG Ev,Gv
+b2: LSS Gv,Mp
+b3: BTR Ev,Gv
+b4: LFS Gv,Mp
+b5: LGS Gv,Mp
+b6: MOVZX Gv,Eb
+b7: MOVZX Gv,Ew
+b8: JMPE (!F3) | POPCNT Gv,Ev (F3)
+b9: Grp10 (1A)
+ba: Grp8 Ev,Ib (1A)
+bb: BTC Ev,Gv
+bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3)
+bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3)
+be: MOVSX Gv,Eb
+bf: MOVSX Gv,Ew
+# 0x0f 0xc0-0xcf
+c0: XADD Eb,Gb
+c1: XADD Ev,Gv
+c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1)
+c3: movnti My,Gy
+c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1)
+c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1)
+c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66)
+c7: Grp9 (1A)
+c8: BSWAP RAX/EAX/R8/R8D
+c9: BSWAP RCX/ECX/R9/R9D
+ca: BSWAP RDX/EDX/R10/R10D
+cb: BSWAP RBX/EBX/R11/R11D
+cc: BSWAP RSP/ESP/R12/R12D
+cd: BSWAP RBP/EBP/R13/R13D
+ce: BSWAP RSI/ESI/R14/R14D
+cf: BSWAP RDI/EDI/R15/R15D
+# 0x0f 0xd0-0xdf
+d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2)
+d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1)
+d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1)
+d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1)
+d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1)
+d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1)
+d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1)
+d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1)
+d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1)
+da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1)
+db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1)
+dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1)
+dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1)
+de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1)
+df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1)
+# 0x0f 0xe0-0xef
+e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1)
+e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1)
+e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1)
+e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1)
+e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1)
+e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1)
+e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2)
+e7: movntq Mq,Pq | vmovntdq Mx,Vx (66)
+e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1)
+e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1)
+ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1)
+eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1)
+ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1)
+ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1)
+ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1)
+ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1)
+# 0x0f 0xf0-0xff
+f0: vlddqu Vx,Mx (F2)
+f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1)
+f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1)
+f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1)
+f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1)
+f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1)
+f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1)
+f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1)
+f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1)
+f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1)
+fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1)
+fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
+fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
+fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
+fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
+ff:
+EndTable
+
+Table: 3-byte opcode 1 (0x0f 0x38)
+Referrer: 3-byte escape 1
+AVXcode: 2
+# 0x0f 0x38 0x00-0x0f
+00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1)
+01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1)
+02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1)
+03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1)
+04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1)
+05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1)
+06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1)
+07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1)
+08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1)
+09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1)
+0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1)
+0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1)
+0c: vpermilps Vx,Hx,Wx (66),(v)
+0d: vpermilpd Vx,Hx,Wx (66),(v)
+0e: vtestps Vx,Wx (66),(v)
+0f: vtestpd Vx,Wx (66),(v)
+# 0x0f 0x38 0x10-0x1f
+10: pblendvb Vdq,Wdq (66)
+11:
+12:
+13: vcvtph2ps Vx,Wx,Ib (66),(v)
+14: blendvps Vdq,Wdq (66)
+15: blendvpd Vdq,Wdq (66)
+16: vpermps Vqq,Hqq,Wqq (66),(v)
+17: vptest Vx,Wx (66)
+18: vbroadcastss Vx,Wd (66),(v)
+19: vbroadcastsd Vqq,Wq (66),(v)
+1a: vbroadcastf128 Vqq,Mdq (66),(v)
+1b:
+1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1)
+1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1)
+1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1)
+1f:
+# 0x0f 0x38 0x20-0x2f
+20: vpmovsxbw Vx,Ux/Mq (66),(v1)
+21: vpmovsxbd Vx,Ux/Md (66),(v1)
+22: vpmovsxbq Vx,Ux/Mw (66),(v1)
+23: vpmovsxwd Vx,Ux/Mq (66),(v1)
+24: vpmovsxwq Vx,Ux/Md (66),(v1)
+25: vpmovsxdq Vx,Ux/Mq (66),(v1)
+26:
+27:
+28: vpmuldq Vx,Hx,Wx (66),(v1)
+29: vpcmpeqq Vx,Hx,Wx (66),(v1)
+2a: vmovntdqa Vx,Mx (66),(v1)
+2b: vpackusdw Vx,Hx,Wx (66),(v1)
+2c: vmaskmovps Vx,Hx,Mx (66),(v)
+2d: vmaskmovpd Vx,Hx,Mx (66),(v)
+2e: vmaskmovps Mx,Hx,Vx (66),(v)
+2f: vmaskmovpd Mx,Hx,Vx (66),(v)
+# 0x0f 0x38 0x30-0x3f
+30: vpmovzxbw Vx,Ux/Mq (66),(v1)
+31: vpmovzxbd Vx,Ux/Md (66),(v1)
+32: vpmovzxbq Vx,Ux/Mw (66),(v1)
+33: vpmovzxwd Vx,Ux/Mq (66),(v1)
+34: vpmovzxwq Vx,Ux/Md (66),(v1)
+35: vpmovzxdq Vx,Ux/Mq (66),(v1)
+36: vpermd Vqq,Hqq,Wqq (66),(v)
+37: vpcmpgtq Vx,Hx,Wx (66),(v1)
+38: vpminsb Vx,Hx,Wx (66),(v1)
+39: vpminsd Vx,Hx,Wx (66),(v1)
+3a: vpminuw Vx,Hx,Wx (66),(v1)
+3b: vpminud Vx,Hx,Wx (66),(v1)
+3c: vpmaxsb Vx,Hx,Wx (66),(v1)
+3d: vpmaxsd Vx,Hx,Wx (66),(v1)
+3e: vpmaxuw Vx,Hx,Wx (66),(v1)
+3f: vpmaxud Vx,Hx,Wx (66),(v1)
+# 0x0f 0x38 0x40-0x8f
+40: vpmulld Vx,Hx,Wx (66),(v1)
+41: vphminposuw Vdq,Wdq (66),(v1)
+42:
+43:
+44:
+45: vpsrlvd/q Vx,Hx,Wx (66),(v)
+46: vpsravd Vx,Hx,Wx (66),(v)
+47: vpsllvd/q Vx,Hx,Wx (66),(v)
+# Skip 0x48-0x57
+58: vpbroadcastd Vx,Wx (66),(v)
+59: vpbroadcastq Vx,Wx (66),(v)
+5a: vbroadcasti128 Vqq,Mdq (66),(v)
+# Skip 0x5b-0x77
+78: vpbroadcastb Vx,Wx (66),(v)
+79: vpbroadcastw Vx,Wx (66),(v)
+# Skip 0x7a-0x7f
+80: INVEPT Gy,Mdq (66)
+81: INVPID Gy,Mdq (66)
+82: INVPCID Gy,Mdq (66)
+8c: vpmaskmovd/q Vx,Hx,Mx (66),(v)
+8e: vpmaskmovd/q Mx,Vx,Hx (66),(v)
+# 0x0f 0x38 0x90-0xbf (FMA)
+90: vgatherdd/q Vx,Hx,Wx (66),(v)
+91: vgatherqd/q Vx,Hx,Wx (66),(v)
+92: vgatherdps/d Vx,Hx,Wx (66),(v)
+93: vgatherqps/d Vx,Hx,Wx (66),(v)
+94:
+95:
+96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v)
+97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v)
+98: vfmadd132ps/d Vx,Hx,Wx (66),(v)
+99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9a: vfmsub132ps/d Vx,Hx,Wx (66),(v)
+9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v)
+9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v)
+9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v)
+a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v)
+a8: vfmadd213ps/d Vx,Hx,Wx (66),(v)
+a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+aa: vfmsub213ps/d Vx,Hx,Wx (66),(v)
+ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
+ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
+af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
+b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
+b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
+b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+ba: vfmsub231ps/d Vx,Hx,Wx (66),(v)
+bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
+bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v)
+bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
+bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
+# 0x0f 0x38 0xc0-0xff
+db: VAESIMC Vdq,Wdq (66),(v1)
+dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
+dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
+de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
+df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
+f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
+f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
+f2: ANDN Gy,By,Ey (v)
+f3: Grp17 (1A)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
+f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
+f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x3a)
+Referrer: 3-byte escape 2
+AVXcode: 3
+# 0x0f 0x3a 0x00-0xff
+00: vpermq Vqq,Wqq,Ib (66),(v)
+01: vpermpd Vqq,Wqq,Ib (66),(v)
+02: vpblendd Vx,Hx,Wx,Ib (66),(v)
+03:
+04: vpermilps Vx,Wx,Ib (66),(v)
+05: vpermilpd Vx,Wx,Ib (66),(v)
+06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
+07:
+08: vroundps Vx,Wx,Ib (66)
+09: vroundpd Vx,Wx,Ib (66)
+0a: vroundss Vss,Wss,Ib (66),(v1)
+0b: vroundsd Vsd,Wsd,Ib (66),(v1)
+0c: vblendps Vx,Hx,Wx,Ib (66)
+0d: vblendpd Vx,Hx,Wx,Ib (66)
+0e: vpblendw Vx,Hx,Wx,Ib (66),(v1)
+0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1)
+14: vpextrb Rd/Mb,Vdq,Ib (66),(v1)
+15: vpextrw Rd/Mw,Vdq,Ib (66),(v1)
+16: vpextrd/q Ey,Vdq,Ib (66),(v1)
+17: vextractps Ed,Vdq,Ib (66),(v1)
+18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v)
+19: vextractf128 Wdq,Vqq,Ib (66),(v)
+1d: vcvtps2ph Wx,Vx,Ib (66),(v)
+20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1)
+21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1)
+22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
+38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v)
+39: vextracti128 Wdq,Vqq,Ib (66),(v)
+40: vdpps Vx,Hx,Wx,Ib (66)
+41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1)
+42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1)
+44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1)
+46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v)
+4a: vblendvps Vx,Hx,Wx,Lx (66),(v)
+4b: vblendvpd Vx,Hx,Wx,Lx (66),(v)
+4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1)
+60: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
+61: vpcmpestri Vdq,Wdq,Ib (66),(v1)
+62: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
+63: vpcmpistri Vdq,Wdq,Ib (66),(v1)
+df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
+f0: RORX Gy,Ey,Ib (F2),(v)
+EndTable
+
+GrpTable: Grp1
+0: ADD
+1: OR
+2: ADC
+3: SBB
+4: AND
+5: SUB
+6: XOR
+7: CMP
+EndTable
+
+GrpTable: Grp1A
+0: POP
+EndTable
+
+GrpTable: Grp2
+0: ROL
+1: ROR
+2: RCL
+3: RCR
+4: SHL/SAL
+5: SHR
+6:
+7: SAR
+EndTable
+
+GrpTable: Grp3_1
+0: TEST Eb,Ib
+1:
+2: NOT Eb
+3: NEG Eb
+4: MUL AL,Eb
+5: IMUL AL,Eb
+6: DIV AL,Eb
+7: IDIV AL,Eb
+EndTable
+
+GrpTable: Grp3_2
+0: TEST Ev,Iz
+1:
+2: NOT Ev
+3: NEG Ev
+4: MUL rAX,Ev
+5: IMUL rAX,Ev
+6: DIV rAX,Ev
+7: IDIV rAX,Ev
+EndTable
+
+GrpTable: Grp4
+0: INC Eb
+1: DEC Eb
+EndTable
+
+GrpTable: Grp5
+0: INC Ev
+1: DEC Ev
+2: CALLN Ev (f64)
+3: CALLF Ep
+4: JMPN Ev (f64)
+5: JMPF Mp
+6: PUSH Ev (d64)
+7:
+EndTable
+
+GrpTable: Grp6
+0: SLDT Rv/Mw
+1: STR Rv/Mw
+2: LLDT Ew
+3: LTR Ew
+4: VERR Ew
+5: VERW Ew
+EndTable
+
+GrpTable: Grp7
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
+3: LIDT Ms
+4: SMSW Mw/Rv
+5:
+6: LMSW Ew
+7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
+EndTable
+
+GrpTable: Grp8
+4: BT
+5: BTS
+6: BTR
+7: BTC
+EndTable
+
+GrpTable: Grp9
+1: CMPXCHG8B/16B Mq/Mdq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
+7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B)
+EndTable
+
+GrpTable: Grp10
+EndTable
+
+# Grp11A and Grp11B are expressed as Grp11 in Intel SDM
+GrpTable: Grp11A
+0: MOV Eb,Ib
+7: XABORT Ib (000),(11B)
+EndTable
+
+GrpTable: Grp11B
+0: MOV Eb,Iz
+7: XBEGIN Jz (000),(11B)
+EndTable
+
+GrpTable: Grp12
+2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1)
+4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1)
+6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1)
+EndTable
+
+GrpTable: Grp13
+2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1)
+4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1)
+6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1)
+EndTable
+
+GrpTable: Grp14
+2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1)
+3: vpsrldq Hx,Ux,Ib (66),(11B),(v1)
+6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1)
+7: vpslldq Hx,Ux,Ib (66),(11B),(v1)
+EndTable
+
+GrpTable: Grp15
+0: fxsave | RDFSBASE Ry (F3),(11B)
+1: fxstor | RDGSBASE Ry (F3),(11B)
+2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
+3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
+4: XSAVE
+5: XRSTOR | lfence (11B)
+6: XSAVEOPT | mfence (11B)
+7: clflush | sfence (11B)
+EndTable
+
+GrpTable: Grp16
+0: prefetch NTA
+1: prefetch T0
+2: prefetch T1
+3: prefetch T2
+EndTable
+
+GrpTable: Grp17
+1: BLSR By,Ey (v)
+2: BLSMSK By,Ey (v)
+3: BLSI By,Ey (v)
+EndTable
+
+# AMD's Prefetch Group
+GrpTable: GrpP
+0: PREFETCH
+1: PREFETCHW
+EndTable
+
+GrpTable: GrpPDLK
+0: MONTMUL
+1: XSHA1
+2: XSHA2
+EndTable
+
+GrpTable: GrpRNG
+0: xstore-rng
+1: xcrypt-ecb
+2: xcrypt-cbc
+4: xcrypt-cfb
+5: xcrypt-ofb
+EndTable