diff options
Diffstat (limited to 'arch/x86/lib')
54 files changed, 4965 insertions, 2631 deletions
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore new file mode 100644 index 00000000000..8df89f0a3fe --- /dev/null +++ b/arch/x86/lib/.gitignore @@ -0,0 +1 @@ +inat-tables.c diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 25df1c1989f..4d4f96a2763 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,26 +2,45 @@ # Makefile for x86 specific library files. # -obj-$(CONFIG_SMP) := msr-on-cpu.o +inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk +inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt +quiet_cmd_inat_tables = GEN $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@ -lib-y := delay_$(BITS).o -lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o +$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call cmd,inat_tables) + +$(obj)/inat.o: $(obj)/inat-tables.c + +clean-files := inat-tables.c + +obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o + +lib-y := delay.o misc.o cmdline.o +lib-y += thunk_$(BITS).o +lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-$(CONFIG_SMP) += rwlock.o +lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o + +obj-y += msr.o msr-reg.o msr-reg-export.o hash.o ifeq ($(CONFIG_X86_32),y) + obj-y += atomic64_32.o + lib-y += atomic64_cx8_32.o lib-y += checksum_32.o lib-y += strstr_32.o - lib-y += bitops_32.o semaphore_32.o string_32.o - + lib-y += string_32.o +ifneq ($(CONFIG_X86_CMPXCHG64),y) + lib-y += cmpxchg8b_emu.o atomic64_386_32.o +endif lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else - obj-y += io_64.o iomap_copy_64.o - - CFLAGS_csum-partial_64.o := -funroll-loops - + obj-y += iomap_copy_64.o lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o - lib-y += bitops_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o + lib-y += copy_user_64.o copy_user_nocache_64.o + lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c new file mode 100644 index 00000000000..a0b4a350daa --- /dev/null +++ b/arch/x86/lib/atomic64_32.c @@ -0,0 +1,4 @@ +#define ATOMIC64_EXPORT EXPORT_SYMBOL + +#include <linux/export.h> +#include <linux/atomic.h> diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S new file mode 100644 index 00000000000..00933d5e992 --- /dev/null +++ b/arch/x86/lib/atomic64_386_32.S @@ -0,0 +1,194 @@ +/* + * atomic64_t for 386/486 + * + * Copyright © 2010 Luca Barbieri + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/dwarf2.h> + +/* if you want SMP support, implement these with real spinlocks */ +.macro LOCK reg + pushfl_cfi + cli +.endm + +.macro UNLOCK reg + popfl_cfi +.endm + +#define BEGIN(op) \ +.macro endp; \ + CFI_ENDPROC; \ +ENDPROC(atomic64_##op##_386); \ +.purgem endp; \ +.endm; \ +ENTRY(atomic64_##op##_386); \ + CFI_STARTPROC; \ + LOCK v; + +#define ENDP endp + +#define RET \ + UNLOCK v; \ + ret + +#define RET_ENDP \ + RET; \ + ENDP + +#define v %ecx +BEGIN(read) + movl (v), %eax + movl 4(v), %edx +RET_ENDP +#undef v + +#define v %esi +BEGIN(set) + movl %ebx, (v) + movl %ecx, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(xchg) + movl (v), %eax + movl 4(v), %edx + movl %ebx, (v) + movl %ecx, 4(v) +RET_ENDP +#undef v + +#define v %ecx +BEGIN(add) + addl %eax, (v) + adcl %edx, 4(v) +RET_ENDP +#undef v + +#define v %ecx +BEGIN(add_return) + addl (v), %eax + adcl 4(v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_ENDP +#undef v + +#define v %ecx +BEGIN(sub) + subl %eax, (v) + sbbl %edx, 4(v) +RET_ENDP +#undef v + +#define v %ecx +BEGIN(sub_return) + negl %edx + negl %eax + sbbl $0, %edx + addl (v), %eax + adcl 4(v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(inc) + addl $1, (v) + adcl $0, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(inc_return) + movl (v), %eax + movl 4(v), %edx + addl $1, %eax + adcl $0, %edx + movl %eax, (v) + movl %edx, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(dec) + subl $1, (v) + sbbl $0, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(dec_return) + movl (v), %eax + movl 4(v), %edx + subl $1, %eax + sbbl $0, %edx + movl %eax, (v) + movl %edx, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(add_unless) + addl %eax, %ecx + adcl %edx, %edi + addl (v), %eax + adcl 4(v), %edx + cmpl %eax, %ecx + je 3f +1: + movl %eax, (v) + movl %edx, 4(v) + movl $1, %eax +2: + RET +3: + cmpl %edx, %edi + jne 1b + xorl %eax, %eax + jmp 2b +ENDP +#undef v + +#define v %esi +BEGIN(inc_not_zero) + movl (v), %eax + movl 4(v), %edx + testl %eax, %eax + je 3f +1: + addl $1, %eax + adcl $0, %edx + movl %eax, (v) + movl %edx, 4(v) + movl $1, %eax +2: + RET +3: + testl %edx, %edx + jne 1b + jmp 2b +ENDP +#undef v + +#define v %esi +BEGIN(dec_if_positive) + movl (v), %eax + movl 4(v), %edx + subl $1, %eax + sbbl $0, %edx + js 1f + movl %eax, (v) + movl %edx, 4(v) +1: +RET_ENDP +#undef v diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S new file mode 100644 index 00000000000..f5cc9eb1d51 --- /dev/null +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -0,0 +1,215 @@ +/* + * atomic64_t for 586+ + * + * Copyright © 2010 Luca Barbieri + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/dwarf2.h> + +.macro SAVE reg + pushl_cfi %\reg + CFI_REL_OFFSET \reg, 0 +.endm + +.macro RESTORE reg + popl_cfi %\reg + CFI_RESTORE \reg +.endm + +.macro read64 reg + movl %ebx, %eax + movl %ecx, %edx +/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */ + LOCK_PREFIX + cmpxchg8b (\reg) +.endm + +ENTRY(atomic64_read_cx8) + CFI_STARTPROC + + read64 %ecx + ret + CFI_ENDPROC +ENDPROC(atomic64_read_cx8) + +ENTRY(atomic64_set_cx8) + CFI_STARTPROC + +1: +/* we don't need LOCK_PREFIX since aligned 64-bit writes + * are atomic on 586 and newer */ + cmpxchg8b (%esi) + jne 1b + + ret + CFI_ENDPROC +ENDPROC(atomic64_set_cx8) + +ENTRY(atomic64_xchg_cx8) + CFI_STARTPROC + +1: + LOCK_PREFIX + cmpxchg8b (%esi) + jne 1b + + ret + CFI_ENDPROC +ENDPROC(atomic64_xchg_cx8) + +.macro addsub_return func ins insc +ENTRY(atomic64_\func\()_return_cx8) + CFI_STARTPROC + SAVE ebp + SAVE ebx + SAVE esi + SAVE edi + + movl %eax, %esi + movl %edx, %edi + movl %ecx, %ebp + + read64 %ecx +1: + movl %eax, %ebx + movl %edx, %ecx + \ins\()l %esi, %ebx + \insc\()l %edi, %ecx + LOCK_PREFIX + cmpxchg8b (%ebp) + jne 1b + +10: + movl %ebx, %eax + movl %ecx, %edx + RESTORE edi + RESTORE esi + RESTORE ebx + RESTORE ebp + ret + CFI_ENDPROC +ENDPROC(atomic64_\func\()_return_cx8) +.endm + +addsub_return add add adc +addsub_return sub sub sbb + +.macro incdec_return func ins insc +ENTRY(atomic64_\func\()_return_cx8) + CFI_STARTPROC + SAVE ebx + + read64 %esi +1: + movl %eax, %ebx + movl %edx, %ecx + \ins\()l $1, %ebx + \insc\()l $0, %ecx + LOCK_PREFIX + cmpxchg8b (%esi) + jne 1b + +10: + movl %ebx, %eax + movl %ecx, %edx + RESTORE ebx + ret + CFI_ENDPROC +ENDPROC(atomic64_\func\()_return_cx8) +.endm + +incdec_return inc add adc +incdec_return dec sub sbb + +ENTRY(atomic64_dec_if_positive_cx8) + CFI_STARTPROC + SAVE ebx + + read64 %esi +1: + movl %eax, %ebx + movl %edx, %ecx + subl $1, %ebx + sbb $0, %ecx + js 2f + LOCK_PREFIX + cmpxchg8b (%esi) + jne 1b + +2: + movl %ebx, %eax + movl %ecx, %edx + RESTORE ebx + ret + CFI_ENDPROC +ENDPROC(atomic64_dec_if_positive_cx8) + +ENTRY(atomic64_add_unless_cx8) + CFI_STARTPROC + SAVE ebp + SAVE ebx +/* these just push these two parameters on the stack */ + SAVE edi + SAVE ecx + + movl %eax, %ebp + movl %edx, %edi + + read64 %esi +1: + cmpl %eax, 0(%esp) + je 4f +2: + movl %eax, %ebx + movl %edx, %ecx + addl %ebp, %ebx + adcl %edi, %ecx + LOCK_PREFIX + cmpxchg8b (%esi) + jne 1b + + movl $1, %eax +3: + addl $8, %esp + CFI_ADJUST_CFA_OFFSET -8 + RESTORE ebx + RESTORE ebp + ret +4: + cmpl %edx, 4(%esp) + jne 2b + xorl %eax, %eax + jmp 3b + CFI_ENDPROC +ENDPROC(atomic64_add_unless_cx8) + +ENTRY(atomic64_inc_not_zero_cx8) + CFI_STARTPROC + SAVE ebx + + read64 %esi +1: + movl %eax, %ecx + orl %edx, %ecx + jz 3f + movl %eax, %ebx + xorl %ecx, %ecx + addl $1, %ebx + adcl %edx, %ecx + LOCK_PREFIX + cmpxchg8b (%esi) + jne 1b + + movl $1, %eax +3: + RESTORE ebx + ret + CFI_ENDPROC +ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c deleted file mode 100644 index b6544045985..00000000000 --- a/arch/x86/lib/bitops_32.c +++ /dev/null @@ -1,70 +0,0 @@ -#include <linux/bitops.h> -#include <linux/module.h> - -/** - * find_next_bit - find the next set bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -int find_next_bit(const unsigned long *addr, int size, int offset) -{ - const unsigned long *p = addr + (offset >> 5); - int set = 0, bit = offset & 31, res; - - if (bit) { - /* - * Look for nonzero in the first 32 bits: - */ - __asm__("bsfl %1,%0\n\t" - "jne 1f\n\t" - "movl $32, %0\n" - "1:" - : "=r" (set) - : "r" (*p >> bit)); - if (set < (32 - bit)) - return set + offset; - set = 32 - bit; - p++; - } - /* - * No set bit yet, search remaining full words for a bit - */ - res = find_first_bit (p, size - 32 * (p - addr)); - return (offset + set + res); -} -EXPORT_SYMBOL(find_next_bit); - -/** - * find_next_zero_bit - find the first zero bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -int find_next_zero_bit(const unsigned long *addr, int size, int offset) -{ - const unsigned long *p = addr + (offset >> 5); - int set = 0, bit = offset & 31, res; - - if (bit) { - /* - * Look for zero in the first 32 bits. - */ - __asm__("bsfl %1,%0\n\t" - "jne 1f\n\t" - "movl $32, %0\n" - "1:" - : "=r" (set) - : "r" (~(*p >> bit))); - if (set < (32 - bit)) - return set + offset; - set = 32 - bit; - p++; - } - /* - * No zero yet, search remaining full bytes for a zero - */ - res = find_first_zero_bit(p, size - 32 * (p - addr)); - return (offset + set + res); -} -EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c deleted file mode 100644 index 0e8f491e6cc..00000000000 --- a/arch/x86/lib/bitops_64.c +++ /dev/null @@ -1,175 +0,0 @@ -#include <linux/bitops.h> - -#undef find_first_zero_bit -#undef find_next_zero_bit -#undef find_first_bit -#undef find_next_bit - -static inline long -__find_first_zero_bit(const unsigned long * addr, unsigned long size) -{ - long d0, d1, d2; - long res; - - /* - * We must test the size in words, not in bits, because - * otherwise incoming sizes in the range -63..-1 will not run - * any scasq instructions, and then the flags used by the je - * instruction will have whatever random value was in place - * before. Nobody should call us like that, but - * find_next_zero_bit() does when offset and size are at the - * same word and it fails to find a zero itself. - */ - size += 63; - size >>= 6; - if (!size) - return 0; - asm volatile( - " repe; scasq\n" - " je 1f\n" - " xorq -8(%%rdi),%%rax\n" - " subq $8,%%rdi\n" - " bsfq %%rax,%%rdx\n" - "1: subq %[addr],%%rdi\n" - " shlq $3,%%rdi\n" - " addq %%rdi,%%rdx" - :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) - :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL), - [addr] "S" (addr) : "memory"); - /* - * Any register would do for [addr] above, but GCC tends to - * prefer rbx over rsi, even though rsi is readily available - * and doesn't have to be saved. - */ - return res; -} - -/** - * find_first_zero_bit - find the first zero bit in a memory region - * @addr: The address to start the search at - * @size: The maximum size to search - * - * Returns the bit-number of the first zero bit, not the number of the byte - * containing a bit. - */ -long find_first_zero_bit(const unsigned long * addr, unsigned long size) -{ - return __find_first_zero_bit (addr, size); -} - -/** - * find_next_zero_bit - find the next zero bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -long find_next_zero_bit (const unsigned long * addr, long size, long offset) -{ - const unsigned long * p = addr + (offset >> 6); - unsigned long set = 0; - unsigned long res, bit = offset&63; - - if (bit) { - /* - * Look for zero in first word - */ - asm("bsfq %1,%0\n\t" - "cmoveq %2,%0" - : "=r" (set) - : "r" (~(*p >> bit)), "r"(64L)); - if (set < (64 - bit)) - return set + offset; - set = 64 - bit; - p++; - } - /* - * No zero yet, search remaining full words for a zero - */ - res = __find_first_zero_bit (p, size - 64 * (p - addr)); - - return (offset + set + res); -} - -static inline long -__find_first_bit(const unsigned long * addr, unsigned long size) -{ - long d0, d1; - long res; - - /* - * We must test the size in words, not in bits, because - * otherwise incoming sizes in the range -63..-1 will not run - * any scasq instructions, and then the flags used by the jz - * instruction will have whatever random value was in place - * before. Nobody should call us like that, but - * find_next_bit() does when offset and size are at the same - * word and it fails to find a one itself. - */ - size += 63; - size >>= 6; - if (!size) - return 0; - asm volatile( - " repe; scasq\n" - " jz 1f\n" - " subq $8,%%rdi\n" - " bsfq (%%rdi),%%rax\n" - "1: subq %[addr],%%rdi\n" - " shlq $3,%%rdi\n" - " addq %%rdi,%%rax" - :"=a" (res), "=&c" (d0), "=&D" (d1) - :"0" (0ULL), "1" (size), "2" (addr), - [addr] "r" (addr) : "memory"); - return res; -} - -/** - * find_first_bit - find the first set bit in a memory region - * @addr: The address to start the search at - * @size: The maximum size to search - * - * Returns the bit-number of the first set bit, not the number of the byte - * containing a bit. - */ -long find_first_bit(const unsigned long * addr, unsigned long size) -{ - return __find_first_bit(addr,size); -} - -/** - * find_next_bit - find the first set bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -long find_next_bit(const unsigned long * addr, long size, long offset) -{ - const unsigned long * p = addr + (offset >> 6); - unsigned long set = 0, bit = offset & 63, res; - - if (bit) { - /* - * Look for nonzero in the first 64 bits: - */ - asm("bsfq %1,%0\n\t" - "cmoveq %2,%0\n\t" - : "=r" (set) - : "r" (*p >> bit), "r" (64L)); - if (set < (64 - bit)) - return set + offset; - set = 64 - bit; - p++; - } - /* - * No set bit yet, search remaining full words for a bit - */ - res = __find_first_bit (p, size - 64 * (p - addr)); - return (offset + set + res); -} - -#include <linux/module.h> - -EXPORT_SYMBOL(find_next_bit); -EXPORT_SYMBOL(find_first_bit); -EXPORT_SYMBOL(find_first_zero_bit); -EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c new file mode 100644 index 00000000000..a3c66887503 --- /dev/null +++ b/arch/x86/lib/cache-smp.c @@ -0,0 +1,19 @@ +#include <linux/smp.h> +#include <linux/module.h> + +static void __wbinvd(void *dummy) +{ + wbinvd(); +} + +void wbinvd_on_cpu(int cpu) +{ + smp_call_function_single(cpu, __wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_cpu); + +int wbinvd_on_all_cpus(void) +{ + return on_each_cpu(__wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_all_cpus); diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index adbccd0bbb7..e78b8eee661 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -28,6 +28,7 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/errno.h> +#include <asm/asm.h> /* * computes a partial checksum, e.g. for TCP/UDP fragments @@ -50,11 +51,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ ENTRY(csum_partial) CFI_STARTPROC - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len @@ -62,7 +61,7 @@ ENTRY(csum_partial) testl $3, %esi # Check alignment. jz 2f # Jump if alignment is ok. testl $1, %esi # Check alignment. - jz 10f # Jump if alignment is boundary of 2bytes. + jz 10f # Jump if alignment is boundary of 2 bytes. # buf is odd dec %ecx @@ -132,11 +131,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi ret CFI_ENDPROC @@ -148,11 +145,9 @@ ENDPROC(csum_partial) ENTRY(csum_partial) CFI_STARTPROC - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len @@ -260,11 +255,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi ret CFI_ENDPROC @@ -290,15 +283,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define SRC(y...) \ 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6001f ; \ - .previous + _ASM_EXTABLE(9999b, 6001f) #define DST(y...) \ 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6002f ; \ - .previous + _ASM_EXTABLE(9999b, 6002f) #ifndef CONFIG_X86_USE_PPRO_CHECKSUM @@ -309,14 +298,11 @@ ENTRY(csum_partial_copy_generic) CFI_STARTPROC subl $4,%esp CFI_ADJUST_CFA_OFFSET 4 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len @@ -426,17 +412,13 @@ DST( movb %cl, (%edi) ) .previous - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ecx # equivalent to addl $4,%esp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx # equivalent to addl $4,%esp ret CFI_ENDPROC ENDPROC(csum_partial_copy_generic) @@ -459,14 +441,11 @@ ENDPROC(csum_partial_copy_generic) ENTRY(csum_partial_copy_generic) CFI_STARTPROC - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst @@ -527,14 +506,11 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx ret CFI_ENDPROC diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 9a10a78bb4a..f2145cfa12a 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,19 +1,28 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * Zero a page. * rdi page */ - ALIGN -clear_page_c: +ENTRY(clear_page_c) CFI_STARTPROC movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page) +ENDPROC(clear_page_c) + +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) ENTRY(clear_page) CFI_STARTPROC @@ -39,21 +48,26 @@ ENTRY(clear_page) .Lclear_page_end: ENDPROC(clear_page) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ + /* + * Some CPUs support enhanced REP MOVSB/STOSB instructions. + * It is recommended to use this when possible. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original function. + * + */ #include <asm/cpufeature.h> .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp <disp8> */ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: +2: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ +3: .previous .section .altinstructions,"a" - .align 8 - .quad clear_page - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lclear_page_end - clear_page - .byte 2b - 1b + altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ + .Lclear_page_end-clear_page, 2b-1b + altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ + .Lclear_page_end-clear_page,3b-2b .previous diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c new file mode 100644 index 00000000000..422db000d72 --- /dev/null +++ b/arch/x86/lib/cmdline.c @@ -0,0 +1,84 @@ +/* + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * Misc librarized functions for cmdline poking. + */ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <asm/setup.h> + +static inline int myisspace(u8 c) +{ + return c <= ' '; /* Close enough approximation */ +} + +/** + * Find a boolean option (like quiet,noapic,nosmp....) + * + * @cmdline: the cmdline string + * @option: option string to look for + * + * Returns the position of that @option (starts counting with 1) + * or 0 on not found. + */ +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + char c; + int len, pos = 0, wstart = 0; + const char *opptr = NULL; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); + if (!len) + return 0; + + while (len--) { + c = *(char *)cmdline++; + pos++; + + switch (state) { + case st_wordstart: + if (!c) + return 0; + else if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + wstart = pos; + /* fall through */ + + case st_wordcmp: + if (!*opptr) + if (!c || myisspace(c)) + return wstart; + else + state = st_wordskip; + else if (!c) + return 0; + else if (c != *opptr++) + state = st_wordskip; + else if (!len) /* last word and is matching */ + return wstart; + break; + + case st_wordskip: + if (!c) + return 0; + else if (myisspace(c)) + state = st_wordstart; + break; + } + } + + return 0; /* Buffer overrun */ +} diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S new file mode 100644 index 00000000000..1e572c507d0 --- /dev/null +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -0,0 +1,65 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + +#ifdef CONFIG_SMP +#define SEG_PREFIX %gs: +#else +#define SEG_PREFIX +#endif + +.text + +/* + * Inputs: + * %rsi : memory location to compare + * %rax : low 64 bits of old value + * %rdx : high 64 bits of old value + * %rbx : low 64 bits of new value + * %rcx : high 64 bits of new value + * %al : Operation successful + */ +ENTRY(this_cpu_cmpxchg16b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not +# via the ZF. Caller will access %al to get result. +# +# Note that this is only useful for a cpuops operation. Meaning that we +# do *not* have a fully atomic operation but just an operation that is +# *atomic* on a single cpu (as provided by the this_cpu_xx class of +# macros). +# +this_cpu_cmpxchg16b_emu: + pushf + cli + + cmpq SEG_PREFIX(%rsi), %rax + jne not_same + cmpq SEG_PREFIX 8(%rsi), %rdx + jne not_same + + movq %rbx, SEG_PREFIX(%rsi) + movq %rcx, SEG_PREFIX 8(%rsi) + + popf + mov $1, %al + ret + + not_same: + popf + xor %al,%al + ret + +CFI_ENDPROC + +ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S new file mode 100644 index 00000000000..828cb710dec --- /dev/null +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -0,0 +1,57 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ + +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + + +.text + +/* + * Inputs: + * %esi : memory location to compare + * %eax : low 32 bits of old value + * %edx : high 32 bits of old value + * %ebx : low 32 bits of new value + * %ecx : high 32 bits of new value + */ +ENTRY(cmpxchg8b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg8b (%esi)' on UP except we don't +# set the whole ZF thing (caller will just compare +# eax:edx with the expected value) +# +cmpxchg8b_emu: + pushfl + cli + + cmpl (%esi), %eax + jne not_same + cmpl 4(%esi), %edx + jne half_same + + movl %ebx, (%esi) + movl %ecx, 4(%esi) + + popfl + ret + + not_same: + movl (%esi), %eax + half_same: + movl 4(%esi), %edx + + popfl + ret + +CFI_ENDPROC +ENDPROC(cmpxchg8b_emu) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 727a5d46d2f..176cca67212 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,98 +2,93 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/alternative-asm.h> ALIGN -copy_page_c: +copy_page_rep: CFI_STARTPROC - movl $4096/8,%ecx - rep movsq + movl $4096/8, %ecx + rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_c) +ENDPROC(copy_page_rep) -/* Don't use streaming store because it's better when the target - ends up in cache. */ - -/* Could vary the prefetch distance based on SMP/UP */ +/* + * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. + * Could vary the prefetch distance based on SMP/UP. +*/ ENTRY(copy_page) CFI_STARTPROC - subq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET 3*8 - movq %rbx,(%rsp) + subq $2*8, %rsp + CFI_ADJUST_CFA_OFFSET 2*8 + movq %rbx, (%rsp) CFI_REL_OFFSET rbx, 0 - movq %r12,1*8(%rsp) + movq %r12, 1*8(%rsp) CFI_REL_OFFSET r12, 1*8 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13, 2*8 - movl $(4096/64)-5,%ecx + movl $(4096/64)-5, %ecx .p2align 4 .Loop64: - dec %rcx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 + dec %rcx + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 prefetcht0 5*64(%rsi) - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) + movq %rax, 0x8*0(%rdi) + movq %rbx, 0x8*1(%rdi) + movq %rdx, 0x8*2(%rdi) + movq %r8, 0x8*3(%rdi) + movq %r9, 0x8*4(%rdi) + movq %r10, 0x8*5(%rdi) + movq %r11, 0x8*6(%rdi) + movq %r12, 0x8*7(%rdi) - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi - jnz .Loop64 + jnz .Loop64 - movl $5,%ecx + movl $5, %ecx .p2align 4 .Loop2: - decl %ecx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - + decl %ecx + + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + movq %rax, 0x8*0(%rdi) + movq %rbx, 0x8*1(%rdi) + movq %rdx, 0x8*2(%rdi) + movq %r8, 0x8*3(%rdi) + movq %r9, 0x8*4(%rdi) + movq %r10, 0x8*5(%rdi) + movq %r11, 0x8*6(%rdi) + movq %r12, 0x8*7(%rdi) + + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi jnz .Loop2 - movq (%rsp),%rbx + movq (%rsp), %rbx CFI_RESTORE rbx - movq 1*8(%rsp),%r12 + movq 1*8(%rsp), %r12 CFI_RESTORE r12 - movq 2*8(%rsp),%r13 - CFI_RESTORE r13 - addq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET -3*8 + addq $2*8, %rsp + CFI_ADJUST_CFA_OFFSET -2*8 ret .Lcopy_page_end: CFI_ENDPROC @@ -106,14 +101,10 @@ ENDPROC(copy_page) .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ + .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ 2: .previous .section .altinstructions,"a" - .align 8 - .quad copy_page - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lcopy_page_end - copy_page - .byte 2b - 1b + altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ + .Lcopy_page_end-copy_page, 2b-1b .previous diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 70bebd31040..dee945d5559 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -1,8 +1,10 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. +/* + * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com> + * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ + * + * Functions to copy from and to user space. + */ #include <linux/linkage.h> #include <asm/dwarf2.h> @@ -13,67 +15,94 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/cpufeature.h> +#include <asm/alternative-asm.h> +#include <asm/asm.h> +#include <asm/smap.h> - .macro ALTERNATIVE_JUMP feature,orig,alt +/* + * By placing feature2 after feature1 in altinstructions section, we logically + * implement: + * If CPU has feature2, jmp to alt2 is used + * else if CPU has feature1, jmp to alt1 is used + * else jmp to orig is used. + */ + .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: .byte 0xe9 /* 32bit jump */ .long \orig-1f /* by default jump to orig */ 1: .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt1-1b /* offset */ /* or alternatively to alt1 */ +3: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt2-1b /* offset */ /* or alternatively to alt2 */ .previous + .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .byte \feature /* when feature is set */ - .byte 5 - .byte 5 + altinstruction_entry 0b,2b,\feature1,5,5 + altinstruction_entry 0b,3b,\feature2,5,5 .previous .endm -/* Standard copy_to_user with segment limit checking */ -ENTRY(copy_to_user) + .macro ALIGN_DESTINATION +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) +#endif + .endm + +/* Standard copy_to_user with segment limit checking */ +ENTRY(_copy_to_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx - jc bad_to_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_to_user - xorl %eax,%eax /* clear zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + jc bad_to_user + cmpq TI_addr_limit(%rax),%rcx + ja bad_to_user + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC +ENDPROC(_copy_to_user) -ENTRY(copy_user_generic) - CFI_STARTPROC - movl $1,%ecx /* set zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -ENTRY(__copy_from_user_inatomic) - CFI_STARTPROC - xorl %ecx,%ecx /* clear zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -/* Standard copy_from_user with segment limit checking */ -ENTRY(copy_from_user) +/* Standard copy_from_user with segment limit checking */ +ENTRY(_copy_from_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx - jc bad_from_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_from_user - movl $1,%ecx /* set zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + jc bad_from_user + cmpq TI_addr_limit(%rax),%rcx + ja bad_from_user + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC -ENDPROC(copy_from_user) - +ENDPROC(_copy_from_user) + .section .fixup,"ax" /* must zero dest */ +ENTRY(bad_from_user) bad_from_user: CFI_STARTPROC movl %edx,%ecx @@ -81,274 +110,185 @@ bad_from_user: rep stosb bad_to_user: - movl %edx,%eax + movl %edx,%eax ret CFI_ENDPROC -END(bad_from_user) +ENDPROC(bad_from_user) .previous - - + /* * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro code for rep movsq - * - * Input: + * This version is for CPUs like P4 that don't have efficient micro + * code for rep movsq + * + * Input: * rdi destination * rsi source * rdx count - * ecx zero flag -- if true zero destination on error * - * Output: + * Output: * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_unrolled) CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - xorl %eax,%eax /*zero for the exception handler */ - -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movq %r11,(%rdi) -.Ld2: movq %r8,1*8(%rdi) -.Ld3: movq %r9,2*8(%rdi) -.Ld4: movq %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movq %r11,4*8(%rdi) -.Ld6: movq %r8,5*8(%rdi) -.Ld7: movq %r9,6*8(%rdi) -.Ld8: movq %r10,7*8(%rdi) - - decq %rdx - + ASM_STAC + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movq %r8,(%rdi) +6: movq %r9,1*8(%rdi) +7: movq %r10,2*8(%rdi) +8: movq %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movq %r8,4*8(%rdi) +14: movq %r9,5*8(%rdi) +15: movq %r10,6*8(%rdi) +16: movq %r11,7*8(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movq %r8,(%rdi) decl %ecx - leaq 8(%rdi),%rdi + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 +19: movq %r8,(%rdi) leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi - incq %rsi + leaq 8(%rdi),%rdi decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret - CFI_RESTORE_STATE - -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) + jnz 18b +20: andl %edx,%edx + jz 23f + movl %edx,%ecx +21: movb (%rsi),%al +22: movb %al,(%rdi) incq %rsi incq %rdi decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif + jnz 21b +23: xor %eax,%eax + ASM_CLAC + ret - /* table sorted by exception address */ - .section __ex_table,"a" - .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero + .section .fixup,"ax" +30: shll $6,%ecx + addl %ecx,%edx + jmp 60f +40: leal (%rdx,%rcx,8),%edx + jmp 60f +50: movl %ecx,%edx +60: jmp copy_user_handle_tail /* ecx is zerorest also */ .previous - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende + _ASM_EXTABLE(1b,30b) + _ASM_EXTABLE(2b,30b) + _ASM_EXTABLE(3b,30b) + _ASM_EXTABLE(4b,30b) + _ASM_EXTABLE(5b,30b) + _ASM_EXTABLE(6b,30b) + _ASM_EXTABLE(7b,30b) + _ASM_EXTABLE(8b,30b) + _ASM_EXTABLE(9b,30b) + _ASM_EXTABLE(10b,30b) + _ASM_EXTABLE(11b,30b) + _ASM_EXTABLE(12b,30b) + _ASM_EXTABLE(13b,30b) + _ASM_EXTABLE(14b,30b) + _ASM_EXTABLE(15b,30b) + _ASM_EXTABLE(16b,30b) + _ASM_EXTABLE(18b,40b) + _ASM_EXTABLE(19b,40b) + _ASM_EXTABLE(21b,50b) + _ASM_EXTABLE(22b,50b) CFI_ENDPROC -ENDPROC(copy_user_generic) - +ENDPROC(copy_user_generic_unrolled) - /* Some CPUs run faster using the string copy instructions. - This is also a lot simpler. Use them when possible. - Patch in jmps to this code instead of copying it fully - to avoid unwanted aliasing in the exception tables. */ - - /* rdi destination - * rsi source - * rdx count - * ecx zero flag - * - * Output: - * eax uncopied bytes or 0 if successfull. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. - */ +/* Some CPUs run faster using the string copy instructions. + * This is also a lot simpler. Use them when possible. + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ ENTRY(copy_user_generic_string) CFI_STARTPROC - movl %ecx,%r8d /* save zero flag */ + ASM_STAC + cmpl $8,%edx + jb 2f /* less than 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION movl %edx,%ecx shrl $3,%ecx - andl $7,%edx - jz 10f -1: rep - movsq - movl %edx,%ecx -2: rep + andl $7,%edx +1: rep + movsq +2: movl %edx,%ecx +3: rep movsb -9: movl %ecx,%eax + xorl %eax,%eax + ASM_CLAC ret - /* multiple of 8 byte */ -10: rep - movsq - xor %eax,%eax - ret + .section .fixup,"ax" +11: leal (%rdx,%rcx,8),%ecx +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous - /* exception handling */ -3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ - jmp 6f -5: movl %ecx,%eax /* exception on byte loop */ - /* eax: left over bytes */ -6: testl %r8d,%r8d /* zero flag set? */ - jz 7f - movl %eax,%ecx /* initialize x86 loop counter */ - push %rax - xorl %eax,%eax -8: rep - stosb /* zero the rest */ -11: pop %rax -7: ret + _ASM_EXTABLE(1b,11b) + _ASM_EXTABLE(3b,12b) CFI_ENDPROC -END(copy_user_generic_c) +ENDPROC(copy_user_generic_string) + +/* + * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. + * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(copy_user_enhanced_fast_string) + CFI_STARTPROC + ASM_STAC + movl %edx,%ecx +1: rep + movsb + xorl %eax,%eax + ASM_CLAC + ret - .section __ex_table,"a" - .quad 1b,3b - .quad 2b,5b - .quad 8b,11b - .quad 10b,3b + .section .fixup,"ax" +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail .previous + + _ASM_EXTABLE(1b,12b) + CFI_ENDPROC +ENDPROC(copy_user_enhanced_fast_string) diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index 5196762b3b0..6a4f43c2d9e 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -1,4 +1,6 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. +/* + * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com> + * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU Public License v2. * * Functions to copy from and to user space. @@ -12,207 +14,123 @@ #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> -#include <asm/cpufeature.h> - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. - * - * Input: - * rdi destination - * rsi source - * rdx count - * rcx zero flag when 1 zero on exception - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -ENTRY(__copy_user_nocache) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx /* save zero flag */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - - xorl %eax,%eax /* zero for the exception handler */ +#include <asm/asm.h> +#include <asm/smap.h> + .macro ALIGN_DESTINATION #ifdef FIX_ALIGNMENT /* check for bad alignment of destination */ movl %edi,%ecx andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movnti %r11,(%rdi) -.Ld2: movnti %r8,1*8(%rdi) -.Ld3: movnti %r9,2*8(%rdi) -.Ld4: movnti %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movnti %r11,4*8(%rdi) -.Ld6: movnti %r8,5*8(%rdi) -.Ld7: movnti %r9,6*8(%rdi) -.Ld8: movnti %r10,7*8(%rdi) + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous - dec %rdx + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) +#endif + .endm +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + ASM_STAC + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movnti %r8,(%rdi) +6: movnti %r9,1*8(%rdi) +7: movnti %r10,2*8(%rdi) +8: movnti %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movnti %r8,4*8(%rdi) +14: movnti %r9,5*8(%rdi) +15: movnti %r10,6*8(%rdi) +16: movnti %r11,7*8(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movnti %r8,(%rdi) decl %ecx - leaq 8(%rdi),%rdi + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 +19: movnti %r8,(%rdi) leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: + leaq 8(%rdi),%rdi + decl %ecx + jnz 18b +20: andl %edx,%edx + jz 23f movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi +21: movb (%rsi),%al +22: movb %al,(%rdi) incq %rsi + incq %rdi decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE %rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx + jnz 21b +23: xorl %eax,%eax + ASM_CLAC sfence ret - CFI_RESTORE_STATE - -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif - /* table sorted by exception address */ - .section __ex_table,"a" - .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero + .section .fixup,"ax" +30: shll $6,%ecx + addl %ecx,%edx + jmp 60f +40: lea (%rdx,%rcx,8),%rdx + jmp 60f +50: movl %ecx,%edx +60: sfence + jmp copy_user_handle_tail .previous - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) /* zero flag set? */ - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende + _ASM_EXTABLE(1b,30b) + _ASM_EXTABLE(2b,30b) + _ASM_EXTABLE(3b,30b) + _ASM_EXTABLE(4b,30b) + _ASM_EXTABLE(5b,30b) + _ASM_EXTABLE(6b,30b) + _ASM_EXTABLE(7b,30b) + _ASM_EXTABLE(8b,30b) + _ASM_EXTABLE(9b,30b) + _ASM_EXTABLE(10b,30b) + _ASM_EXTABLE(11b,30b) + _ASM_EXTABLE(12b,30b) + _ASM_EXTABLE(13b,30b) + _ASM_EXTABLE(14b,30b) + _ASM_EXTABLE(15b,30b) + _ASM_EXTABLE(16b,30b) + _ASM_EXTABLE(18b,40b) + _ASM_EXTABLE(19b,40b) + _ASM_EXTABLE(21b,50b) + _ASM_EXTABLE(22b,50b) CFI_ENDPROC ENDPROC(__copy_user_nocache) - - diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index f0dba36578e..2419d5fefae 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -1,6 +1,6 @@ /* - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - * + * Copyright 2002, 2003 Andi Kleen, SuSE Labs. + * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. No warranty for anything given at all. @@ -8,85 +8,77 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/errno.h> +#include <asm/asm.h> /* * Checksum copy with exception handling. - * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the + * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the * destination is zeroed. - * + * * Input * rdi source * rsi destination * edx len (32bit) - * ecx sum (32bit) + * ecx sum (32bit) * r8 src_err_ptr (int) * r9 dst_err_ptr (int) * * Output * eax 64bit sum. undefined in case of exception. - * - * Wrappers need to take care of valid exception sum and zeroing. + * + * Wrappers need to take care of valid exception sum and zeroing. * They also should align source or destination to 8 bytes. */ .macro source 10: - .section __ex_table,"a" - .align 8 - .quad 10b,.Lbad_source - .previous + _ASM_EXTABLE(10b, .Lbad_source) .endm - + .macro dest 20: - .section __ex_table,"a" - .align 8 - .quad 20b,.Lbad_dest - .previous + _ASM_EXTABLE(20b, .Lbad_dest) .endm - + .macro ignore L=.Lignore 30: - .section __ex_table,"a" - .align 8 - .quad 30b,\L - .previous + _ASM_EXTABLE(30b, \L) .endm - - + + ENTRY(csum_partial_copy_generic) CFI_STARTPROC - cmpl $3*64,%edx - jle .Lignore + cmpl $3*64, %edx + jle .Lignore -.Lignore: - subq $7*8,%rsp +.Lignore: + subq $7*8, %rsp CFI_ADJUST_CFA_OFFSET 7*8 - movq %rbx,2*8(%rsp) + movq %rbx, 2*8(%rsp) CFI_REL_OFFSET rbx, 2*8 - movq %r12,3*8(%rsp) + movq %r12, 3*8(%rsp) CFI_REL_OFFSET r12, 3*8 - movq %r14,4*8(%rsp) + movq %r14, 4*8(%rsp) CFI_REL_OFFSET r14, 4*8 - movq %r13,5*8(%rsp) + movq %r13, 5*8(%rsp) CFI_REL_OFFSET r13, 5*8 - movq %rbp,6*8(%rsp) + movq %rbp, 6*8(%rsp) CFI_REL_OFFSET rbp, 6*8 - movq %r8,(%rsp) - movq %r9,1*8(%rsp) - - movl %ecx,%eax - movl %edx,%ecx + movq %r8, (%rsp) + movq %r9, 1*8(%rsp) - xorl %r9d,%r9d - movq %rcx,%r12 + movl %ecx, %eax + movl %edx, %ecx - shrq $6,%r12 - jz .Lhandle_tail /* < 64 */ + xorl %r9d, %r9d + movq %rcx, %r12 + + shrq $6, %r12 + jz .Lhandle_tail /* < 64 */ clc - + /* main loop. clear in 64 byte blocks */ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ /* r11: temp3, rdx: temp4, r12 loopcnt */ @@ -94,156 +86,156 @@ ENTRY(csum_partial_copy_generic) .p2align 4 .Lloop: source - movq (%rdi),%rbx + movq (%rdi), %rbx source - movq 8(%rdi),%r8 + movq 8(%rdi), %r8 source - movq 16(%rdi),%r11 + movq 16(%rdi), %r11 source - movq 24(%rdi),%rdx + movq 24(%rdi), %rdx source - movq 32(%rdi),%r10 + movq 32(%rdi), %r10 source - movq 40(%rdi),%rbp + movq 40(%rdi), %rbp source - movq 48(%rdi),%r14 + movq 48(%rdi), %r14 source - movq 56(%rdi),%r13 - + movq 56(%rdi), %r13 + ignore 2f prefetcht0 5*64(%rdi) -2: - adcq %rbx,%rax - adcq %r8,%rax - adcq %r11,%rax - adcq %rdx,%rax - adcq %r10,%rax - adcq %rbp,%rax - adcq %r14,%rax - adcq %r13,%rax +2: + adcq %rbx, %rax + adcq %r8, %rax + adcq %r11, %rax + adcq %rdx, %rax + adcq %r10, %rax + adcq %rbp, %rax + adcq %r14, %rax + adcq %r13, %rax decl %r12d - + dest - movq %rbx,(%rsi) + movq %rbx, (%rsi) dest - movq %r8,8(%rsi) + movq %r8, 8(%rsi) dest - movq %r11,16(%rsi) + movq %r11, 16(%rsi) dest - movq %rdx,24(%rsi) + movq %rdx, 24(%rsi) dest - movq %r10,32(%rsi) + movq %r10, 32(%rsi) dest - movq %rbp,40(%rsi) + movq %rbp, 40(%rsi) dest - movq %r14,48(%rsi) + movq %r14, 48(%rsi) dest - movq %r13,56(%rsi) - + movq %r13, 56(%rsi) + 3: - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - jnz .Lloop + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi - adcq %r9,%rax + jnz .Lloop - /* do last upto 56 bytes */ + adcq %r9, %rax + + /* do last up to 56 bytes */ .Lhandle_tail: /* ecx: count */ - movl %ecx,%r10d - andl $63,%ecx - shrl $3,%ecx - jz .Lfold + movl %ecx, %r10d + andl $63, %ecx + shrl $3, %ecx + jz .Lfold clc .p2align 4 -.Lloop_8: +.Lloop_8: source - movq (%rdi),%rbx - adcq %rbx,%rax + movq (%rdi), %rbx + adcq %rbx, %rax decl %ecx dest - movq %rbx,(%rsi) - leaq 8(%rsi),%rsi /* preserve carry */ - leaq 8(%rdi),%rdi + movq %rbx, (%rsi) + leaq 8(%rsi), %rsi /* preserve carry */ + leaq 8(%rdi), %rdi jnz .Lloop_8 - adcq %r9,%rax /* add in carry */ + adcq %r9, %rax /* add in carry */ .Lfold: /* reduce checksum to 32bits */ - movl %eax,%ebx - shrq $32,%rax - addl %ebx,%eax - adcl %r9d,%eax + movl %eax, %ebx + shrq $32, %rax + addl %ebx, %eax + adcl %r9d, %eax - /* do last upto 6 bytes */ + /* do last up to 6 bytes */ .Lhandle_7: - movl %r10d,%ecx - andl $7,%ecx - shrl $1,%ecx + movl %r10d, %ecx + andl $7, %ecx + shrl $1, %ecx jz .Lhandle_1 - movl $2,%edx - xorl %ebx,%ebx - clc + movl $2, %edx + xorl %ebx, %ebx + clc .p2align 4 -.Lloop_1: +.Lloop_1: source - movw (%rdi),%bx - adcl %ebx,%eax + movw (%rdi), %bx + adcl %ebx, %eax decl %ecx dest - movw %bx,(%rsi) - leaq 2(%rdi),%rdi - leaq 2(%rsi),%rsi + movw %bx, (%rsi) + leaq 2(%rdi), %rdi + leaq 2(%rsi), %rsi jnz .Lloop_1 - adcl %r9d,%eax /* add in carry */ - + adcl %r9d, %eax /* add in carry */ + /* handle last odd byte */ .Lhandle_1: - testl $1,%r10d + testl $1, %r10d jz .Lende - xorl %ebx,%ebx + xorl %ebx, %ebx source - movb (%rdi),%bl + movb (%rdi), %bl dest - movb %bl,(%rsi) - addl %ebx,%eax - adcl %r9d,%eax /* carry */ - + movb %bl, (%rsi) + addl %ebx, %eax + adcl %r9d, %eax /* carry */ + CFI_REMEMBER_STATE .Lende: - movq 2*8(%rsp),%rbx + movq 2*8(%rsp), %rbx CFI_RESTORE rbx - movq 3*8(%rsp),%r12 + movq 3*8(%rsp), %r12 CFI_RESTORE r12 - movq 4*8(%rsp),%r14 + movq 4*8(%rsp), %r14 CFI_RESTORE r14 - movq 5*8(%rsp),%r13 + movq 5*8(%rsp), %r13 CFI_RESTORE r13 - movq 6*8(%rsp),%rbp + movq 6*8(%rsp), %rbp CFI_RESTORE rbp - addq $7*8,%rsp + addq $7*8, %rsp CFI_ADJUST_CFA_OFFSET -7*8 ret CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: - movq (%rsp),%rax - testq %rax,%rax + movq (%rsp), %rax + testq %rax, %rax jz .Lende - movl $-EFAULT,(%rax) + movl $-EFAULT, (%rax) jmp .Lende - + .Lbad_dest: - movq 8(%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) + movq 8(%rsp), %rax + testq %rax, %rax + jz .Lende + movl $-EFAULT, (%rax) jmp .Lende CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index bc503f50690..9845371c5c3 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len) count64--; } - /* last upto 7 8byte blocks */ + /* last up to 7 8byte blocks */ count %= 8; while (count) { asm("addq %1,%0\n\t" @@ -136,8 +136,6 @@ __wsum csum_partial(const void *buff, int len, __wsum sum) (__force u32)sum); } -EXPORT_SYMBOL(csum_partial); - /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 459b58a8a15..7609e0e421e 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -6,6 +6,7 @@ */ #include <asm/checksum.h> #include <linux/module.h> +#include <asm/smap.h> /** * csum_partial_copy_from_user - Copy and checksum from user space. @@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst, len -= 2; } } + stac(); isum = csum_partial_copy_generic((__force const void *)src, dst, len, isum, errp, NULL); + clac(); if (unlikely(*errp)) goto out_err; @@ -82,6 +85,8 @@ __wsum csum_partial_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) { + __wsum ret; + might_sleep(); if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { @@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst, } *errp = 0; - return csum_partial_copy_generic(src, (void __force *)dst, - len, isum, NULL, errp); + stac(); + ret = csum_partial_copy_generic(src, (void __force *)dst, + len, isum, NULL, errp); + clac(); + return ret; } EXPORT_SYMBOL(csum_partial_copy_to_user); @@ -115,7 +123,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user); * @src: source address * @dst: destination address * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) + * @sum: initial sum that is added into the result (32bit unfolded) * * Returns an 32bit unfolded checksum of the buffer. */ diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay.c index 4535e6d147a..39d6a3db0b9 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay.c @@ -3,6 +3,7 @@ * * Copyright (C) 1993 Linus Torvalds * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com> * * The __delay function must _NOT_ be inlined as its execution time * depends wildly on alignment on many x86 processors. The additional @@ -15,7 +16,6 @@ #include <linux/timex.h> #include <linux/preempt.h> #include <linux/delay.h> -#include <linux/init.h> #include <asm/processor.h> #include <asm/delay.h> @@ -28,29 +28,61 @@ /* simple loop based delay: */ static void delay_loop(unsigned long loops) { - int d0; - - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); + asm volatile( + " test %0,%0 \n" + " jz 3f \n" + " jmp 1f \n" + + ".align 16 \n" + "1: jmp 2f \n" + + ".align 16 \n" + "2: dec %0 \n" + " jnz 2b \n" + "3: dec %0 \n" + + : /* we don't need output */ + :"a" (loops) + ); } /* TSC based delay: */ -static void delay_tsc(unsigned long loops) +static void delay_tsc(unsigned long __loops) { - unsigned long bclock, now; + u32 bclock, now, loops = __loops; + int cpu; - preempt_disable(); /* TSC's are per-cpu */ + preempt_disable(); + cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); - do { - rep_nop(); + for (;;) { + rdtsc_barrier(); rdtscl(now); - } while ((now-bclock) < loops); + if ((now - bclock) >= loops) + break; + + /* Allow RT tasks to run */ + preempt_enable(); + rep_nop(); + preempt_disable(); + + /* + * It is possible that we moved to another CPU, and + * since TSC's are per-cpu we need to calculate + * that. The delay must guarantee that we wait "at + * least" the amount of time. Being moved to another + * CPU could make the wait longer but we just need to + * make sure we waited long enough. Rebalance the + * counter for this CPU. + */ + if (unlikely(cpu != smp_processor_id())) { + loops -= (now - bclock); + cpu = smp_processor_id(); + rdtsc_barrier(); + rdtscl(bclock); + } + } preempt_enable(); } @@ -65,10 +97,10 @@ void use_tsc_delay(void) delay_fn = delay_tsc; } -int __devinit read_current_timer(unsigned long *timer_val) +int read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { - rdtscl(*timer_val); + rdtscll(*timer_val); return 0; } return -1; @@ -78,31 +110,30 @@ void __delay(unsigned long loops) { delay_fn(loops); } +EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { int d0; xloops *= 4; - __asm__("mull %0" + asm("mull %%edx" :"=d" (xloops), "=&a" (d0) :"1" (xloops), "0" - (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); + (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4))); __delay(++xloops); } +EXPORT_SYMBOL(__const_udelay); void __udelay(unsigned long usecs) { __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ } +EXPORT_SYMBOL(__udelay); void __ndelay(unsigned long nsecs) { __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } - -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); -EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c deleted file mode 100644 index bbc61051851..00000000000 --- a/arch/x86/lib/delay_64.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Precise Delay Loops for x86-64 - * - * Copyright (C) 1993 Linus Torvalds - * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> - * - * The __delay function must _NOT_ be inlined as its execution time - * depends wildly on alignment on many x86 processors. - */ - -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/timex.h> -#include <linux/preempt.h> -#include <linux/delay.h> -#include <linux/init.h> - -#include <asm/delay.h> -#include <asm/msr.h> - -#ifdef CONFIG_SMP -#include <asm/smp.h> -#endif - -int __devinit read_current_timer(unsigned long *timer_value) -{ - rdtscll(*timer_value); - return 0; -} - -void __delay(unsigned long loops) -{ - unsigned bclock, now; - - preempt_disable(); /* TSC's are pre-cpu */ - rdtscl(bclock); - do { - rep_nop(); - rdtscl(now); - } - while ((now-bclock) < loops); - preempt_enable(); -} -EXPORT_SYMBOL(__delay); - -inline void __const_udelay(unsigned long xloops) -{ - __delay(((xloops * HZ * - cpu_data(raw_smp_processor_id()).loops_per_jiffy) >> 32) + 1); -} -EXPORT_SYMBOL(__const_udelay); - -void __udelay(unsigned long usecs) -{ - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ -} -EXPORT_SYMBOL(__udelay); - -void __ndelay(unsigned long nsecs) -{ - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ -} -EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S new file mode 100644 index 00000000000..a4512359656 --- /dev/null +++ b/arch/x86/lib/getuser.S @@ -0,0 +1,140 @@ +/* + * __get_user functions. + * + * (C) Copyright 1998 Linus Torvalds + * (C) Copyright 2005 Andi Kleen + * (C) Copyright 2008 Glauber Costa + * + * These functions have a non-standard call interface + * to make them more efficient, especially as they + * return an error value in addition to the "real" + * return value. + */ + +/* + * __get_user_X + * + * Inputs: %[r|e]ax contains the address. + * + * Outputs: %[r|e]ax is error code (0 or -EFAULT) + * %[r|e]dx contains zero-extended value + * %ecx contains the high half for 32-bit __get_user_8 + * + * + * These functions should not modify any other registers, + * as they get called from within inline assembly. + */ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/page_types.h> +#include <asm/errno.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/asm.h> +#include <asm/smap.h> + + .text +ENTRY(__get_user_1) + CFI_STARTPROC + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user + ASM_STAC +1: movzbl (%_ASM_AX),%edx + xor %eax,%eax + ASM_CLAC + ret + CFI_ENDPROC +ENDPROC(__get_user_1) + +ENTRY(__get_user_2) + CFI_STARTPROC + add $1,%_ASM_AX + jc bad_get_user + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user + ASM_STAC +2: movzwl -1(%_ASM_AX),%edx + xor %eax,%eax + ASM_CLAC + ret + CFI_ENDPROC +ENDPROC(__get_user_2) + +ENTRY(__get_user_4) + CFI_STARTPROC + add $3,%_ASM_AX + jc bad_get_user + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user + ASM_STAC +3: movl -3(%_ASM_AX),%edx + xor %eax,%eax + ASM_CLAC + ret + CFI_ENDPROC +ENDPROC(__get_user_4) + +ENTRY(__get_user_8) + CFI_STARTPROC +#ifdef CONFIG_X86_64 + add $7,%_ASM_AX + jc bad_get_user + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user + ASM_STAC +4: movq -7(%_ASM_AX),%rdx + xor %eax,%eax + ASM_CLAC + ret +#else + add $7,%_ASM_AX + jc bad_get_user_8 + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user_8 + ASM_STAC +4: movl -7(%_ASM_AX),%edx +5: movl -3(%_ASM_AX),%ecx + xor %eax,%eax + ASM_CLAC + ret +#endif + CFI_ENDPROC +ENDPROC(__get_user_8) + + +bad_get_user: + CFI_STARTPROC + xor %edx,%edx + mov $(-EFAULT),%_ASM_AX + ASM_CLAC + ret + CFI_ENDPROC +END(bad_get_user) + +#ifdef CONFIG_X86_32 +bad_get_user_8: + CFI_STARTPROC + xor %edx,%edx + xor %ecx,%ecx + mov $(-EFAULT),%_ASM_AX + ASM_CLAC + ret + CFI_ENDPROC +END(bad_get_user_8) +#endif + + _ASM_EXTABLE(1b,bad_get_user) + _ASM_EXTABLE(2b,bad_get_user) + _ASM_EXTABLE(3b,bad_get_user) +#ifdef CONFIG_X86_64 + _ASM_EXTABLE(4b,bad_get_user) +#else + _ASM_EXTABLE(4b,bad_get_user_8) + _ASM_EXTABLE(5b,bad_get_user_8) +#endif diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S deleted file mode 100644 index 6d84b53f12a..00000000000 --- a/arch/x86/lib/getuser_32.S +++ /dev/null @@ -1,78 +0,0 @@ -/* - * __get_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/thread_info.h> - - -/* - * __get_user_X - * - * Inputs: %eax contains the address - * - * Outputs: %eax is error code (0 or -EFAULT) - * %edx contains zero-extended value - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. - */ - -.text -ENTRY(__get_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%edx) - cmpl TI_addr_limit(%edx),%eax - jae bad_get_user -1: movzbl (%eax),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_1) - -ENTRY(__get_user_2) - CFI_STARTPROC - addl $1,%eax - jc bad_get_user - GET_THREAD_INFO(%edx) - cmpl TI_addr_limit(%edx),%eax - jae bad_get_user -2: movzwl -1(%eax),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_2) - -ENTRY(__get_user_4) - CFI_STARTPROC - addl $3,%eax - jc bad_get_user - GET_THREAD_INFO(%edx) - cmpl TI_addr_limit(%edx),%eax - jae bad_get_user -3: movl -3(%eax),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_4) - -bad_get_user: - CFI_STARTPROC - xorl %edx,%edx - movl $-14,%eax - ret - CFI_ENDPROC -END(bad_get_user) - -.section __ex_table,"a" - .long 1b,bad_get_user - .long 2b,bad_get_user - .long 3b,bad_get_user -.previous diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S deleted file mode 100644 index 5448876261f..00000000000 --- a/arch/x86/lib/getuser_64.S +++ /dev/null @@ -1,109 +0,0 @@ -/* - * __get_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * (C) Copyright 2005 Andi Kleen - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ - -/* - * __get_user_X - * - * Inputs: %rcx contains the address. - * The register is modified, but all changes are undone - * before returning because the C code doesn't know about it. - * - * Outputs: %rax is error code (0 or -EFAULT) - * %rdx contains zero-extended value - * - * %r8 is destroyed. - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/page.h> -#include <asm/errno.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> - - .text -ENTRY(__get_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - cmpq threadinfo_addr_limit(%r8),%rcx - jae bad_get_user -1: movzb (%rcx),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_1) - -ENTRY(__get_user_2) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $1,%rcx - jc 20f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 20f - decq %rcx -2: movzwl (%rcx),%edx - xorl %eax,%eax - ret -20: decq %rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_2) - -ENTRY(__get_user_4) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $3,%rcx - jc 30f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 30f - subq $3,%rcx -3: movl (%rcx),%edx - xorl %eax,%eax - ret -30: subq $3,%rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_4) - -ENTRY(__get_user_8) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $7,%rcx - jc 40f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 40f - subq $7,%rcx -4: movq (%rcx),%rdx - xorl %eax,%eax - ret -40: subq $7,%rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_8) - -bad_get_user: - CFI_STARTPROC - xorl %edx,%edx - movq $(-EFAULT),%rax - ret - CFI_ENDPROC -END(bad_get_user) - -.section __ex_table,"a" - .quad 1b,bad_get_user - .quad 2b,bad_get_user - .quad 3b,bad_get_user - .quad 4b,bad_get_user -.previous diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c new file mode 100644 index 00000000000..ff4fa51a5b1 --- /dev/null +++ b/arch/x86/lib/hash.c @@ -0,0 +1,92 @@ +/* + * Some portions derived from code covered by the following notice: + * + * Copyright (c) 2010-2013 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/hash.h> +#include <linux/init.h> + +#include <asm/processor.h> +#include <asm/cpufeature.h> +#include <asm/hash.h> + +static inline u32 crc32_u32(u32 crc, u32 val) +{ +#ifdef CONFIG_AS_CRC32 + asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val)); +#else + asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val)); +#endif + return crc; +} + +static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed) +{ + const u32 *p32 = (const u32 *) data; + u32 i, tmp = 0; + + for (i = 0; i < len / 4; i++) + seed = crc32_u32(seed, *p32++); + + switch (len & 3) { + case 3: + tmp |= *((const u8 *) p32 + 2) << 16; + /* fallthrough */ + case 2: + tmp |= *((const u8 *) p32 + 1) << 8; + /* fallthrough */ + case 1: + tmp |= *((const u8 *) p32); + seed = crc32_u32(seed, tmp); + break; + } + + return seed; +} + +static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed) +{ + const u32 *p32 = (const u32 *) data; + u32 i; + + for (i = 0; i < len; i++) + seed = crc32_u32(seed, *p32++); + + return seed; +} + +void __init setup_arch_fast_hash(struct fast_hash_ops *ops) +{ + if (cpu_has_xmm4_2) { + ops->hash = intel_crc4_2_hash; + ops->hash2 = intel_crc4_2_hash2; + } +} diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c new file mode 100644 index 00000000000..c1f01a8e9f6 --- /dev/null +++ b/arch/x86/lib/inat.c @@ -0,0 +1,97 @@ +/* + * x86 instruction attribute tables + * + * Written by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include <asm/insn.h> + +/* Attribute tables are generated from opcode map */ +#include "inat-tables.c" + +/* Attribute search APIs */ +insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) +{ + return inat_primary_table[opcode]; +} + +int inat_get_last_prefix_id(insn_byte_t last_pfx) +{ + insn_attr_t lpfx_attr; + + lpfx_attr = inat_get_opcode_attribute(last_pfx); + return inat_last_prefix_id(lpfx_attr); +} + +insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, + insn_attr_t esc_attr) +{ + const insn_attr_t *table; + int n; + + n = inat_escape_id(esc_attr); + + table = inat_escape_tables[n][0]; + if (!table) + return 0; + if (inat_has_variant(table[opcode]) && lpfx_id) { + table = inat_escape_tables[n][lpfx_id]; + if (!table) + return 0; + } + return table[opcode]; +} + +insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, + insn_attr_t grp_attr) +{ + const insn_attr_t *table; + int n; + + n = inat_group_id(grp_attr); + + table = inat_group_tables[n][0]; + if (!table) + return inat_group_common_attribute(grp_attr); + if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { + table = inat_group_tables[n][lpfx_id]; + if (!table) + return inat_group_common_attribute(grp_attr); + } + return table[X86_MODRM_REG(modrm)] | + inat_group_common_attribute(grp_attr); +} + +insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, + insn_byte_t vex_p) +{ + const insn_attr_t *table; + if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) + return 0; + /* At first, this checks the master table */ + table = inat_avx_tables[vex_m][0]; + if (!table) + return 0; + if (!inat_is_group(table[opcode]) && vex_p) { + /* If this is not a group, get attribute directly */ + table = inat_avx_tables[vex_m][vex_p]; + if (!table) + return 0; + } + return table[opcode]; +} + diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c new file mode 100644 index 00000000000..54fcffed28e --- /dev/null +++ b/arch/x86/lib/insn.c @@ -0,0 +1,580 @@ +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004, 2009 + */ + +#ifdef __KERNEL__ +#include <linux/string.h> +#else +#include <string.h> +#endif +#include <asm/inat.h> +#include <asm/insn.h> + +/* Verify next sizeof(t) bytes can be on the same instruction */ +#define validate_next(t, insn, n) \ + ((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE) + +#define __get_next(t, insn) \ + ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define __peek_nbyte_next(t, insn, n) \ + ({ t r = *(t*)((insn)->next_byte + n); r; }) + +#define get_next(t, insn) \ + ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) + +#define peek_nbyte_next(t, insn, n) \ + ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); }) + +#define peek_next(t, insn) peek_nbyte_next(t, insn, 0) + +/** + * insn_init() - initialize struct insn + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @x86_64: !0 for 64-bit kernel or 64-bit app + */ +void insn_init(struct insn *insn, const void *kaddr, int x86_64) +{ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->next_byte = kaddr; + insn->x86_64 = x86_64 ? 1 : 0; + insn->opnd_bytes = 4; + if (x86_64) + insn->addr_bytes = 8; + else + insn->addr_bytes = 4; +} + +/** + * insn_get_prefixes - scan x86 instruction prefix bytes + * @insn: &struct insn containing instruction + * + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte + * to point to the (first) opcode. No effect if @insn->prefixes.got + * is already set. + */ +void insn_get_prefixes(struct insn *insn) +{ + struct insn_field *prefixes = &insn->prefixes; + insn_attr_t attr; + insn_byte_t b, lb; + int i, nb; + + if (prefixes->got) + return; + + nb = 0; + lb = 0; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + while (inat_is_legacy_prefix(attr)) { + /* Skip if same prefix */ + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == b) + goto found; + if (nb == 4) + /* Invalid instruction */ + break; + prefixes->bytes[nb++] = b; + if (inat_is_address_size_prefix(attr)) { + /* address size switches 2/4 or 4/8 */ + if (insn->x86_64) + insn->addr_bytes ^= 12; + else + insn->addr_bytes ^= 6; + } else if (inat_is_operand_size_prefix(attr)) { + /* oprand size switches 2/4 */ + insn->opnd_bytes ^= 6; + } +found: + prefixes->nbytes++; + insn->next_byte++; + lb = b; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + } + /* Set the last prefix */ + if (lb && lb != insn->prefixes.bytes[3]) { + if (unlikely(insn->prefixes.bytes[3])) { + /* Swap the last prefix */ + b = insn->prefixes.bytes[3]; + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == lb) + prefixes->bytes[i] = b; + } + insn->prefixes.bytes[3] = lb; + } + + /* Decode REX prefix */ + if (insn->x86_64) { + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_rex_prefix(attr)) { + insn->rex_prefix.value = b; + insn->rex_prefix.nbytes = 1; + insn->next_byte++; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } + } + insn->rex_prefix.got = 1; + + /* Decode VEX prefix */ + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_vex_prefix(attr)) { + insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); + if (!insn->x86_64) { + /* + * In 32-bits mode, if the [7:6] bits (mod bits of + * ModRM) on the second byte are not 11b, it is + * LDS or LES. + */ + if (X86_MODRM_MOD(b2) != 3) + goto vex_end; + } + insn->vex_prefix.bytes[0] = b; + insn->vex_prefix.bytes[1] = b2; + if (inat_is_vex3_prefix(attr)) { + b2 = peek_nbyte_next(insn_byte_t, insn, 2); + insn->vex_prefix.bytes[2] = b2; + insn->vex_prefix.nbytes = 3; + insn->next_byte += 3; + if (insn->x86_64 && X86_VEX_W(b2)) + /* VEX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } else { + insn->vex_prefix.nbytes = 2; + insn->next_byte += 2; + } + } +vex_end: + insn->vex_prefix.got = 1; + + prefixes->got = 1; + +err_out: + return; +} + +/** + * insn_get_opcode - collect opcode(s) + * @insn: &struct insn containing instruction + * + * Populates @insn->opcode, updates @insn->next_byte to point past the + * opcode byte(s), and set @insn->attr (except for groups). + * If necessary, first collects any preceding (prefix) bytes. + * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got + * is already 1. + */ +void insn_get_opcode(struct insn *insn) +{ + struct insn_field *opcode = &insn->opcode; + insn_byte_t op; + int pfx_id; + if (opcode->got) + return; + if (!insn->prefixes.got) + insn_get_prefixes(insn); + + /* Get first opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[0] = op; + opcode->nbytes = 1; + + /* Check if there is VEX prefix or not */ + if (insn_is_avx(insn)) { + insn_byte_t m, p; + m = insn_vex_m_bits(insn); + p = insn_vex_p_bits(insn); + insn->attr = inat_get_avx_attribute(op, m, p); + if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr)) + insn->attr = 0; /* This instruction is bad */ + goto end; /* VEX has only 1 byte for opcode */ + } + + insn->attr = inat_get_opcode_attribute(op); + while (inat_is_escape(insn->attr)) { + /* Get escaped opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[opcode->nbytes++] = op; + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); + } + if (inat_must_vex(insn->attr)) + insn->attr = 0; /* This instruction is bad */ +end: + opcode->got = 1; + +err_out: + return; +} + +/** + * insn_get_modrm - collect ModRM byte, if any + * @insn: &struct insn containing instruction + * + * Populates @insn->modrm and updates @insn->next_byte to point past the + * ModRM byte, if any. If necessary, first collects the preceding bytes + * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + */ +void insn_get_modrm(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + insn_byte_t pfx_id, mod; + if (modrm->got) + return; + if (!insn->opcode.got) + insn_get_opcode(insn); + + if (inat_has_modrm(insn->attr)) { + mod = get_next(insn_byte_t, insn); + modrm->value = mod; + modrm->nbytes = 1; + if (inat_is_group(insn->attr)) { + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_group_attribute(mod, pfx_id, + insn->attr); + if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) + insn->attr = 0; /* This is bad */ + } + } + + if (insn->x86_64 && inat_is_force64(insn->attr)) + insn->opnd_bytes = 8; + modrm->got = 1; + +err_out: + return; +} + + +/** + * insn_rip_relative() - Does instruction use RIP-relative addressing mode? + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->x86_64 is 0. + */ +int insn_rip_relative(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + + if (!insn->x86_64) + return 0; + if (!modrm->got) + insn_get_modrm(insn); + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); +} + +/** + * insn_get_sib() - Get the SIB byte of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. + */ +void insn_get_sib(struct insn *insn) +{ + insn_byte_t modrm; + + if (insn->sib.got) + return; + if (!insn->modrm.got) + insn_get_modrm(insn); + if (insn->modrm.nbytes) { + modrm = (insn_byte_t)insn->modrm.value; + if (insn->addr_bytes != 2 && + X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { + insn->sib.value = get_next(insn_byte_t, insn); + insn->sib.nbytes = 1; + } + } + insn->sib.got = 1; + +err_out: + return; +} + + +/** + * insn_get_displacement() - Get the displacement of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * SIB byte. + * Displacement value is sign-expanded. + */ +void insn_get_displacement(struct insn *insn) +{ + insn_byte_t mod, rm, base; + + if (insn->displacement.got) + return; + if (!insn->sib.got) + insn_get_sib(insn); + if (insn->modrm.nbytes) { + /* + * Interpreting the modrm byte: + * mod = 00 - no displacement fields (exceptions below) + * mod = 01 - 1-byte displacement field + * mod = 10 - displacement field is 4 bytes, or 2 bytes if + * address size = 2 (0x67 prefix in 32-bit mode) + * mod = 11 - no memory operand + * + * If address size = 2... + * mod = 00, r/m = 110 - displacement field is 2 bytes + * + * If address size != 2... + * mod != 11, r/m = 100 - SIB byte exists + * mod = 00, SIB base = 101 - displacement field is 4 bytes + * mod = 00, r/m = 101 - rip-relative addressing, displacement + * field is 4 bytes + */ + mod = X86_MODRM_MOD(insn->modrm.value); + rm = X86_MODRM_RM(insn->modrm.value); + base = X86_SIB_BASE(insn->sib.value); + if (mod == 3) + goto out; + if (mod == 1) { + insn->displacement.value = get_next(char, insn); + insn->displacement.nbytes = 1; + } else if (insn->addr_bytes == 2) { + if ((mod == 0 && rm == 6) || mod == 2) { + insn->displacement.value = + get_next(short, insn); + insn->displacement.nbytes = 2; + } + } else { + if ((mod == 0 && rm == 5) || mod == 2 || + (mod == 0 && base == 5)) { + insn->displacement.value = get_next(int, insn); + insn->displacement.nbytes = 4; + } + } + } +out: + insn->displacement.got = 1; + +err_out: + return; +} + +/* Decode moffset16/32/64. Return 0 if failed */ +static int __get_moffset(struct insn *insn) +{ + switch (insn->addr_bytes) { + case 2: + insn->moffset1.value = get_next(short, insn); + insn->moffset1.nbytes = 2; + break; + case 4: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + break; + case 8: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + insn->moffset2.value = get_next(int, insn); + insn->moffset2.nbytes = 4; + break; + default: /* opnd_bytes must be modified manually */ + goto err_out; + } + insn->moffset1.got = insn->moffset2.got = 1; + + return 1; + +err_out: + return 0; +} + +/* Decode imm v32(Iz). Return 0 if failed */ +static int __get_immv32(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case 4: + case 8: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + default: /* opnd_bytes must be modified manually */ + goto err_out; + } + + return 1; + +err_out: + return 0; +} + +/* Decode imm v64(Iv/Ov), Return 0 if failed */ +static int __get_immv(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + default: /* opnd_bytes must be modified manually */ + goto err_out; + } + insn->immediate1.got = insn->immediate2.got = 1; + + return 1; +err_out: + return 0; +} + +/* Decode ptr16:16/32(Ap) */ +static int __get_immptr(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + /* ptr16:64 is not exist (no segment) */ + return 0; + default: /* opnd_bytes must be modified manually */ + goto err_out; + } + insn->immediate2.value = get_next(unsigned short, insn); + insn->immediate2.nbytes = 2; + insn->immediate1.got = insn->immediate2.got = 1; + + return 1; +err_out: + return 0; +} + +/** + * insn_get_immediate() - Get the immediates of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * displacement bytes. + * Basically, most of immediates are sign-expanded. Unsigned-value can be + * get by bit masking with ((1 << (nbytes * 8)) - 1) + */ +void insn_get_immediate(struct insn *insn) +{ + if (insn->immediate.got) + return; + if (!insn->displacement.got) + insn_get_displacement(insn); + + if (inat_has_moffset(insn->attr)) { + if (!__get_moffset(insn)) + goto err_out; + goto done; + } + + if (!inat_has_immediate(insn->attr)) + /* no immediates */ + goto done; + + switch (inat_immediate_size(insn->attr)) { + case INAT_IMM_BYTE: + insn->immediate.value = get_next(char, insn); + insn->immediate.nbytes = 1; + break; + case INAT_IMM_WORD: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case INAT_IMM_DWORD: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + case INAT_IMM_QWORD: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + case INAT_IMM_PTR: + if (!__get_immptr(insn)) + goto err_out; + break; + case INAT_IMM_VWORD32: + if (!__get_immv32(insn)) + goto err_out; + break; + case INAT_IMM_VWORD: + if (!__get_immv(insn)) + goto err_out; + break; + default: + /* Here, insn must have an immediate, but failed */ + goto err_out; + } + if (inat_has_second_immediate(insn->attr)) { + insn->immediate2.value = get_next(char, insn); + insn->immediate2.nbytes = 1; + } +done: + insn->immediate.got = 1; + +err_out: + return; +} + +/** + * insn_get_length() - Get the length of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * immediates bytes. + */ +void insn_get_length(struct insn *insn) +{ + if (insn->length) + return; + if (!insn->immediate.got) + insn_get_immediate(insn); + insn->length = (unsigned char)((unsigned long)insn->next_byte + - (unsigned long)insn->kaddr); +} diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c deleted file mode 100644 index 3f1eb59b5f0..00000000000 --- a/arch/x86/lib/io_64.c +++ /dev/null @@ -1,25 +0,0 @@ -#include <linux/string.h> -#include <linux/module.h> -#include <asm/io.h> - -void __memcpy_toio(unsigned long dst, const void *src, unsigned len) -{ - __inline_memcpy((void *)dst, src, len); -} -EXPORT_SYMBOL(__memcpy_toio); - -void __memcpy_fromio(void *dst, unsigned long src, unsigned len) -{ - __inline_memcpy(dst, (const void *)src, len); -} -EXPORT_SYMBOL(__memcpy_fromio); - -void memset_io(volatile void __iomem *a, int b, size_t c) -{ - /* - * TODO: memset can mangle the IO patterns quite a bit. - * perhaps it would be better to use a dumb one: - */ - memset((void *)a, b, c); -} -EXPORT_SYMBOL(memset_io); diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 37756b6fb32..a404b4b7553 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -4,7 +4,7 @@ #undef memcpy #undef memset -void *memcpy(void *to, const void *from, size_t n) +__visible void *memcpy(void *to, const void *from, size_t n) { #ifdef CONFIG_X86_USE_3DNOW return __memcpy3d(to, from, n); @@ -14,30 +14,195 @@ void *memcpy(void *to, const void *from, size_t n) } EXPORT_SYMBOL(memcpy); -void *memset(void *s, int c, size_t count) +__visible void *memset(void *s, int c, size_t count) { return __memset(s, c, count); } EXPORT_SYMBOL(memset); -void *memmove(void *dest, const void *src, size_t n) +__visible void *memmove(void *dest, const void *src, size_t n) { - int d0, d1, d2; - - if (dest < src) { - memcpy(dest,src,n); - } else { - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+src), - "2" (n-1+dest) - :"memory"); - } - return dest; + int d0,d1,d2,d3,d4,d5; + char *ret = dest; + + __asm__ __volatile__( + /* Handle more 16 bytes in loop */ + "cmp $0x10, %0\n\t" + "jb 1f\n\t" + + /* Decide forward/backward copy mode */ + "cmp %2, %1\n\t" + "jb 2f\n\t" + + /* + * movs instruction have many startup latency + * so we handle small size by general register. + */ + "cmp $680, %0\n\t" + "jb 3f\n\t" + /* + * movs instruction is only good for aligned case. + */ + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 4f\n\t" + "3:\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16 bytes forward in each loop. + */ + "3:\n\t" + "sub $0x10, %0\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov 2*4(%1), %3\n\t" + "mov 3*4(%1), %4\n\t" + "mov %3, 2*4(%2)\n\t" + "mov %4, 3*4(%2)\n\t" + "lea 0x10(%1), %1\n\t" + "lea 0x10(%2), %2\n\t" + "jae 3b\n\t" + "add $0x10, %0\n\t" + "jmp 1f\n\t" + + /* + * Handle data forward by movs. + */ + ".p2align 4\n\t" + "4:\n\t" + "mov -4(%1, %0), %3\n\t" + "lea -4(%2, %0), %4\n\t" + "shr $2, %0\n\t" + "rep movsl\n\t" + "mov %3, (%4)\n\t" + "jmp 11f\n\t" + /* + * Handle data backward by movs. + */ + ".p2align 4\n\t" + "6:\n\t" + "mov (%1), %3\n\t" + "mov %2, %4\n\t" + "lea -4(%1, %0), %1\n\t" + "lea -4(%2, %0), %2\n\t" + "shr $2, %0\n\t" + "std\n\t" + "rep movsl\n\t" + "mov %3,(%4)\n\t" + "cld\n\t" + "jmp 11f\n\t" + + /* + * Start to prepare for backward copy. + */ + ".p2align 4\n\t" + "2:\n\t" + "cmp $680, %0\n\t" + "jb 5f\n\t" + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 6b\n\t" + + /* + * Calculate copy position to tail. + */ + "5:\n\t" + "add %0, %1\n\t" + "add %0, %2\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16 bytes backward in each loop. + */ + "7:\n\t" + "sub $0x10, %0\n\t" + + "mov -1*4(%1), %3\n\t" + "mov -2*4(%1), %4\n\t" + "mov %3, -1*4(%2)\n\t" + "mov %4, -2*4(%2)\n\t" + "mov -3*4(%1), %3\n\t" + "mov -4*4(%1), %4\n\t" + "mov %3, -3*4(%2)\n\t" + "mov %4, -4*4(%2)\n\t" + "lea -0x10(%1), %1\n\t" + "lea -0x10(%2), %2\n\t" + "jae 7b\n\t" + /* + * Calculate copy position to head. + */ + "add $0x10, %0\n\t" + "sub %0, %1\n\t" + "sub %0, %2\n\t" + + /* + * Move data from 8 bytes to 15 bytes. + */ + ".p2align 4\n\t" + "1:\n\t" + "cmp $8, %0\n\t" + "jb 8f\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov -2*4(%1, %0), %5\n\t" + "mov -1*4(%1, %0), %1\n\t" + + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov %5, -2*4(%2, %0)\n\t" + "mov %1, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 4 bytes to 7 bytes. + */ + ".p2align 4\n\t" + "8:\n\t" + "cmp $4, %0\n\t" + "jb 9f\n\t" + "mov 0*4(%1), %3\n\t" + "mov -1*4(%1, %0), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 2 bytes to 3 bytes. + */ + ".p2align 4\n\t" + "9:\n\t" + "cmp $2, %0\n\t" + "jb 10f\n\t" + "movw 0*2(%1), %%dx\n\t" + "movw -1*2(%1, %0), %%bx\n\t" + "movw %%dx, 0*2(%2)\n\t" + "movw %%bx, -1*2(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data for 1 byte. + */ + ".p2align 4\n\t" + "10:\n\t" + "cmp $1, %0\n\t" + "jb 11f\n\t" + "movb (%1), %%cl\n\t" + "movb %%cl, (%2)\n\t" + ".p2align 4\n\t" + "11:" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), + "=r" (d3),"=r" (d4), "=r"(d5) + :"0" (n), + "1" (src), + "2" (dest) + :"memory"); + + return ret; + } EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index c22981fa2f3..56313a32618 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,131 +1,206 @@ /* Copyright 2002 Andi Kleen */ #include <linux/linkage.h> -#include <asm/dwarf2.h> + #include <asm/cpufeature.h> +#include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * memcpy - Copy a memory block. * - * Input: - * rdi destination - * rsi source - * rdx count - * + * Input: + * rdi destination + * rsi source + * rdx count + * * Output: * rax original destination - */ + */ - ALIGN -memcpy_c: - CFI_STARTPROC - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx +/* + * memcpy_c() - fast string ops (REP MOVSQ) based variant. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c: + movq %rdi, %rax + movq %rdx, %rcx + shrq $3, %rcx + andl $7, %edx rep movsq - movl %edx,%ecx + movl %edx, %ecx rep movsb ret - CFI_ENDPROC -ENDPROC(memcpy_c) +.Lmemcpy_e: + .previous + +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c_e: + movq %rdi, %rax + movq %rdx, %rcx + rep movsb + ret +.Lmemcpy_e_e: + .previous ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - movq %rdi,%rax - - movl %edx,%ecx - shrl $6,%ecx - jz .Lhandle_tail - + movq %rdi, %rax + + cmpq $0x20, %rdx + jb .Lhandle_tail + + /* + * We check whether memory false dependence could occur, + * then jump to corresponding copy mode. + */ + cmp %dil, %sil + jl .Lcopy_backward + subq $0x20, %rdx +.Lcopy_forward_loop: + subq $0x20, %rdx + + /* + * Move in blocks of 4x8 bytes: + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq 2*8(%rsi), %r10 + movq 3*8(%rsi), %r11 + leaq 4*8(%rsi), %rsi + + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae .Lcopy_forward_loop + addl $0x20, %edx + jmp .Lhandle_tail + +.Lcopy_backward: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * At most 3 ALU operations in one cycle, + * so append NOPS in the same 16 bytes trunk. + */ .p2align 4 -.Lloop_64: - decl %ecx - - movq (%rsi),%r11 - movq 8(%rsi),%r8 - - movq %r11,(%rdi) - movq %r8,1*8(%rdi) - - movq 2*8(%rsi),%r9 - movq 3*8(%rsi),%r10 - - movq %r9,2*8(%rdi) - movq %r10,3*8(%rdi) - - movq 4*8(%rsi),%r11 - movq 5*8(%rsi),%r8 - - movq %r11,4*8(%rdi) - movq %r8,5*8(%rdi) - - movq 6*8(%rsi),%r9 - movq 7*8(%rsi),%r10 - - movq %r9,6*8(%rdi) - movq %r10,7*8(%rdi) - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - jnz .Lloop_64 - +.Lcopy_backward_loop: + subq $0x20, %rdx + movq -1*8(%rsi), %r8 + movq -2*8(%rsi), %r9 + movq -3*8(%rsi), %r10 + movq -4*8(%rsi), %r11 + leaq -4*8(%rsi), %rsi + movq %r8, -1*8(%rdi) + movq %r9, -2*8(%rdi) + movq %r10, -3*8(%rdi) + movq %r11, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae .Lcopy_backward_loop + + /* + * Calculate copy position to head. + */ + addl $0x20, %edx + subq %rdx, %rsi + subq %rdx, %rdi .Lhandle_tail: - movl %edx,%ecx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 + cmpl $16, %edx + jb .Lless_16bytes + + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq -2*8(%rsi, %rdx), %r10 + movq -1*8(%rsi, %rdx), %r11 + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, -2*8(%rdi, %rdx) + movq %r11, -1*8(%rdi, %rdx) + retq .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi),%r8 - movq %r8,(%rdi) - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende +.Lless_16bytes: + cmpl $8, %edx + jb .Lless_8bytes + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r8 + movq -1*8(%rsi, %rdx), %r9 + movq %r8, 0*8(%rdi) + movq %r9, -1*8(%rdi, %rdx) + retq .p2align 4 -.Lloop_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - -.Lende: - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret -.Lfinal: +.Lless_8bytes: + cmpl $4, %edx + jb .Lless_3bytes + + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %ecx + movl -4(%rsi, %rdx), %r8d + movl %ecx, (%rdi) + movl %r8d, -4(%rdi, %rdx) + retq + .p2align 4 +.Lless_3bytes: + subl $1, %edx + jb .Lend + /* + * Move data from 1 bytes to 3 bytes. + */ + movzbl (%rsi), %ecx + jz .Lstore_1byte + movzbq 1(%rsi), %r8 + movzbq (%rsi, %rdx), %r9 + movb %r8b, 1(%rdi) + movb %r9b, (%rdi, %rdx) +.Lstore_1byte: + movb %cl, (%rdi) + +.Lend: + retq CFI_ENDPROC ENDPROC(memcpy) ENDPROC(__memcpy) - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad memcpy - .quad 1b - .byte X86_FEATURE_REP_GOOD - /* Replace only beginning, memcpy is used to apply alternatives, so it - * is silly to overwrite itself with nops - reboot is only outcome... */ - .byte 2b - 1b - .byte 2b - 1b + /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + * + * Replace only beginning, memcpy is used to apply alternatives, + * so it is silly to overwrite itself with nops - reboot is the + * only outcome... + */ + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S new file mode 100644 index 00000000000..65268a6104f --- /dev/null +++ b/arch/x86/lib/memmove_64.S @@ -0,0 +1,223 @@ +/* + * Normally compiler builtins are used, but sometimes the compiler calls out + * of line code. Based on asm-i386/string.h. + * + * This assembly file is re-written from memmove_64.c file. + * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> + */ +#define _STRING_C +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> + +#undef memmove + +/* + * Implement memmove(). This can handle overlap between src and dst. + * + * Input: + * rdi: dest + * rsi: src + * rdx: count + * + * Output: + * rax: dest + */ +ENTRY(memmove) + CFI_STARTPROC + + /* Handle more 32 bytes in loop */ + mov %rdi, %rax + cmp $0x20, %rdx + jb 1f + + /* Decide forward/backward copy mode */ + cmp %rdi, %rsi + jge .Lmemmove_begin_forward + mov %rsi, %r8 + add %rdx, %r8 + cmp %rdi, %r8 + jg 2f + +.Lmemmove_begin_forward: + /* + * movsq instruction have many startup latency + * so we handle small size by general register. + */ + cmp $680, %rdx + jb 3f + /* + * movsq instruction is only good for aligned case. + */ + + cmpb %dil, %sil + je 4f +3: + sub $0x20, %rdx + /* + * We gobble 32 bytes forward in each loop. + */ +5: + sub $0x20, %rdx + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r8 + leaq 4*8(%rsi), %rsi + + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, 2*8(%rdi) + movq %r8, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae 5b + addq $0x20, %rdx + jmp 1f + /* + * Handle data forward by movsq. + */ + .p2align 4 +4: + movq %rdx, %rcx + movq -8(%rsi, %rdx), %r11 + lea -8(%rdi, %rdx), %r10 + shrq $3, %rcx + rep movsq + movq %r11, (%r10) + jmp 13f +.Lmemmove_end_forward: + + /* + * Handle data backward by movsq. + */ + .p2align 4 +7: + movq %rdx, %rcx + movq (%rsi), %r11 + movq %rdi, %r10 + leaq -8(%rsi, %rdx), %rsi + leaq -8(%rdi, %rdx), %rdi + shrq $3, %rcx + std + rep movsq + cld + movq %r11, (%r10) + jmp 13f + + /* + * Start to prepare for backward copy. + */ + .p2align 4 +2: + cmp $680, %rdx + jb 6f + cmp %dil, %sil + je 7b +6: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * We gobble 32 bytes backward in each loop. + */ +8: + subq $0x20, %rdx + movq -1*8(%rsi), %r11 + movq -2*8(%rsi), %r10 + movq -3*8(%rsi), %r9 + movq -4*8(%rsi), %r8 + leaq -4*8(%rsi), %rsi + + movq %r11, -1*8(%rdi) + movq %r10, -2*8(%rdi) + movq %r9, -3*8(%rdi) + movq %r8, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae 8b + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi +1: + cmpq $16, %rdx + jb 9f + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq -2*8(%rsi, %rdx), %r9 + movq -1*8(%rsi, %rdx), %r8 + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, -2*8(%rdi, %rdx) + movq %r8, -1*8(%rdi, %rdx) + jmp 13f + .p2align 4 +9: + cmpq $8, %rdx + jb 10f + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r11 + movq -1*8(%rsi, %rdx), %r10 + movq %r11, 0*8(%rdi) + movq %r10, -1*8(%rdi, %rdx) + jmp 13f +10: + cmpq $4, %rdx + jb 11f + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %r11d + movl -4(%rsi, %rdx), %r10d + movl %r11d, (%rdi) + movl %r10d, -4(%rdi, %rdx) + jmp 13f +11: + cmp $2, %rdx + jb 12f + /* + * Move data from 2 bytes to 3 bytes. + */ + movw (%rsi), %r11w + movw -2(%rsi, %rdx), %r10w + movw %r11w, (%rdi) + movw %r10w, -2(%rdi, %rdx) + jmp 13f +12: + cmp $1, %rdx + jb 13f + /* + * Move data for 1 byte. + */ + movb (%rsi), %r11b + movb %r11b, (%rdi) +13: + retq + CFI_ENDPROC + + .section .altinstr_replacement,"ax" +.Lmemmove_begin_forward_efs: + /* Forward moving data. */ + movq %rdx, %rcx + rep movsb + retq +.Lmemmove_end_forward_efs: + .previous + + .section .altinstructions,"a" + altinstruction_entry .Lmemmove_begin_forward, \ + .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ + .Lmemmove_end_forward-.Lmemmove_begin_forward, \ + .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .previous +ENDPROC(memmove) diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c deleted file mode 100644 index 80175e47b19..00000000000 --- a/arch/x86/lib/memmove_64.c +++ /dev/null @@ -1,21 +0,0 @@ -/* Normally compiler builtins are used, but sometimes the compiler calls out - of line code. Based on asm-i386/string.h. - */ -#define _STRING_C -#include <linux/string.h> -#include <linux/module.h> - -#undef memmove -void *memmove(void * dest,const void *src,size_t count) -{ - if (dest < src) { - return memcpy(dest,src,count); - } else { - char *p = dest + count; - const char *s = src + count; - while (count--) - *--p = *--s; - } - return dest; -} -EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2c5948116bd..2dcb3808cbd 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -2,9 +2,13 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> /* - * ISO C memset - set a memory block to a byte value. + * ISO C memset - set a memory block to a byte value. This function uses fast + * string to get better performance than the original function. The code is + * simpler and shorter than the orignal function as well. * * rdi destination * rsi value (char) @@ -12,36 +16,55 @@ * * rax original destination */ - ALIGN -memset_c: - CFI_STARTPROC + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c: movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx + movq %rdx,%rcx + andl $7,%edx + shrq $3,%rcx /* expand byte value */ movzbl %sil,%esi movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ + imulq %rsi,%rax rep stosq - movl %r8d,%ecx + movl %edx,%ecx rep stosb movq %r9,%rax ret - CFI_ENDPROC -ENDPROC(memset_c) +.Lmemset_e: + .previous + +/* + * ISO C memset - set a memory block to a byte value. This function uses + * enhanced rep stosb to override the fast string function. + * The code is simpler and shorter than the fast string function as well. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c_e: + movq %rdi,%r9 + movb %sil,%al + movq %rdx,%rcx + rep stosb + movq %r9,%rax + ret +.Lmemset_e_e: + .previous ENTRY(memset) ENTRY(__memset) CFI_STARTPROC movq %rdi,%r10 - movq %rdx,%r11 /* expand byte value */ movzbl %sil,%ecx movabs $0x0101010101010101,%rax - mul %rcx /* with rax, clobbers rdx */ + imulq %rcx,%rax /* align dst */ movl %edi,%r9d @@ -50,13 +73,13 @@ ENTRY(__memset) CFI_REMEMBER_STATE .Lafter_bad_alignment: - movl %r11d,%ecx - shrl $6,%ecx + movq %rdx,%rcx + shrq $6,%rcx jz .Lhandle_tail .p2align 4 .Lloop_64: - decl %ecx + decq %rcx movq %rax,(%rdi) movq %rax,8(%rdi) movq %rax,16(%rdi) @@ -72,7 +95,7 @@ ENTRY(__memset) to predict jump tables. */ .p2align 4 .Lhandle_tail: - movl %r11d,%ecx + movl %edx,%ecx andl $63&(~7),%ecx jz .Lhandle_7 shrl $3,%ecx @@ -84,12 +107,11 @@ ENTRY(__memset) jnz .Lloop_8 .Lhandle_7: - movl %r11d,%ecx - andl $7,%ecx + andl $7,%edx jz .Lende .p2align 4 .Lloop_1: - decl %ecx + decl %edx movb %al,(%rdi) leaq 1(%rdi),%rdi jnz .Lloop_1 @@ -100,34 +122,33 @@ ENTRY(__memset) CFI_RESTORE_STATE .Lbad_alignment: - cmpq $7,%r11 + cmpq $7,%rdx jbe .Lhandle_7 movq %rax,(%rdi) /* unaligned store */ movq $8,%r8 subq %r9,%r8 addq %r8,%rdi - subq %r8,%r11 + subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC ENDPROC(memset) ENDPROC(__memset) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (memset_c - memset) - (2f - 1b) /* offset */ -2: - .previous + /* Some CPUs support enhanced REP MOVSB/STOSB feature. + * It is recommended to use this when possible. + * + * If enhanced REP MOVSB/STOSB feature is not available, use fast string + * instructions. + * + * Otherwise, use original memset function. + * + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + */ .section .altinstructions,"a" - .align 8 - .quad memset - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte 2b - 1b + altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-memset,.Lmemset_e-.Lmemset_c + altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e .previous diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c new file mode 100644 index 00000000000..76b373af03f --- /dev/null +++ b/arch/x86/lib/misc.c @@ -0,0 +1,21 @@ +/* + * Count the digits of @val including a possible sign. + * + * (Typed on and submitted from hpa's mobile phone.) + */ +int num_digits(int val) +{ + int m = 10; + int d = 1; + + if (val < 0) { + d++; + val = -val; + } + + while (val >= m) { + m *= 10; + d++; + } + return d; +} diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index cc9b4a4450f..c9f2d9ba8dd 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -1,32 +1,30 @@ -#include <linux/types.h> -#include <linux/string.h> -#include <linux/sched.h> -#include <linux/hardirq.h> -#include <linux/module.h> - -#include <asm/asm.h> -#include <asm/i387.h> - - /* * MMX 3DNow! library helper functions * * To do: - * We can use MMX just for prefetch in IRQ's. This may be a win. + * We can use MMX just for prefetch in IRQ's. This may be a win. * (reported so on K6-III) * We should use a better code neutral filler for the short jump * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ?? * We also want to clobber the filler register so we don't get any - * register forwarding stalls on the filler. + * register forwarding stalls on the filler. * * Add *user handling. Checksums are not a win with MMX on any CPU * tested so far for any MMX solution figured. * - * 22/09/2000 - Arjan van de Ven - * Improved for non-egineering-sample Athlons + * 22/09/2000 - Arjan van de Ven + * Improved for non-egineering-sample Athlons * */ - +#include <linux/hardirq.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/types.h> + +#include <asm/i387.h> +#include <asm/asm.h> + void *_mmx_memcpy(void *to, const void *from, size_t len) { void *p; @@ -51,12 +49,10 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - _ASM_EXTABLE(1b,3b) - : : "r" (from) ); - - - for(; i>5; i--) - { + _ASM_EXTABLE(1b, 3b) + : : "r" (from)); + + for ( ; i > 5; i--) { __asm__ __volatile__ ( "1: prefetch 320(%0)\n" "2: movq (%0), %%mm0\n" @@ -79,14 +75,14 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - _ASM_EXTABLE(1b,3b) - : : "r" (from), "r" (to) : "memory"); - from+=64; - to+=64; + _ASM_EXTABLE(1b, 3b) + : : "r" (from), "r" (to) : "memory"); + + from += 64; + to += 64; } - for(; i>0; i--) - { + for ( ; i > 0; i--) { __asm__ __volatile__ ( " movq (%0), %%mm0\n" " movq 8(%0), %%mm1\n" @@ -104,17 +100,20 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) " movq %%mm1, 40(%1)\n" " movq %%mm2, 48(%1)\n" " movq %%mm3, 56(%1)\n" - : : "r" (from), "r" (to) : "memory"); - from+=64; - to+=64; + : : "r" (from), "r" (to) : "memory"); + + from += 64; + to += 64; } /* - * Now do the tail of the block + * Now do the tail of the block: */ - __memcpy(to, from, len&63); + __memcpy(to, from, len & 63); kernel_fpu_end(); + return p; } +EXPORT_SYMBOL(_mmx_memcpy); #ifdef CONFIG_MK7 @@ -128,13 +127,12 @@ static void fast_clear_page(void *page) int i; kernel_fpu_begin(); - + __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : ); - for(i=0;i<4096/64;i++) - { + for (i = 0; i < 4096/64; i++) { __asm__ __volatile__ ( " movntq %%mm0, (%0)\n" " movntq %%mm0, 8(%0)\n" @@ -145,14 +143,15 @@ static void fast_clear_page(void *page) " movntq %%mm0, 48(%0)\n" " movntq %%mm0, 56(%0)\n" : : "r" (page) : "memory"); - page+=64; + page += 64; } - /* since movntq is weakly-ordered, a "sfence" is needed to become - * ordered again. + + /* + * Since movntq is weakly-ordered, a "sfence" is needed to become + * ordered again: */ - __asm__ __volatile__ ( - " sfence \n" : : - ); + __asm__ __volatile__("sfence\n"::); + kernel_fpu_end(); } @@ -162,10 +161,11 @@ static void fast_copy_page(void *to, void *from) kernel_fpu_begin(); - /* maybe the prefetch stuff can go before the expensive fnsave... + /* + * maybe the prefetch stuff can go before the expensive fnsave... * but that is for later. -AV */ - __asm__ __volatile__ ( + __asm__ __volatile__( "1: prefetch (%0)\n" " prefetch 64(%0)\n" " prefetch 128(%0)\n" @@ -176,11 +176,9 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - _ASM_EXTABLE(1b,3b) - : : "r" (from) ); + _ASM_EXTABLE(1b, 3b) : : "r" (from)); - for(i=0; i<(4096-320)/64; i++) - { + for (i = 0; i < (4096-320)/64; i++) { __asm__ __volatile__ ( "1: prefetch 320(%0)\n" "2: movq (%0), %%mm0\n" @@ -203,13 +201,13 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - _ASM_EXTABLE(1b,3b) - : : "r" (from), "r" (to) : "memory"); - from+=64; - to+=64; + _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory"); + + from += 64; + to += 64; } - for(i=(4096-320)/64; i<4096/64; i++) - { + + for (i = (4096-320)/64; i < 4096/64; i++) { __asm__ __volatile__ ( "2: movq (%0), %%mm0\n" " movntq %%mm0, (%1)\n" @@ -227,37 +225,34 @@ static void fast_copy_page(void *to, void *from) " movntq %%mm6, 48(%1)\n" " movq 56(%0), %%mm7\n" " movntq %%mm7, 56(%1)\n" - : : "r" (from), "r" (to) : "memory"); - from+=64; - to+=64; + : : "r" (from), "r" (to) : "memory"); + from += 64; + to += 64; } - /* since movntq is weakly-ordered, a "sfence" is needed to become - * ordered again. + /* + * Since movntq is weakly-ordered, a "sfence" is needed to become + * ordered again: */ - __asm__ __volatile__ ( - " sfence \n" : : - ); + __asm__ __volatile__("sfence \n"::); kernel_fpu_end(); } -#else +#else /* CONFIG_MK7 */ /* * Generic MMX implementation without K7 specific streaming */ - static void fast_clear_page(void *page) { int i; - + kernel_fpu_begin(); - + __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : ); - for(i=0;i<4096/128;i++) - { + for (i = 0; i < 4096/128; i++) { __asm__ __volatile__ ( " movq %%mm0, (%0)\n" " movq %%mm0, 8(%0)\n" @@ -275,8 +270,8 @@ static void fast_clear_page(void *page) " movq %%mm0, 104(%0)\n" " movq %%mm0, 112(%0)\n" " movq %%mm0, 120(%0)\n" - : : "r" (page) : "memory"); - page+=128; + : : "r" (page) : "memory"); + page += 128; } kernel_fpu_end(); @@ -285,8 +280,7 @@ static void fast_clear_page(void *page) static void fast_copy_page(void *to, void *from) { int i; - - + kernel_fpu_begin(); __asm__ __volatile__ ( @@ -300,11 +294,9 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - _ASM_EXTABLE(1b,3b) - : : "r" (from) ); + _ASM_EXTABLE(1b, 3b) : : "r" (from)); - for(i=0; i<4096/64; i++) - { + for (i = 0; i < 4096/64; i++) { __asm__ __volatile__ ( "1: prefetch 320(%0)\n" "2: movq (%0), %%mm0\n" @@ -327,60 +319,59 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - _ASM_EXTABLE(1b,3b) - : : "r" (from), "r" (to) : "memory"); - from+=64; - to+=64; + _ASM_EXTABLE(1b, 3b) + : : "r" (from), "r" (to) : "memory"); + + from += 64; + to += 64; } kernel_fpu_end(); } - -#endif +#endif /* !CONFIG_MK7 */ /* - * Favour MMX for page clear and copy. + * Favour MMX for page clear and copy: */ - -static void slow_zero_page(void * page) +static void slow_zero_page(void *page) { int d0, d1; - __asm__ __volatile__( \ - "cld\n\t" \ - "rep ; stosl" \ - : "=&c" (d0), "=&D" (d1) - :"a" (0),"1" (page),"0" (1024) - :"memory"); + + __asm__ __volatile__( + "cld\n\t" + "rep ; stosl" + + : "=&c" (d0), "=&D" (d1) + :"a" (0), "1" (page), "0" (1024) + :"memory"); } - -void mmx_clear_page(void * page) + +void mmx_clear_page(void *page) { - if(unlikely(in_interrupt())) + if (unlikely(in_interrupt())) slow_zero_page(page); else fast_clear_page(page); } +EXPORT_SYMBOL(mmx_clear_page); static void slow_copy_page(void *to, void *from) { int d0, d1, d2; - __asm__ __volatile__( \ - "cld\n\t" \ - "rep ; movsl" \ - : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ - : "0" (1024),"1" ((long) to),"2" ((long) from) \ + + __asm__ __volatile__( + "cld\n\t" + "rep ; movsl" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (1024), "1" ((long) to), "2" ((long) from) : "memory"); } - void mmx_copy_page(void *to, void *from) { - if(unlikely(in_interrupt())) + if (unlikely(in_interrupt())) slow_copy_page(to, from); else fast_copy_page(to, from); } - -EXPORT_SYMBOL(_mmx_memcpy); -EXPORT_SYMBOL(mmx_clear_page); EXPORT_SYMBOL(mmx_copy_page); diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c deleted file mode 100644 index 57d043fa893..00000000000 --- a/arch/x86/lib/msr-on-cpu.c +++ /dev/null @@ -1,101 +0,0 @@ -#include <linux/module.h> -#include <linux/preempt.h> -#include <linux/smp.h> -#include <asm/msr.h> - -struct msr_info { - u32 msr_no; - u32 l, h; - int err; -}; - -static void __rdmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rdmsr(rv->msr_no, rv->l, rv->h); -} - -static void __rdmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); -} - -static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) -{ - int err = 0; - struct msr_info rv; - - rv.msr_no = msr_no; - if (safe) { - smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1); - err = rv.err; - } else { - smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1); - } - *l = rv.l; - *h = rv.h; - - return err; -} - -static void __wrmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - - wrmsr(rv->msr_no, rv->l, rv->h); -} - -static void __wrmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); -} - -static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) -{ - int err = 0; - struct msr_info rv; - - rv.msr_no = msr_no; - rv.l = l; - rv.h = h; - if (safe) { - smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1); - err = rv.err; - } else { - smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1); - } - - return err; -} - -void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - _wrmsr_on_cpu(cpu, msr_no, l, h, 0); -} - -void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - _rdmsr_on_cpu(cpu, msr_no, l, h, 0); -} - -/* These "safe" variants are slower and should be used when the target MSR - may not actually exist. */ -int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - return _wrmsr_on_cpu(cpu, msr_no, l, h, 1); -} - -int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - return _rdmsr_on_cpu(cpu, msr_no, l, h, 1); -} - -EXPORT_SYMBOL(rdmsr_on_cpu); -EXPORT_SYMBOL(wrmsr_on_cpu); -EXPORT_SYMBOL(rdmsr_safe_on_cpu); -EXPORT_SYMBOL(wrmsr_safe_on_cpu); diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c new file mode 100644 index 00000000000..8d6ef78b5d0 --- /dev/null +++ b/arch/x86/lib/msr-reg-export.c @@ -0,0 +1,5 @@ +#include <linux/module.h> +#include <asm/msr.h> + +EXPORT_SYMBOL(rdmsr_safe_regs); +EXPORT_SYMBOL(wrmsr_safe_regs); diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S new file mode 100644 index 00000000000..f6d13eefad1 --- /dev/null +++ b/arch/x86/lib/msr-reg.S @@ -0,0 +1,102 @@ +#include <linux/linkage.h> +#include <linux/errno.h> +#include <asm/dwarf2.h> +#include <asm/asm.h> +#include <asm/msr.h> + +#ifdef CONFIG_X86_64 +/* + * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); + * + * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] + * + */ +.macro op_safe_regs op +ENTRY(\op\()_safe_regs) + CFI_STARTPROC + pushq_cfi %rbx + pushq_cfi %rbp + movq %rdi, %r10 /* Save pointer */ + xorl %r11d, %r11d /* Return value */ + movl (%rdi), %eax + movl 4(%rdi), %ecx + movl 8(%rdi), %edx + movl 12(%rdi), %ebx + movl 20(%rdi), %ebp + movl 24(%rdi), %esi + movl 28(%rdi), %edi + CFI_REMEMBER_STATE +1: \op +2: movl %eax, (%r10) + movl %r11d, %eax /* Return value */ + movl %ecx, 4(%r10) + movl %edx, 8(%r10) + movl %ebx, 12(%r10) + movl %ebp, 20(%r10) + movl %esi, 24(%r10) + movl %edi, 28(%r10) + popq_cfi %rbp + popq_cfi %rbx + ret +3: + CFI_RESTORE_STATE + movl $-EIO, %r11d + jmp 2b + + _ASM_EXTABLE(1b, 3b) + CFI_ENDPROC +ENDPROC(\op\()_safe_regs) +.endm + +#else /* X86_32 */ + +.macro op_safe_regs op +ENTRY(\op\()_safe_regs) + CFI_STARTPROC + pushl_cfi %ebx + pushl_cfi %ebp + pushl_cfi %esi + pushl_cfi %edi + pushl_cfi $0 /* Return value */ + pushl_cfi %eax + movl 4(%eax), %ecx + movl 8(%eax), %edx + movl 12(%eax), %ebx + movl 20(%eax), %ebp + movl 24(%eax), %esi + movl 28(%eax), %edi + movl (%eax), %eax + CFI_REMEMBER_STATE +1: \op +2: pushl_cfi %eax + movl 4(%esp), %eax + popl_cfi (%eax) + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 + movl %ecx, 4(%eax) + movl %edx, 8(%eax) + movl %ebx, 12(%eax) + movl %ebp, 20(%eax) + movl %esi, 24(%eax) + movl %edi, 28(%eax) + popl_cfi %eax + popl_cfi %edi + popl_cfi %esi + popl_cfi %ebp + popl_cfi %ebx + ret +3: + CFI_RESTORE_STATE + movl $-EIO, 4(%esp) + jmp 2b + + _ASM_EXTABLE(1b, 3b) + CFI_ENDPROC +ENDPROC(\op\()_safe_regs) +.endm + +#endif + +op_safe_regs rdmsr +op_safe_regs wrmsr + diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c new file mode 100644 index 00000000000..518532e6a3f --- /dev/null +++ b/arch/x86/lib/msr-smp.c @@ -0,0 +1,266 @@ +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/smp.h> +#include <asm/msr.h> + +static void __rdmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = per_cpu_ptr(rv->msrs, this_cpu); + else + reg = &rv->reg; + + rdmsr(rv->msr_no, reg->l, reg->h); +} + +static void __wrmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = per_cpu_ptr(rv->msrs, this_cpu); + else + reg = &rv->reg; + + wrmsr(rv->msr_no, reg->l, reg->h); +} + +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err; +} +EXPORT_SYMBOL(rdmsr_on_cpu); + +int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + *q = rv.reg.q; + + return err; +} +EXPORT_SYMBOL(rdmsrl_on_cpu); + +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} +EXPORT_SYMBOL(wrmsr_on_cpu); + +int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.q = q; + + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} +EXPORT_SYMBOL(wrmsrl_on_cpu); + +static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, + struct msr *msrs, + void (*msr_func) (void *info)) +{ + struct msr_info rv; + int this_cpu; + + memset(&rv, 0, sizeof(rv)); + + rv.msrs = msrs; + rv.msr_no = msr_no; + + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + msr_func(&rv); + + smp_call_function_many(mask, msr_func, &rv, 1); + put_cpu(); +} + +/* rdmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); +} +EXPORT_SYMBOL(rdmsr_on_cpus); + +/* + * wrmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); +} +EXPORT_SYMBOL(wrmsr_on_cpus); + +/* These "safe" variants are slower and should be used when the target MSR + may not actually exist. */ +static void __rdmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); +} + +static void __wrmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); +} + +int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_on_cpu); + +int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_on_cpu); + +int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.q = q; + + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsrl_safe_on_cpu); + +int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *q = rv.reg.q; + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsrl_safe_on_cpu); + +/* + * These variants are significantly slower, but allows control over + * the entire 32-bit GPR set. + */ +static void __rdmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = rdmsr_safe_regs(rv->regs); +} + +static void __wrmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = wrmsr_safe_regs(rv->regs); +} + +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); + +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c new file mode 100644 index 00000000000..43623739c7c --- /dev/null +++ b/arch/x86/lib/msr.c @@ -0,0 +1,110 @@ +#include <linux/module.h> +#include <linux/preempt.h> +#include <asm/msr.h> + +struct msr *msrs_alloc(void) +{ + struct msr *msrs = NULL; + + msrs = alloc_percpu(struct msr); + if (!msrs) { + pr_warn("%s: error allocating msrs\n", __func__); + return NULL; + } + + return msrs; +} +EXPORT_SYMBOL(msrs_alloc); + +void msrs_free(struct msr *msrs) +{ + free_percpu(msrs); +} +EXPORT_SYMBOL(msrs_free); + +/** + * Read an MSR with error handling + * + * @msr: MSR to read + * @m: value to read into + * + * It returns read data only on success, otherwise it doesn't change the output + * argument @m. + * + */ +int msr_read(u32 msr, struct msr *m) +{ + int err; + u64 val; + + err = rdmsrl_safe(msr, &val); + if (!err) + m->q = val; + + return err; +} + +/** + * Write an MSR with error handling + * + * @msr: MSR to write + * @m: value to write + */ +int msr_write(u32 msr, struct msr *m) +{ + return wrmsrl_safe(msr, m->q); +} + +static inline int __flip_bit(u32 msr, u8 bit, bool set) +{ + struct msr m, m1; + int err = -EINVAL; + + if (bit > 63) + return err; + + err = msr_read(msr, &m); + if (err) + return err; + + m1 = m; + if (set) + m1.q |= BIT_64(bit); + else + m1.q &= ~BIT_64(bit); + + if (m1.q == m.q) + return 0; + + err = msr_write(msr, &m1); + if (err) + return err; + + return 1; +} + +/** + * Set @bit in a MSR @msr. + * + * Retval: + * < 0: An error was encountered. + * = 0: Bit was already set. + * > 0: Hardware accepted the MSR write. + */ +int msr_set_bit(u32 msr, u8 bit) +{ + return __flip_bit(msr, bit, true); +} + +/** + * Clear @bit in a MSR @msr. + * + * Retval: + * < 0: An error was encountered. + * = 0: Bit was already cleared. + * > 0: Hardware accepted the MSR write. + */ +int msr_clear_bit(u32 msr, u8 bit) +{ + return __flip_bit(msr, bit, false); +} diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser.S index f58fba109d1..fc6ba17a7ee 100644 --- a/arch/x86/lib/putuser_32.S +++ b/arch/x86/lib/putuser.S @@ -2,6 +2,8 @@ * __put_user functions. * * (C) Copyright 2005 Linus Torvalds + * (C) Copyright 2005 Andi Kleen + * (C) Copyright 2008 Glauber Costa * * These functions have a non-standard call interface * to make them more efficient, especially as they @@ -11,6 +13,9 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/thread_info.h> +#include <asm/errno.h> +#include <asm/asm.h> +#include <asm/smap.h> /* @@ -26,73 +31,71 @@ */ #define ENTER CFI_STARTPROC ; \ - pushl %ebx ; \ - CFI_ADJUST_CFA_OFFSET 4 ; \ - CFI_REL_OFFSET ebx, 0 ; \ - GET_THREAD_INFO(%ebx) -#define EXIT popl %ebx ; \ - CFI_ADJUST_CFA_OFFSET -4 ; \ - CFI_RESTORE ebx ; \ - ret ; \ + GET_THREAD_INFO(%_ASM_BX) +#define EXIT ASM_CLAC ; \ + ret ; \ CFI_ENDPROC .text ENTRY(__put_user_1) ENTER - cmpl TI_addr_limit(%ebx),%ecx + cmp TI_addr_limit(%_ASM_BX),%_ASM_CX jae bad_put_user -1: movb %al,(%ecx) - xorl %eax,%eax + ASM_STAC +1: movb %al,(%_ASM_CX) + xor %eax,%eax EXIT ENDPROC(__put_user_1) ENTRY(__put_user_2) ENTER - movl TI_addr_limit(%ebx),%ebx - subl $1,%ebx - cmpl %ebx,%ecx + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $1,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX jae bad_put_user -2: movw %ax,(%ecx) - xorl %eax,%eax + ASM_STAC +2: movw %ax,(%_ASM_CX) + xor %eax,%eax EXIT ENDPROC(__put_user_2) ENTRY(__put_user_4) ENTER - movl TI_addr_limit(%ebx),%ebx - subl $3,%ebx - cmpl %ebx,%ecx + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $3,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX jae bad_put_user -3: movl %eax,(%ecx) - xorl %eax,%eax + ASM_STAC +3: movl %eax,(%_ASM_CX) + xor %eax,%eax EXIT ENDPROC(__put_user_4) ENTRY(__put_user_8) ENTER - movl TI_addr_limit(%ebx),%ebx - subl $7,%ebx - cmpl %ebx,%ecx + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $7,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX jae bad_put_user -4: movl %eax,(%ecx) -5: movl %edx,4(%ecx) - xorl %eax,%eax + ASM_STAC +4: mov %_ASM_AX,(%_ASM_CX) +#ifdef CONFIG_X86_32 +5: movl %edx,4(%_ASM_CX) +#endif + xor %eax,%eax EXIT ENDPROC(__put_user_8) bad_put_user: - CFI_STARTPROC simple - CFI_DEF_CFA esp, 2*4 - CFI_OFFSET eip, -1*4 - CFI_OFFSET ebx, -2*4 - movl $-14,%eax + CFI_STARTPROC + movl $-EFAULT,%eax EXIT END(bad_put_user) -.section __ex_table,"a" - .long 1b,bad_put_user - .long 2b,bad_put_user - .long 3b,bad_put_user - .long 4b,bad_put_user - .long 5b,bad_put_user -.previous + _ASM_EXTABLE(1b,bad_put_user) + _ASM_EXTABLE(2b,bad_put_user) + _ASM_EXTABLE(3b,bad_put_user) + _ASM_EXTABLE(4b,bad_put_user) +#ifdef CONFIG_X86_32 + _ASM_EXTABLE(5b,bad_put_user) +#endif diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S deleted file mode 100644 index 4989f5a8fa9..00000000000 --- a/arch/x86/lib/putuser_64.S +++ /dev/null @@ -1,106 +0,0 @@ -/* - * __put_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * (C) Copyright 2005 Andi Kleen - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ - -/* - * __put_user_X - * - * Inputs: %rcx contains the address - * %rdx contains new value - * - * Outputs: %rax is error code (0 or -EFAULT) - * - * %r8 is destroyed. - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/page.h> -#include <asm/errno.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> - - .text -ENTRY(__put_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - cmpq threadinfo_addr_limit(%r8),%rcx - jae bad_put_user -1: movb %dl,(%rcx) - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__put_user_1) - -ENTRY(__put_user_2) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $1,%rcx - jc 20f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 20f - decq %rcx -2: movw %dx,(%rcx) - xorl %eax,%eax - ret -20: decq %rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_2) - -ENTRY(__put_user_4) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $3,%rcx - jc 30f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 30f - subq $3,%rcx -3: movl %edx,(%rcx) - xorl %eax,%eax - ret -30: subq $3,%rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_4) - -ENTRY(__put_user_8) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $7,%rcx - jc 40f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 40f - subq $7,%rcx -4: movq %rdx,(%rcx) - xorl %eax,%eax - ret -40: subq $7,%rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_8) - -bad_put_user: - CFI_STARTPROC - movq $(-EFAULT),%rax - ret - CFI_ENDPROC -END(bad_put_user) - -.section __ex_table,"a" - .quad 1b,bad_put_user - .quad 2b,bad_put_user - .quad 3b,bad_put_user - .quad 4b,bad_put_user -.previous diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S new file mode 100644 index 00000000000..1cad22139c8 --- /dev/null +++ b/arch/x86/lib/rwlock.S @@ -0,0 +1,44 @@ +/* Slow paths of read/write spinlocks. */ + +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/rwlock.h> + +#ifdef CONFIG_X86_32 +# define __lock_ptr eax +#else +# define __lock_ptr rdi +#endif + +ENTRY(__write_lock_failed) + CFI_STARTPROC + FRAME +0: LOCK_PREFIX + WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr) +1: rep; nop + cmpl $WRITE_LOCK_CMP, (%__lock_ptr) + jne 1b + LOCK_PREFIX + WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr) + jnz 0b + ENDFRAME + ret + CFI_ENDPROC +END(__write_lock_failed) + +ENTRY(__read_lock_failed) + CFI_STARTPROC + FRAME +0: LOCK_PREFIX + READ_LOCK_SIZE(inc) (%__lock_ptr) +1: rep; nop + READ_LOCK_SIZE(cmp) $1, (%__lock_ptr) + js 1b + LOCK_PREFIX + READ_LOCK_SIZE(dec) (%__lock_ptr) + js 0b + ENDFRAME + ret + CFI_ENDPROC +END(__read_lock_failed) diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S deleted file mode 100644 index 05ea55f7140..00000000000 --- a/arch/x86/lib/rwlock_64.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Slow paths of read/write spinlocks. */ - -#include <linux/linkage.h> -#include <asm/rwlock.h> -#include <asm/alternative-asm.h> -#include <asm/dwarf2.h> - -/* rdi: pointer to rwlock_t */ -ENTRY(__write_lock_failed) - CFI_STARTPROC - LOCK_PREFIX - addl $RW_LOCK_BIAS,(%rdi) -1: rep - nop - cmpl $RW_LOCK_BIAS,(%rdi) - jne 1b - LOCK_PREFIX - subl $RW_LOCK_BIAS,(%rdi) - jnz __write_lock_failed - ret - CFI_ENDPROC -END(__write_lock_failed) - -/* rdi: pointer to rwlock_t */ -ENTRY(__read_lock_failed) - CFI_STARTPROC - LOCK_PREFIX - incl (%rdi) -1: rep - nop - cmpl $1,(%rdi) - js 1b - LOCK_PREFIX - decl (%rdi) - js __read_lock_failed - ret - CFI_ENDPROC -END(__read_lock_failed) diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S new file mode 100644 index 00000000000..5dff5f04246 --- /dev/null +++ b/arch/x86/lib/rwsem.S @@ -0,0 +1,136 @@ +/* + * x86 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> + */ + +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/dwarf2.h> + +#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) +#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) + +#ifdef CONFIG_X86_32 + +/* + * The semaphore operations have a special calling sequence that + * allow us to do a simpler in-line version of them. These routines + * need to convert that sequence back into the C sequence when + * there is contention on the semaphore. + * + * %eax contains the semaphore pointer on entry. Save the C-clobbered + * registers (%eax, %edx and %ecx) except %eax whish is either a return + * value or just clobbered.. + */ + +#define save_common_regs \ + pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 + +#define restore_common_regs \ + popl_cfi %ecx; CFI_RESTORE ecx + + /* Avoid uglifying the argument copying x86-64 needs to do. */ + .macro movq src, dst + .endm + +#else + +/* + * x86-64 rwsem wrappers + * + * This interfaces the inline asm code to the slow-path + * C routines. We need to save the call-clobbered regs + * that the asm does not mark as clobbered, and move the + * argument from %rax to %rdi. + * + * NOTE! We don't need to save %rax, because the functions + * will always return the semaphore pointer in %rax (which + * is also the input argument to these helpers) + * + * The following can clobber %rdx because the asm clobbers it: + * call_rwsem_down_write_failed + * call_rwsem_wake + * but %rdi, %rsi, %rcx, %r8-r11 always need saving. + */ + +#define save_common_regs \ + pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ + pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ + pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ + pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ + pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ + pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ + pushq_cfi %r11; CFI_REL_OFFSET r11, 0 + +#define restore_common_regs \ + popq_cfi %r11; CFI_RESTORE r11; \ + popq_cfi %r10; CFI_RESTORE r10; \ + popq_cfi %r9; CFI_RESTORE r9; \ + popq_cfi %r8; CFI_RESTORE r8; \ + popq_cfi %rcx; CFI_RESTORE rcx; \ + popq_cfi %rsi; CFI_RESTORE rsi; \ + popq_cfi %rdi; CFI_RESTORE rdi + +#endif + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + CFI_STARTPROC + save_common_regs + __ASM_SIZE(push,_cfi) %__ASM_REG(dx) + CFI_REL_OFFSET __ASM_REG(dx), 0 + movq %rax,%rdi + call rwsem_down_read_failed + __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) + CFI_RESTORE __ASM_REG(dx) + restore_common_regs + ret + CFI_ENDPROC +ENDPROC(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + CFI_STARTPROC + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed + restore_common_regs + ret + CFI_ENDPROC +ENDPROC(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + CFI_STARTPROC + /* do nothing if still outstanding active readers */ + __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) + jnz 1f + save_common_regs + movq %rax,%rdi + call rwsem_wake + restore_common_regs +1: ret + CFI_ENDPROC +ENDPROC(call_rwsem_wake) + +ENTRY(call_rwsem_downgrade_wake) + CFI_STARTPROC + save_common_regs + __ASM_SIZE(push,_cfi) %__ASM_REG(dx) + CFI_REL_OFFSET __ASM_REG(dx), 0 + movq %rax,%rdi + call rwsem_downgrade_wake + __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) + CFI_RESTORE __ASM_REG(dx) + restore_common_regs + ret + CFI_ENDPROC +ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S deleted file mode 100644 index 3899bd37fdf..00000000000 --- a/arch/x86/lib/semaphore_32.S +++ /dev/null @@ -1,219 +0,0 @@ -/* - * i386 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> - */ - -#include <linux/linkage.h> -#include <asm/rwlock.h> -#include <asm/alternative-asm.h> -#include <asm/frame.h> -#include <asm/dwarf2.h> - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. - */ - .section .sched.text, "ax" -ENTRY(__down_failed) - CFI_STARTPROC - FRAME - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call __down - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE edx - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__down_failed) - -ENTRY(__down_failed_interruptible) - CFI_STARTPROC - FRAME - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call __down_interruptible - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE edx - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__down_failed_interruptible) - -ENTRY(__down_failed_trylock) - CFI_STARTPROC - FRAME - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call __down_trylock - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE edx - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__down_failed_trylock) - -ENTRY(__up_wakeup) - CFI_STARTPROC - FRAME - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call __up - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE edx - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__up_wakeup) - -/* - * rw spinlock fallbacks - */ -#ifdef CONFIG_SMP -ENTRY(__write_lock_failed) - CFI_STARTPROC simple - FRAME -2: LOCK_PREFIX - addl $ RW_LOCK_BIAS,(%eax) -1: rep; nop - cmpl $ RW_LOCK_BIAS,(%eax) - jne 1b - LOCK_PREFIX - subl $ RW_LOCK_BIAS,(%eax) - jnz 2b - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__write_lock_failed) - -ENTRY(__read_lock_failed) - CFI_STARTPROC - FRAME -2: LOCK_PREFIX - incl (%eax) -1: rep; nop - cmpl $1,(%eax) - js 1b - LOCK_PREFIX - decl (%eax) - js 2b - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__read_lock_failed) - -#endif - -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - call rwsem_down_read_failed - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 - ret - CFI_ENDPROC - ENDPROC(call_rwsem_down_read_failed) - -ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - calll rwsem_down_write_failed - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 - ret - CFI_ENDPROC - ENDPROC(call_rwsem_down_write_failed) - -ENTRY(call_rwsem_wake) - CFI_STARTPROC - decw %dx /* do nothing if still outstanding active readers */ - jnz 1f - push %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call rwsem_wake - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 -1: ret - CFI_ENDPROC - ENDPROC(call_rwsem_wake) - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - call rwsem_downgrade_wake - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 - ret - CFI_ENDPROC - ENDPROC(call_rwsem_downgrade_wake) - -#endif diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index c2c0504a307..bd59090825d 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -14,25 +14,25 @@ #include <linux/module.h> #ifdef __HAVE_ARCH_STRCPY -char *strcpy(char * dest,const char *src) +char *strcpy(char *dest, const char *src) { int d0, d1, d2; - asm volatile( "1:\tlodsb\n\t" + asm volatile("1:\tlodsb\n\t" "stosb\n\t" "testb %%al,%%al\n\t" "jne 1b" : "=&S" (d0), "=&D" (d1), "=&a" (d2) - :"0" (src),"1" (dest) : "memory"); + : "0" (src), "1" (dest) : "memory"); return dest; } EXPORT_SYMBOL(strcpy); #endif #ifdef __HAVE_ARCH_STRNCPY -char *strncpy(char * dest,const char *src,size_t count) +char *strncpy(char *dest, const char *src, size_t count) { int d0, d1, d2, d3; - asm volatile( "1:\tdecl %2\n\t" + asm volatile("1:\tdecl %2\n\t" "js 2f\n\t" "lodsb\n\t" "stosb\n\t" @@ -42,17 +42,17 @@ char *strncpy(char * dest,const char *src,size_t count) "stosb\n" "2:" : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) - :"0" (src),"1" (dest),"2" (count) : "memory"); + : "0" (src), "1" (dest), "2" (count) : "memory"); return dest; } EXPORT_SYMBOL(strncpy); #endif #ifdef __HAVE_ARCH_STRCAT -char *strcat(char * dest,const char * src) +char *strcat(char *dest, const char *src) { int d0, d1, d2, d3; - asm volatile( "repne\n\t" + asm volatile("repne\n\t" "scasb\n\t" "decl %1\n" "1:\tlodsb\n\t" @@ -60,17 +60,17 @@ char *strcat(char * dest,const char * src) "testb %%al,%%al\n\t" "jne 1b" : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) - : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); + : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory"); return dest; } EXPORT_SYMBOL(strcat); #endif #ifdef __HAVE_ARCH_STRNCAT -char *strncat(char * dest,const char * src,size_t count) +char *strncat(char *dest, const char *src, size_t count) { int d0, d1, d2, d3; - asm volatile( "repne\n\t" + asm volatile("repne\n\t" "scasb\n\t" "decl %1\n\t" "movl %8,%3\n" @@ -83,7 +83,7 @@ char *strncat(char * dest,const char * src,size_t count) "2:\txorl %2,%2\n\t" "stosb" : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) - : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count) + : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu), "g" (count) : "memory"); return dest; } @@ -91,11 +91,11 @@ EXPORT_SYMBOL(strncat); #endif #ifdef __HAVE_ARCH_STRCMP -int strcmp(const char * cs,const char * ct) +int strcmp(const char *cs, const char *ct) { int d0, d1; int res; - asm volatile( "1:\tlodsb\n\t" + asm volatile("1:\tlodsb\n\t" "scasb\n\t" "jne 2f\n\t" "testb %%al,%%al\n\t" @@ -105,20 +105,20 @@ int strcmp(const char * cs,const char * ct) "2:\tsbbl %%eax,%%eax\n\t" "orb $1,%%al\n" "3:" - :"=a" (res), "=&S" (d0), "=&D" (d1) - :"1" (cs),"2" (ct) - :"memory"); + : "=a" (res), "=&S" (d0), "=&D" (d1) + : "1" (cs), "2" (ct) + : "memory"); return res; } EXPORT_SYMBOL(strcmp); #endif #ifdef __HAVE_ARCH_STRNCMP -int strncmp(const char * cs,const char * ct,size_t count) +int strncmp(const char *cs, const char *ct, size_t count) { int res; int d0, d1, d2; - asm volatile( "1:\tdecl %3\n\t" + asm volatile("1:\tdecl %3\n\t" "js 2f\n\t" "lodsb\n\t" "scasb\n\t" @@ -130,20 +130,20 @@ int strncmp(const char * cs,const char * ct,size_t count) "3:\tsbbl %%eax,%%eax\n\t" "orb $1,%%al\n" "4:" - :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) - :"1" (cs),"2" (ct),"3" (count) - :"memory"); + : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) + : "1" (cs), "2" (ct), "3" (count) + : "memory"); return res; } EXPORT_SYMBOL(strncmp); #endif #ifdef __HAVE_ARCH_STRCHR -char *strchr(const char * s, int c) +char *strchr(const char *s, int c) { int d0; - char * res; - asm volatile( "movb %%al,%%ah\n" + char *res; + asm volatile("movb %%al,%%ah\n" "1:\tlodsb\n\t" "cmpb %%ah,%%al\n\t" "je 2f\n\t" @@ -152,53 +152,51 @@ char *strchr(const char * s, int c) "movl $1,%1\n" "2:\tmovl %1,%0\n\t" "decl %0" - :"=a" (res), "=&S" (d0) - :"1" (s),"0" (c) - :"memory"); + : "=a" (res), "=&S" (d0) + : "1" (s), "0" (c) + : "memory"); return res; } EXPORT_SYMBOL(strchr); #endif #ifdef __HAVE_ARCH_STRLEN -size_t strlen(const char * s) +size_t strlen(const char *s) { int d0; - int res; - asm volatile( "repne\n\t" - "scasb\n\t" - "notl %0\n\t" - "decl %0" - :"=c" (res), "=&D" (d0) - :"1" (s),"a" (0), "0" (0xffffffffu) - :"memory"); - return res; + size_t res; + asm volatile("repne\n\t" + "scasb" + : "=c" (res), "=&D" (d0) + : "1" (s), "a" (0), "0" (0xffffffffu) + : "memory"); + return ~res - 1; } EXPORT_SYMBOL(strlen); #endif #ifdef __HAVE_ARCH_MEMCHR -void *memchr(const void *cs,int c,size_t count) +void *memchr(const void *cs, int c, size_t count) { int d0; void *res; if (!count) return NULL; - asm volatile( "repne\n\t" + asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" - :"=D" (res), "=&c" (d0) - :"a" (c),"0" (cs),"1" (count) - :"memory"); + : "=D" (res), "=&c" (d0) + : "a" (c), "0" (cs), "1" (count) + : "memory"); return res; } EXPORT_SYMBOL(memchr); #endif #ifdef __HAVE_ARCH_MEMSCAN -void *memscan(void * addr, int c, size_t size) +void *memscan(void *addr, int c, size_t size) { if (!size) return addr; @@ -219,7 +217,7 @@ size_t strnlen(const char *s, size_t count) { int d0; int res; - asm volatile( "movl %2,%0\n\t" + asm volatile("movl %2,%0\n\t" "jmp 2f\n" "1:\tcmpb $0,(%0)\n\t" "je 3f\n\t" @@ -228,9 +226,9 @@ size_t strnlen(const char *s, size_t count) "cmpl $-1,%1\n\t" "jne 1b\n" "3:\tsubl %2,%0" - :"=a" (res), "=&d" (d0) - :"c" (s),"1" (count) - :"memory"); + : "=a" (res), "=&d" (d0) + : "c" (s), "1" (count) + : "memory"); return res; } EXPORT_SYMBOL(strnlen); diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c index a3dafbf59da..8e2d55f754b 100644 --- a/arch/x86/lib/strstr_32.c +++ b/arch/x86/lib/strstr_32.c @@ -1,9 +1,9 @@ #include <linux/string.h> -char * strstr(const char * cs,const char * ct) +char *strstr(const char *cs, const char *ct) { int d0, d1; -register char * __res; +register char *__res; __asm__ __volatile__( "movl %6,%%edi\n\t" "repne\n\t" @@ -23,9 +23,9 @@ __asm__ __volatile__( "jne 1b\n\t" "xorl %%eax,%%eax\n\t" "2:" - :"=a" (__res), "=&c" (d0), "=&S" (d1) - :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) - :"dx", "di"); + : "=a" (__res), "=&c" (d0), "=&S" (d1) + : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) + : "dx", "di"); return __res; } diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S new file mode 100644 index 00000000000..28f85c91671 --- /dev/null +++ b/arch/x86/lib/thunk_32.S @@ -0,0 +1,30 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + #include <linux/linkage.h> + #include <asm/asm.h> + +#ifdef CONFIG_TRACE_IRQFLAGS + /* put return address in eax (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + call \func + popl %edx + popl %ecx + popl %eax + ret + _ASM_NOKPROBE(\name) + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index 8b92d428ab0..92d9feaff42 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -2,70 +2,47 @@ * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. * Subject to the GNU public license, v.2. No warranty of any kind. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/calling.h> +#include <asm/asm.h> - #include <linux/linkage.h> - #include <asm/dwarf2.h> - #include <asm/calling.h> - #include <asm/rwlock.h> - - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro thunk name,func + /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ + .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name -\name: +\name: CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore - CFI_ENDPROC - .endm - /* rdi: arg1 ... normal C conventions. rax is passed from C. */ - .macro thunk_retrax name,func - .globl \name -\name: - CFI_STARTPROC + /* this one pushes 9 elems, the next one would be %rIP */ SAVE_ARGS + + .if \put_ret_addr_in_rdi + movq_cfi_restore 9*8, rdi + .endif + call \func - jmp restore_norax + jmp restore CFI_ENDPROC + _ASM_NOKPROBE(\name) .endm - - - .section .sched.text, "ax" -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed - thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed - thunk rwsem_wake_thunk,rwsem_wake - thunk rwsem_downgrade_thunk,rwsem_downgrade_wake -#endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC - thunk lockdep_sys_exit_thunk,lockdep_sys_exit + THUNK lockdep_sys_exit_thunk,lockdep_sys_exit #endif - + /* SAVE_ARGS below is used only for the .cfi directives it contains. */ CFI_STARTPROC SAVE_ARGS restore: RESTORE_ARGS - ret - CFI_ENDPROC - - CFI_STARTPROC - SAVE_ARGS -restore_norax: - RESTORE_ARGS 1 ret CFI_ENDPROC + _ASM_NOKPROBE(restore) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c new file mode 100644 index 00000000000..ddf9ecb53cc --- /dev/null +++ b/arch/x86/lib/usercopy.c @@ -0,0 +1,36 @@ +/* + * User address space access functions. + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/highmem.h> +#include <linux/module.h> + +#include <asm/word-at-a-time.h> +#include <linux/sched.h> + +/* + * We rely on the nested NMI work to allow atomic faults from the NMI path; the + * nested NMI paths are careful to preserve CR2. + */ +unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long ret; + + if (__range_not_ok(from, n, TASK_SIZE)) + return 0; + + /* + * Even though this function is typically called from NMI/IRQ context + * disable pagefaults so that its behaviour is consistent even when + * called form other contexts. + */ + pagefault_disable(); + ret = __copy_from_user_inatomic(to, from, n); + pagefault_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(copy_from_user_nmi); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index e849b9998b0..e2f5e21c03b 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -1,4 +1,4 @@ -/* +/* * User address space access functions. * The non inlined parts of asm-i386/uaccess.h are here. * @@ -13,6 +13,14 @@ #include <linux/interrupt.h> #include <asm/uaccess.h> #include <asm/mmx.h> +#include <asm/asm.h> + +#ifdef CONFIG_X86_INTEL_USERCOPY +/* + * Alignment at which movsl is preferred for bulk memory copies. + */ +struct movsl_mask movsl_mask __read_mostly; +#endif static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) { @@ -22,95 +30,8 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon #endif return 1; } -#define movsl_is_ok(a1,a2,n) \ - __movsl_is_ok((unsigned long)(a1),(unsigned long)(a2),(n)) - -/* - * Copy a null terminated string from userspace. - */ - -#define __do_strncpy_from_user(dst,src,count,res) \ -do { \ - int __d0, __d1, __d2; \ - might_sleep(); \ - __asm__ __volatile__( \ - " testl %1,%1\n" \ - " jz 2f\n" \ - "0: lodsb\n" \ - " stosb\n" \ - " testb %%al,%%al\n" \ - " jz 1f\n" \ - " decl %1\n" \ - " jnz 0b\n" \ - "1: subl %1,%0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl %5,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(0b,3b) \ - : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ - "=&D" (__d2) \ - : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ - : "memory"); \ -} while (0) - -/** - * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * Caller must check the specified block with access_ok() before calling - * this function. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ -long -__strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -/** - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); +#define movsl_is_ok(a1, a2, n) \ + __movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n)) /* * Zero Userspace @@ -119,12 +40,13 @@ EXPORT_SYMBOL(strncpy_from_user); #define __do_clear_user(addr,size) \ do { \ int __d0; \ - might_sleep(); \ - __asm__ __volatile__( \ + might_fault(); \ + __asm__ __volatile__( \ + ASM_STAC "\n" \ "0: rep; stosl\n" \ " movl %2,%0\n" \ "1: rep; stosb\n" \ - "2:\n" \ + "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ @@ -148,7 +70,7 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { - might_sleep(); + might_fault(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; @@ -174,50 +96,6 @@ __clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(__clear_user); -/** - * strnlen_user: - Get the size of a string in user space. - * @s: The string to measure. - * @n: The maximum valid length - * - * Get the size of a NUL-terminated string in user space. - * - * Returns the size of the string INCLUDING the terminating NUL. - * On exception, returns 0. - * If the string is too long, returns a value greater than @n. - */ -long strnlen_user(const char __user *s, long n) -{ - unsigned long mask = -__addr_ok(s); - unsigned long res, tmp; - - might_sleep(); - - __asm__ __volatile__( - " testl %0, %0\n" - " jz 3f\n" - " andl %0,%%ecx\n" - "0: repne; scasb\n" - " setne %%al\n" - " subl %%ecx,%0\n" - " addl %0,%%eax\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: xorl %%eax,%%eax\n" - " jmp 1b\n" - "3: movb $1,%%al\n" - " jmp 1b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 0b,2b\n" - ".previous" - :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp) - :"0" (n), "1" (s), "2" (0), "3" (mask) - :"cc"); - return res & mask; -} -EXPORT_SYMBOL(strnlen_user); - #ifdef CONFIG_X86_INTEL_USERCOPY static unsigned long __copy_user_intel(void __user *to, const void *from, unsigned long size) @@ -279,47 +157,44 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) "101: lea 0(%%eax,%0,4),%0\n" " jmp 100b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b,100b\n" - " .long 2b,100b\n" - " .long 3b,100b\n" - " .long 4b,100b\n" - " .long 5b,100b\n" - " .long 6b,100b\n" - " .long 7b,100b\n" - " .long 8b,100b\n" - " .long 9b,100b\n" - " .long 10b,100b\n" - " .long 11b,100b\n" - " .long 12b,100b\n" - " .long 13b,100b\n" - " .long 14b,100b\n" - " .long 15b,100b\n" - " .long 16b,100b\n" - " .long 17b,100b\n" - " .long 18b,100b\n" - " .long 19b,100b\n" - " .long 20b,100b\n" - " .long 21b,100b\n" - " .long 22b,100b\n" - " .long 23b,100b\n" - " .long 24b,100b\n" - " .long 25b,100b\n" - " .long 26b,100b\n" - " .long 27b,100b\n" - " .long 28b,100b\n" - " .long 29b,100b\n" - " .long 30b,100b\n" - " .long 31b,100b\n" - " .long 32b,100b\n" - " .long 33b,100b\n" - " .long 34b,100b\n" - " .long 35b,100b\n" - " .long 36b,100b\n" - " .long 37b,100b\n" - " .long 99b,101b\n" - ".previous" + _ASM_EXTABLE(1b,100b) + _ASM_EXTABLE(2b,100b) + _ASM_EXTABLE(3b,100b) + _ASM_EXTABLE(4b,100b) + _ASM_EXTABLE(5b,100b) + _ASM_EXTABLE(6b,100b) + _ASM_EXTABLE(7b,100b) + _ASM_EXTABLE(8b,100b) + _ASM_EXTABLE(9b,100b) + _ASM_EXTABLE(10b,100b) + _ASM_EXTABLE(11b,100b) + _ASM_EXTABLE(12b,100b) + _ASM_EXTABLE(13b,100b) + _ASM_EXTABLE(14b,100b) + _ASM_EXTABLE(15b,100b) + _ASM_EXTABLE(16b,100b) + _ASM_EXTABLE(17b,100b) + _ASM_EXTABLE(18b,100b) + _ASM_EXTABLE(19b,100b) + _ASM_EXTABLE(20b,100b) + _ASM_EXTABLE(21b,100b) + _ASM_EXTABLE(22b,100b) + _ASM_EXTABLE(23b,100b) + _ASM_EXTABLE(24b,100b) + _ASM_EXTABLE(25b,100b) + _ASM_EXTABLE(26b,100b) + _ASM_EXTABLE(27b,100b) + _ASM_EXTABLE(28b,100b) + _ASM_EXTABLE(29b,100b) + _ASM_EXTABLE(30b,100b) + _ASM_EXTABLE(31b,100b) + _ASM_EXTABLE(32b,100b) + _ASM_EXTABLE(33b,100b) + _ASM_EXTABLE(34b,100b) + _ASM_EXTABLE(35b,100b) + _ASM_EXTABLE(36b,100b) + _ASM_EXTABLE(37b,100b) + _ASM_EXTABLE(99b,101b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -333,17 +208,17 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) __asm__ __volatile__( " .align 2,0x90\n" "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" - " jbe 2f\n" + " cmpl $67, %0\n" + " jbe 2f\n" "1: movl 64(%4), %%eax\n" - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" - " movl %%eax, 0(%3)\n" - " movl %%edx, 4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" - " movl %%eax, 8(%3)\n" + " .align 2,0x90\n" + "2: movl 0(%4), %%eax\n" + "21: movl 4(%4), %%edx\n" + " movl %%eax, 0(%3)\n" + " movl %%edx, 4(%3)\n" + "3: movl 8(%4), %%eax\n" + "31: movl 12(%4),%%edx\n" + " movl %%eax, 8(%3)\n" " movl %%edx, 12(%3)\n" "4: movl 16(%4), %%eax\n" "41: movl 20(%4), %%edx\n" @@ -369,52 +244,49 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) "91: movl 60(%4), %%edx\n" " movl %%eax, 56(%3)\n" " movl %%edx, 60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" - " cmpl $63, %0\n" - " ja 0b\n" - "5: movl %0, %%eax\n" - " shrl $2, %0\n" - " andl $3, %%eax\n" - " cld\n" - "6: rep; movsl\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" + " cmpl $63, %0\n" + " ja 0b\n" + "5: movl %0, %%eax\n" + " shrl $2, %0\n" + " andl $3, %%eax\n" + " cld\n" + "6: rep; movsl\n" " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" + "7: rep; movsb\n" + "8:\n" ".section .fixup,\"ax\"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: pushl %0\n" - " pushl %%eax\n" + "9: lea 0(%%eax,%0,4),%0\n" + "16: pushl %0\n" + " pushl %%eax\n" " xorl %%eax,%%eax\n" - " rep; stosb\n" - " popl %%eax\n" - " popl %0\n" - " jmp 8b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 0b,16b\n" - " .long 1b,16b\n" - " .long 2b,16b\n" - " .long 21b,16b\n" - " .long 3b,16b\n" - " .long 31b,16b\n" - " .long 4b,16b\n" - " .long 41b,16b\n" - " .long 10b,16b\n" - " .long 51b,16b\n" - " .long 11b,16b\n" - " .long 61b,16b\n" - " .long 12b,16b\n" - " .long 71b,16b\n" - " .long 13b,16b\n" - " .long 81b,16b\n" - " .long 14b,16b\n" - " .long 91b,16b\n" - " .long 6b,9b\n" - " .long 7b,16b\n" - ".previous" + " rep; stosb\n" + " popl %%eax\n" + " popl %0\n" + " jmp 8b\n" + ".previous\n" + _ASM_EXTABLE(0b,16b) + _ASM_EXTABLE(1b,16b) + _ASM_EXTABLE(2b,16b) + _ASM_EXTABLE(21b,16b) + _ASM_EXTABLE(3b,16b) + _ASM_EXTABLE(31b,16b) + _ASM_EXTABLE(4b,16b) + _ASM_EXTABLE(41b,16b) + _ASM_EXTABLE(10b,16b) + _ASM_EXTABLE(51b,16b) + _ASM_EXTABLE(11b,16b) + _ASM_EXTABLE(61b,16b) + _ASM_EXTABLE(12b,16b) + _ASM_EXTABLE(71b,16b) + _ASM_EXTABLE(13b,16b) + _ASM_EXTABLE(81b,16b) + _ASM_EXTABLE(14b,16b) + _ASM_EXTABLE(91b,16b) + _ASM_EXTABLE(6b,9b) + _ASM_EXTABLE(7b,16b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -429,7 +301,7 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) static unsigned long __copy_user_zeroing_intel_nocache(void *to, const void __user *from, unsigned long size) { - int d0, d1; + int d0, d1; __asm__ __volatile__( " .align 2,0x90\n" @@ -494,29 +366,26 @@ static unsigned long __copy_user_zeroing_intel_nocache(void *to, " popl %0\n" " jmp 8b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 0b,16b\n" - " .long 1b,16b\n" - " .long 2b,16b\n" - " .long 21b,16b\n" - " .long 3b,16b\n" - " .long 31b,16b\n" - " .long 4b,16b\n" - " .long 41b,16b\n" - " .long 10b,16b\n" - " .long 51b,16b\n" - " .long 11b,16b\n" - " .long 61b,16b\n" - " .long 12b,16b\n" - " .long 71b,16b\n" - " .long 13b,16b\n" - " .long 81b,16b\n" - " .long 14b,16b\n" - " .long 91b,16b\n" - " .long 6b,9b\n" - " .long 7b,16b\n" - ".previous" + _ASM_EXTABLE(0b,16b) + _ASM_EXTABLE(1b,16b) + _ASM_EXTABLE(2b,16b) + _ASM_EXTABLE(21b,16b) + _ASM_EXTABLE(3b,16b) + _ASM_EXTABLE(31b,16b) + _ASM_EXTABLE(4b,16b) + _ASM_EXTABLE(41b,16b) + _ASM_EXTABLE(10b,16b) + _ASM_EXTABLE(51b,16b) + _ASM_EXTABLE(11b,16b) + _ASM_EXTABLE(61b,16b) + _ASM_EXTABLE(12b,16b) + _ASM_EXTABLE(71b,16b) + _ASM_EXTABLE(13b,16b) + _ASM_EXTABLE(81b,16b) + _ASM_EXTABLE(14b,16b) + _ASM_EXTABLE(91b,16b) + _ASM_EXTABLE(6b,9b) + _ASM_EXTABLE(7b,16b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -526,7 +395,7 @@ static unsigned long __copy_user_zeroing_intel_nocache(void *to, static unsigned long __copy_user_intel_nocache(void *to, const void __user *from, unsigned long size) { - int d0, d1; + int d0, d1; __asm__ __volatile__( " .align 2,0x90\n" @@ -585,29 +454,26 @@ static unsigned long __copy_user_intel_nocache(void *to, "9: lea 0(%%eax,%0,4),%0\n" "16: jmp 8b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 0b,16b\n" - " .long 1b,16b\n" - " .long 2b,16b\n" - " .long 21b,16b\n" - " .long 3b,16b\n" - " .long 31b,16b\n" - " .long 4b,16b\n" - " .long 41b,16b\n" - " .long 10b,16b\n" - " .long 51b,16b\n" - " .long 11b,16b\n" - " .long 61b,16b\n" - " .long 12b,16b\n" - " .long 71b,16b\n" - " .long 13b,16b\n" - " .long 81b,16b\n" - " .long 14b,16b\n" - " .long 91b,16b\n" - " .long 6b,9b\n" - " .long 7b,16b\n" - ".previous" + _ASM_EXTABLE(0b,16b) + _ASM_EXTABLE(1b,16b) + _ASM_EXTABLE(2b,16b) + _ASM_EXTABLE(21b,16b) + _ASM_EXTABLE(3b,16b) + _ASM_EXTABLE(31b,16b) + _ASM_EXTABLE(4b,16b) + _ASM_EXTABLE(41b,16b) + _ASM_EXTABLE(10b,16b) + _ASM_EXTABLE(51b,16b) + _ASM_EXTABLE(11b,16b) + _ASM_EXTABLE(61b,16b) + _ASM_EXTABLE(12b,16b) + _ASM_EXTABLE(71b,16b) + _ASM_EXTABLE(13b,16b) + _ASM_EXTABLE(81b,16b) + _ASM_EXTABLE(14b,16b) + _ASM_EXTABLE(91b,16b) + _ASM_EXTABLE(6b,9b) + _ASM_EXTABLE(7b,16b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -629,7 +495,7 @@ unsigned long __copy_user_zeroing_intel_nocache(void *to, #endif /* CONFIG_X86_INTEL_USERCOPY */ /* Generic arbitrary sized copy. */ -#define __copy_user(to,from,size) \ +#define __copy_user(to, from, size) \ do { \ int __d0, __d1, __d2; \ __asm__ __volatile__( \ @@ -654,18 +520,15 @@ do { \ "3: lea 0(%3,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 4b,5b\n" \ - " .long 0b,3b\n" \ - " .long 1b,2b\n" \ - ".previous" \ + _ASM_EXTABLE(4b,5b) \ + _ASM_EXTABLE(0b,3b) \ + _ASM_EXTABLE(1b,2b) \ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ : "3"(size), "0"(size), "1"(to), "2"(from) \ : "memory"); \ } while (0) -#define __copy_user_zeroing(to,from,size) \ +#define __copy_user_zeroing(to, from, size) \ do { \ int __d0, __d1, __d2; \ __asm__ __volatile__( \ @@ -696,12 +559,9 @@ do { \ " popl %0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 4b,5b\n" \ - " .long 0b,3b\n" \ - " .long 1b,6b\n" \ - ".previous" \ + _ASM_EXTABLE(4b,5b) \ + _ASM_EXTABLE(0b,3b) \ + _ASM_EXTABLE(1b,6b) \ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ : "3"(size), "0"(size), "1"(to), "2"(from) \ : "memory"); \ @@ -710,67 +570,12 @@ do { \ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) { -#ifndef CONFIG_X86_WP_WORKS_OK - if (unlikely(boot_cpu_data.wp_works_ok == 0) && - ((unsigned long )to) < TASK_SIZE) { - /* - * When we are in an atomic section (see - * mm/filemap.c:file_read_actor), return the full - * length to take the slow path. - */ - if (in_atomic()) - return n; - - /* - * CPU does not honor the WP bit when writing - * from supervisory mode, and due to preemption or SMP, - * the page tables can change at any time. - * Do it manually. Manfred <manfred@colorfullife.com> - */ - while (n) { - unsigned long offset = ((unsigned long)to)%PAGE_SIZE; - unsigned long len = PAGE_SIZE - offset; - int retval; - struct page *pg; - void *maddr; - - if (len > n) - len = n; - -survive: - down_read(¤t->mm->mmap_sem); - retval = get_user_pages(current, current->mm, - (unsigned long )to, 1, 1, 0, &pg, NULL); - - if (retval == -ENOMEM && is_global_init(current)) { - up_read(¤t->mm->mmap_sem); - congestion_wait(WRITE, HZ/50); - goto survive; - } - - if (retval != 1) { - up_read(¤t->mm->mmap_sem); - break; - } - - maddr = kmap_atomic(pg, KM_USER0); - memcpy(maddr + offset, from, len); - kunmap_atomic(maddr, KM_USER0); - set_page_dirty_lock(pg); - put_page(pg); - up_read(¤t->mm->mmap_sem); - - from += len; - to += len; - n -= len; - } - return n; - } -#endif + stac(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else n = __copy_user_intel(to, from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_to_user_ll); @@ -778,10 +583,12 @@ EXPORT_SYMBOL(__copy_to_user_ll); unsigned long __copy_from_user_ll(void *to, const void __user *from, unsigned long n) { + stac(); if (movsl_is_ok(to, from, n)) __copy_user_zeroing(to, from, n); else n = __copy_user_zeroing_intel(to, from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll); @@ -789,11 +596,13 @@ EXPORT_SYMBOL(__copy_from_user_ll); unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, unsigned long n) { + stac(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else n = __copy_user_intel((void __user *)to, (const void *)from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nozero); @@ -801,14 +610,16 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero); unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, unsigned long n) { + stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if ( n > 64 && cpu_has_xmm2) - n = __copy_user_zeroing_intel_nocache(to, from, n); + if (n > 64 && cpu_has_xmm2) + n = __copy_user_zeroing_intel_nocache(to, from, n); else __copy_user_zeroing(to, from, n); #else - __copy_user_zeroing(to, from, n); + __copy_user_zeroing(to, from, n); #endif + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache); @@ -816,14 +627,16 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache); unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) { + stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if ( n > 64 && cpu_has_xmm2) - n = __copy_user_intel_nocache(to, from, n); + if (n > 64 && cpu_has_xmm2) + n = __copy_user_intel_nocache(to, from, n); else __copy_user(to, from, n); #else - __copy_user(to, from, n); + __copy_user(to, from, n); #endif + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); @@ -841,14 +654,13 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); * Returns number of bytes that could not be copied. * On success, this will be zero. */ -unsigned long -copy_to_user(void __user *to, const void *from, unsigned long n) +unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) { if (access_ok(VERIFY_WRITE, to, n)) n = __copy_to_user(to, from, n); return n; } -EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(_copy_to_user); /** * copy_from_user: - Copy a block of data from user space. @@ -866,8 +678,7 @@ EXPORT_SYMBOL(copy_to_user); * If some data could not be copied, this function will pad the copied * data to the requested size using zero bytes. */ -unsigned long -copy_from_user(void *to, const void __user *from, unsigned long n) +unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) { if (access_ok(VERIFY_READ, from, n)) n = __copy_from_user(to, from, n); @@ -875,4 +686,4 @@ copy_from_user(void *to, const void __user *from, unsigned long n) memset(to, 0, n); return n; } -EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0c89d1bb028..c905e89e19f 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -9,64 +9,16 @@ #include <asm/uaccess.h> /* - * Copy a null terminated string from userspace. - */ - -#define __do_strncpy_from_user(dst,src,count,res) \ -do { \ - long __d0, __d1, __d2; \ - might_sleep(); \ - __asm__ __volatile__( \ - " testq %1,%1\n" \ - " jz 2f\n" \ - "0: lodsb\n" \ - " stosb\n" \ - " testb %%al,%%al\n" \ - " jz 1f\n" \ - " decq %1\n" \ - " jnz 0b\n" \ - "1: subq %1,%0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movq %5,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(0b,3b) \ - : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ - "=&D" (__d2) \ - : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ - : "memory"); \ -} while (0) - -long -__strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) - return __strncpy_from_user(dst, src, count); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); - -/* * Zero Userspace */ unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; - might_sleep(); + might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ + stac(); asm volatile( " testq %[size8],%[size8]\n" " jz 4f\n" @@ -86,9 +38,10 @@ unsigned long __clear_user(void __user *addr, unsigned long size) ".previous\n" _ASM_EXTABLE(0b,3b) _ASM_EXTABLE(1b,2b) - : [size8] "=c"(size), [dst] "=&D" (__d0) + : [size8] "=&c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), [zero] "r" (0UL), [eight] "r" (8UL)); + clac(); return size; } EXPORT_SYMBOL(__clear_user); @@ -101,54 +54,6 @@ unsigned long clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(clear_user); -/* - * Return the size of a string (including the ending 0) - * - * Return 0 on exception, a value greater than N if too long - */ - -long __strnlen_user(const char __user *s, long n) -{ - long res = 0; - char c; - - while (1) { - if (res>n) - return n+1; - if (__get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(__strnlen_user); - -long strnlen_user(const char __user *s, long n) -{ - if (!access_ok(VERIFY_READ, s, n)) - return 0; - return __strnlen_user(s, n); -} -EXPORT_SYMBOL(strnlen_user); - -long strlen_user(const char __user *s) -{ - long res = 0; - char c; - - for (;;) { - if (get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(strlen_user); - unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) { if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { @@ -158,3 +63,27 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le } EXPORT_SYMBOL(copy_in_user); +/* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. + */ +__visible unsigned long +copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) +{ + char c; + unsigned zero_len; + + for (; len; --len, to++) { + if (__get_user_nocheck(c, from++, sizeof(char))) + break; + if (__put_user_nocheck(c, to, sizeof(char))) + break; + } + + for (c = 0, zero_len = len; zerorest && zero_len; --zero_len) + if (__put_user_nocheck(c, to++, sizeof(char))) + break; + clac(); + return len; +} diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt new file mode 100644 index 00000000000..1a2be7c6895 --- /dev/null +++ b/arch/x86/lib/x86-opcode-map.txt @@ -0,0 +1,961 @@ +# x86 Opcode Maps +# +# This is (mostly) based on following documentations. +# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C +# (#326018-047US, June 2013) +# +#<Opcode maps> +# Table: table-name +# Referrer: escaped-name +# AVXcode: avx-code +# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# (or) +# opcode: escape # escaped-name +# EndTable +# +#<group maps> +# GrpTable: GrpXXX +# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# EndTable +# +# AVX Superscripts +# (v): this opcode requires VEX prefix. +# (v1): this opcode only supports 128bit VEX. +# +# Last Prefix Superscripts +# - (66): the last prefix is 0x66 +# - (F3): the last prefix is 0xF3 +# - (F2): the last prefix is 0xF2 +# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) +# - (66&F2): Both 0x66 and 0xF2 prefixes are specified. + +Table: one byte opcode +Referrer: +AVXcode: +# 0x00 - 0x0f +00: ADD Eb,Gb +01: ADD Ev,Gv +02: ADD Gb,Eb +03: ADD Gv,Ev +04: ADD AL,Ib +05: ADD rAX,Iz +06: PUSH ES (i64) +07: POP ES (i64) +08: OR Eb,Gb +09: OR Ev,Gv +0a: OR Gb,Eb +0b: OR Gv,Ev +0c: OR AL,Ib +0d: OR rAX,Iz +0e: PUSH CS (i64) +0f: escape # 2-byte escape +# 0x10 - 0x1f +10: ADC Eb,Gb +11: ADC Ev,Gv +12: ADC Gb,Eb +13: ADC Gv,Ev +14: ADC AL,Ib +15: ADC rAX,Iz +16: PUSH SS (i64) +17: POP SS (i64) +18: SBB Eb,Gb +19: SBB Ev,Gv +1a: SBB Gb,Eb +1b: SBB Gv,Ev +1c: SBB AL,Ib +1d: SBB rAX,Iz +1e: PUSH DS (i64) +1f: POP DS (i64) +# 0x20 - 0x2f +20: AND Eb,Gb +21: AND Ev,Gv +22: AND Gb,Eb +23: AND Gv,Ev +24: AND AL,Ib +25: AND rAx,Iz +26: SEG=ES (Prefix) +27: DAA (i64) +28: SUB Eb,Gb +29: SUB Ev,Gv +2a: SUB Gb,Eb +2b: SUB Gv,Ev +2c: SUB AL,Ib +2d: SUB rAX,Iz +2e: SEG=CS (Prefix) +2f: DAS (i64) +# 0x30 - 0x3f +30: XOR Eb,Gb +31: XOR Ev,Gv +32: XOR Gb,Eb +33: XOR Gv,Ev +34: XOR AL,Ib +35: XOR rAX,Iz +36: SEG=SS (Prefix) +37: AAA (i64) +38: CMP Eb,Gb +39: CMP Ev,Gv +3a: CMP Gb,Eb +3b: CMP Gv,Ev +3c: CMP AL,Ib +3d: CMP rAX,Iz +3e: SEG=DS (Prefix) +3f: AAS (i64) +# 0x40 - 0x4f +40: INC eAX (i64) | REX (o64) +41: INC eCX (i64) | REX.B (o64) +42: INC eDX (i64) | REX.X (o64) +43: INC eBX (i64) | REX.XB (o64) +44: INC eSP (i64) | REX.R (o64) +45: INC eBP (i64) | REX.RB (o64) +46: INC eSI (i64) | REX.RX (o64) +47: INC eDI (i64) | REX.RXB (o64) +48: DEC eAX (i64) | REX.W (o64) +49: DEC eCX (i64) | REX.WB (o64) +4a: DEC eDX (i64) | REX.WX (o64) +4b: DEC eBX (i64) | REX.WXB (o64) +4c: DEC eSP (i64) | REX.WR (o64) +4d: DEC eBP (i64) | REX.WRB (o64) +4e: DEC eSI (i64) | REX.WRX (o64) +4f: DEC eDI (i64) | REX.WRXB (o64) +# 0x50 - 0x5f +50: PUSH rAX/r8 (d64) +51: PUSH rCX/r9 (d64) +52: PUSH rDX/r10 (d64) +53: PUSH rBX/r11 (d64) +54: PUSH rSP/r12 (d64) +55: PUSH rBP/r13 (d64) +56: PUSH rSI/r14 (d64) +57: PUSH rDI/r15 (d64) +58: POP rAX/r8 (d64) +59: POP rCX/r9 (d64) +5a: POP rDX/r10 (d64) +5b: POP rBX/r11 (d64) +5c: POP rSP/r12 (d64) +5d: POP rBP/r13 (d64) +5e: POP rSI/r14 (d64) +5f: POP rDI/r15 (d64) +# 0x60 - 0x6f +60: PUSHA/PUSHAD (i64) +61: POPA/POPAD (i64) +62: BOUND Gv,Ma (i64) +63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) +64: SEG=FS (Prefix) +65: SEG=GS (Prefix) +66: Operand-Size (Prefix) +67: Address-Size (Prefix) +68: PUSH Iz (d64) +69: IMUL Gv,Ev,Iz +6a: PUSH Ib (d64) +6b: IMUL Gv,Ev,Ib +6c: INS/INSB Yb,DX +6d: INS/INSW/INSD Yz,DX +6e: OUTS/OUTSB DX,Xb +6f: OUTS/OUTSW/OUTSD DX,Xz +# 0x70 - 0x7f +70: JO Jb +71: JNO Jb +72: JB/JNAE/JC Jb +73: JNB/JAE/JNC Jb +74: JZ/JE Jb +75: JNZ/JNE Jb +76: JBE/JNA Jb +77: JNBE/JA Jb +78: JS Jb +79: JNS Jb +7a: JP/JPE Jb +7b: JNP/JPO Jb +7c: JL/JNGE Jb +7d: JNL/JGE Jb +7e: JLE/JNG Jb +7f: JNLE/JG Jb +# 0x80 - 0x8f +80: Grp1 Eb,Ib (1A) +81: Grp1 Ev,Iz (1A) +82: Grp1 Eb,Ib (1A),(i64) +83: Grp1 Ev,Ib (1A) +84: TEST Eb,Gb +85: TEST Ev,Gv +86: XCHG Eb,Gb +87: XCHG Ev,Gv +88: MOV Eb,Gb +89: MOV Ev,Gv +8a: MOV Gb,Eb +8b: MOV Gv,Ev +8c: MOV Ev,Sw +8d: LEA Gv,M +8e: MOV Sw,Ew +8f: Grp1A (1A) | POP Ev (d64) +# 0x90 - 0x9f +90: NOP | PAUSE (F3) | XCHG r8,rAX +91: XCHG rCX/r9,rAX +92: XCHG rDX/r10,rAX +93: XCHG rBX/r11,rAX +94: XCHG rSP/r12,rAX +95: XCHG rBP/r13,rAX +96: XCHG rSI/r14,rAX +97: XCHG rDI/r15,rAX +98: CBW/CWDE/CDQE +99: CWD/CDQ/CQO +9a: CALLF Ap (i64) +9b: FWAIT/WAIT +9c: PUSHF/D/Q Fv (d64) +9d: POPF/D/Q Fv (d64) +9e: SAHF +9f: LAHF +# 0xa0 - 0xaf +a0: MOV AL,Ob +a1: MOV rAX,Ov +a2: MOV Ob,AL +a3: MOV Ov,rAX +a4: MOVS/B Yb,Xb +a5: MOVS/W/D/Q Yv,Xv +a6: CMPS/B Xb,Yb +a7: CMPS/W/D Xv,Yv +a8: TEST AL,Ib +a9: TEST rAX,Iz +aa: STOS/B Yb,AL +ab: STOS/W/D/Q Yv,rAX +ac: LODS/B AL,Xb +ad: LODS/W/D/Q rAX,Xv +ae: SCAS/B AL,Yb +# Note: The May 2011 Intel manual shows Xv for the second parameter of the +# next instruction but Yv is correct +af: SCAS/W/D/Q rAX,Yv +# 0xb0 - 0xbf +b0: MOV AL/R8L,Ib +b1: MOV CL/R9L,Ib +b2: MOV DL/R10L,Ib +b3: MOV BL/R11L,Ib +b4: MOV AH/R12L,Ib +b5: MOV CH/R13L,Ib +b6: MOV DH/R14L,Ib +b7: MOV BH/R15L,Ib +b8: MOV rAX/r8,Iv +b9: MOV rCX/r9,Iv +ba: MOV rDX/r10,Iv +bb: MOV rBX/r11,Iv +bc: MOV rSP/r12,Iv +bd: MOV rBP/r13,Iv +be: MOV rSI/r14,Iv +bf: MOV rDI/r15,Iv +# 0xc0 - 0xcf +c0: Grp2 Eb,Ib (1A) +c1: Grp2 Ev,Ib (1A) +c2: RETN Iw (f64) +c3: RETN +c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) +c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) +c6: Grp11A Eb,Ib (1A) +c7: Grp11B Ev,Iz (1A) +c8: ENTER Iw,Ib +c9: LEAVE (d64) +ca: RETF Iw +cb: RETF +cc: INT3 +cd: INT Ib +ce: INTO (i64) +cf: IRET/D/Q +# 0xd0 - 0xdf +d0: Grp2 Eb,1 (1A) +d1: Grp2 Ev,1 (1A) +d2: Grp2 Eb,CL (1A) +d3: Grp2 Ev,CL (1A) +d4: AAM Ib (i64) +d5: AAD Ib (i64) +d6: +d7: XLAT/XLATB +d8: ESC +d9: ESC +da: ESC +db: ESC +dc: ESC +dd: ESC +de: ESC +df: ESC +# 0xe0 - 0xef +e0: LOOPNE/LOOPNZ Jb (f64) +e1: LOOPE/LOOPZ Jb (f64) +e2: LOOP Jb (f64) +e3: JrCXZ Jb (f64) +e4: IN AL,Ib +e5: IN eAX,Ib +e6: OUT Ib,AL +e7: OUT Ib,eAX +e8: CALL Jz (f64) +e9: JMP-near Jz (f64) +ea: JMP-far Ap (i64) +eb: JMP-short Jb (f64) +ec: IN AL,DX +ed: IN eAX,DX +ee: OUT DX,AL +ef: OUT DX,eAX +# 0xf0 - 0xff +f0: LOCK (Prefix) +f1: +f2: REPNE (Prefix) | XACQUIRE (Prefix) +f3: REP/REPE (Prefix) | XRELEASE (Prefix) +f4: HLT +f5: CMC +f6: Grp3_1 Eb (1A) +f7: Grp3_2 Ev (1A) +f8: CLC +f9: STC +fa: CLI +fb: STI +fc: CLD +fd: STD +fe: Grp4 (1A) +ff: Grp5 (1A) +EndTable + +Table: 2-byte opcode (0x0f) +Referrer: 2-byte escape +AVXcode: 1 +# 0x0f 0x00-0x0f +00: Grp6 (1A) +01: Grp7 (1A) +02: LAR Gv,Ew +03: LSL Gv,Ew +04: +05: SYSCALL (o64) +06: CLTS +07: SYSRET (o64) +08: INVD +09: WBINVD +0a: +0b: UD2 (1B) +0c: +# AMD's prefetch group. Intel supports prefetchw(/1) only. +0d: GrpP +0e: FEMMS +# 3DNow! uses the last imm byte as opcode extension. +0f: 3DNow! Pq,Qq,Ib +# 0x0f 0x10-0x1f +# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands +# but it actually has operands. And also, vmovss and vmovsd only accept 128bit. +# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form. +# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming +# Reference A.1 +10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1) +11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1) +12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2) +13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1) +14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66) +15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66) +16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3) +17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) +18: Grp16 (1A) +19: +1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv +1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv +1c: +1d: +1e: +1f: NOP Ev +# 0x0f 0x20-0x2f +20: MOV Rd,Cd +21: MOV Rd,Dd +22: MOV Cd,Rd +23: MOV Dd,Rd +24: +25: +26: +27: +28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66) +29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66) +2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1) +2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66) +2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1) +2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1) +2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1) +2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1) +# 0x0f 0x30-0x3f +30: WRMSR +31: RDTSC +32: RDMSR +33: RDPMC +34: SYSENTER +35: SYSEXIT +36: +37: GETSEC +38: escape # 3-byte escape 1 +39: +3a: escape # 3-byte escape 2 +3b: +3c: +3d: +3e: +3f: +# 0x0f 0x40-0x4f +40: CMOVO Gv,Ev +41: CMOVNO Gv,Ev +42: CMOVB/C/NAE Gv,Ev +43: CMOVAE/NB/NC Gv,Ev +44: CMOVE/Z Gv,Ev +45: CMOVNE/NZ Gv,Ev +46: CMOVBE/NA Gv,Ev +47: CMOVA/NBE Gv,Ev +48: CMOVS Gv,Ev +49: CMOVNS Gv,Ev +4a: CMOVP/PE Gv,Ev +4b: CMOVNP/PO Gv,Ev +4c: CMOVL/NGE Gv,Ev +4d: CMOVNL/GE Gv,Ev +4e: CMOVLE/NG Gv,Ev +4f: CMOVNLE/G Gv,Ev +# 0x0f 0x50-0x5f +50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66) +51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1) +52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1) +53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1) +54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66) +55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66) +56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66) +57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66) +58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1) +59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1) +5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1) +5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3) +5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1) +5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1) +5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1) +5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1) +# 0x0f 0x60-0x6f +60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1) +61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1) +62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1) +63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1) +64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1) +65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1) +66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1) +67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1) +68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1) +69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1) +6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1) +6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1) +6c: vpunpcklqdq Vx,Hx,Wx (66),(v1) +6d: vpunpckhqdq Vx,Hx,Wx (66),(v1) +6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1) +6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3) +# 0x0f 0x70-0x7f +70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1) +71: Grp12 (1A) +72: Grp13 (1A) +73: Grp14 (1A) +74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1) +75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1) +76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1) +# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX. +77: emms | vzeroupper | vzeroall +78: VMREAD Ey,Gy +79: VMWRITE Gy,Ey +7a: +7b: +7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2) +7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2) +7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) +7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) +# 0x0f 0x80-0x8f +80: JO Jz (f64) +81: JNO Jz (f64) +82: JB/JC/JNAE Jz (f64) +83: JAE/JNB/JNC Jz (f64) +84: JE/JZ Jz (f64) +85: JNE/JNZ Jz (f64) +86: JBE/JNA Jz (f64) +87: JA/JNBE Jz (f64) +88: JS Jz (f64) +89: JNS Jz (f64) +8a: JP/JPE Jz (f64) +8b: JNP/JPO Jz (f64) +8c: JL/JNGE Jz (f64) +8d: JNL/JGE Jz (f64) +8e: JLE/JNG Jz (f64) +8f: JNLE/JG Jz (f64) +# 0x0f 0x90-0x9f +90: SETO Eb +91: SETNO Eb +92: SETB/C/NAE Eb +93: SETAE/NB/NC Eb +94: SETE/Z Eb +95: SETNE/NZ Eb +96: SETBE/NA Eb +97: SETA/NBE Eb +98: SETS Eb +99: SETNS Eb +9a: SETP/PE Eb +9b: SETNP/PO Eb +9c: SETL/NGE Eb +9d: SETNL/GE Eb +9e: SETLE/NG Eb +9f: SETNLE/G Eb +# 0x0f 0xa0-0xaf +a0: PUSH FS (d64) +a1: POP FS (d64) +a2: CPUID +a3: BT Ev,Gv +a4: SHLD Ev,Gv,Ib +a5: SHLD Ev,Gv,CL +a6: GrpPDLK +a7: GrpRNG +a8: PUSH GS (d64) +a9: POP GS (d64) +aa: RSM +ab: BTS Ev,Gv +ac: SHRD Ev,Gv,Ib +ad: SHRD Ev,Gv,CL +ae: Grp15 (1A),(1C) +af: IMUL Gv,Ev +# 0x0f 0xb0-0xbf +b0: CMPXCHG Eb,Gb +b1: CMPXCHG Ev,Gv +b2: LSS Gv,Mp +b3: BTR Ev,Gv +b4: LFS Gv,Mp +b5: LGS Gv,Mp +b6: MOVZX Gv,Eb +b7: MOVZX Gv,Ew +b8: JMPE (!F3) | POPCNT Gv,Ev (F3) +b9: Grp10 (1A) +ba: Grp8 Ev,Ib (1A) +bb: BTC Ev,Gv +bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3) +bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3) +be: MOVSX Gv,Eb +bf: MOVSX Gv,Ew +# 0x0f 0xc0-0xcf +c0: XADD Eb,Gb +c1: XADD Ev,Gv +c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1) +c3: movnti My,Gy +c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1) +c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1) +c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66) +c7: Grp9 (1A) +c8: BSWAP RAX/EAX/R8/R8D +c9: BSWAP RCX/ECX/R9/R9D +ca: BSWAP RDX/EDX/R10/R10D +cb: BSWAP RBX/EBX/R11/R11D +cc: BSWAP RSP/ESP/R12/R12D +cd: BSWAP RBP/EBP/R13/R13D +ce: BSWAP RSI/ESI/R14/R14D +cf: BSWAP RDI/EDI/R15/R15D +# 0x0f 0xd0-0xdf +d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2) +d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1) +d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1) +d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1) +d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1) +d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1) +d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1) +d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1) +d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1) +da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1) +db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1) +dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1) +dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1) +de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1) +df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1) +# 0x0f 0xe0-0xef +e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1) +e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1) +e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1) +e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1) +e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1) +e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1) +e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2) +e7: movntq Mq,Pq | vmovntdq Mx,Vx (66) +e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1) +e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1) +ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1) +eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1) +ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1) +ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1) +ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1) +ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1) +# 0x0f 0xf0-0xff +f0: vlddqu Vx,Mx (F2) +f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1) +f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1) +f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1) +f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1) +f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1) +f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1) +f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1) +f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1) +f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1) +fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1) +fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) +fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) +fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) +fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +ff: +EndTable + +Table: 3-byte opcode 1 (0x0f 0x38) +Referrer: 3-byte escape 1 +AVXcode: 2 +# 0x0f 0x38 0x00-0x0f +00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1) +01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1) +02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1) +03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1) +04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1) +05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1) +06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1) +07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1) +08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1) +09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1) +0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1) +0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1) +0c: vpermilps Vx,Hx,Wx (66),(v) +0d: vpermilpd Vx,Hx,Wx (66),(v) +0e: vtestps Vx,Wx (66),(v) +0f: vtestpd Vx,Wx (66),(v) +# 0x0f 0x38 0x10-0x1f +10: pblendvb Vdq,Wdq (66) +11: +12: +13: vcvtph2ps Vx,Wx,Ib (66),(v) +14: blendvps Vdq,Wdq (66) +15: blendvpd Vdq,Wdq (66) +16: vpermps Vqq,Hqq,Wqq (66),(v) +17: vptest Vx,Wx (66) +18: vbroadcastss Vx,Wd (66),(v) +19: vbroadcastsd Vqq,Wq (66),(v) +1a: vbroadcastf128 Vqq,Mdq (66),(v) +1b: +1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1) +1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1) +1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1) +1f: +# 0x0f 0x38 0x20-0x2f +20: vpmovsxbw Vx,Ux/Mq (66),(v1) +21: vpmovsxbd Vx,Ux/Md (66),(v1) +22: vpmovsxbq Vx,Ux/Mw (66),(v1) +23: vpmovsxwd Vx,Ux/Mq (66),(v1) +24: vpmovsxwq Vx,Ux/Md (66),(v1) +25: vpmovsxdq Vx,Ux/Mq (66),(v1) +26: +27: +28: vpmuldq Vx,Hx,Wx (66),(v1) +29: vpcmpeqq Vx,Hx,Wx (66),(v1) +2a: vmovntdqa Vx,Mx (66),(v1) +2b: vpackusdw Vx,Hx,Wx (66),(v1) +2c: vmaskmovps Vx,Hx,Mx (66),(v) +2d: vmaskmovpd Vx,Hx,Mx (66),(v) +2e: vmaskmovps Mx,Hx,Vx (66),(v) +2f: vmaskmovpd Mx,Hx,Vx (66),(v) +# 0x0f 0x38 0x30-0x3f +30: vpmovzxbw Vx,Ux/Mq (66),(v1) +31: vpmovzxbd Vx,Ux/Md (66),(v1) +32: vpmovzxbq Vx,Ux/Mw (66),(v1) +33: vpmovzxwd Vx,Ux/Mq (66),(v1) +34: vpmovzxwq Vx,Ux/Md (66),(v1) +35: vpmovzxdq Vx,Ux/Mq (66),(v1) +36: vpermd Vqq,Hqq,Wqq (66),(v) +37: vpcmpgtq Vx,Hx,Wx (66),(v1) +38: vpminsb Vx,Hx,Wx (66),(v1) +39: vpminsd Vx,Hx,Wx (66),(v1) +3a: vpminuw Vx,Hx,Wx (66),(v1) +3b: vpminud Vx,Hx,Wx (66),(v1) +3c: vpmaxsb Vx,Hx,Wx (66),(v1) +3d: vpmaxsd Vx,Hx,Wx (66),(v1) +3e: vpmaxuw Vx,Hx,Wx (66),(v1) +3f: vpmaxud Vx,Hx,Wx (66),(v1) +# 0x0f 0x38 0x40-0x8f +40: vpmulld Vx,Hx,Wx (66),(v1) +41: vphminposuw Vdq,Wdq (66),(v1) +42: +43: +44: +45: vpsrlvd/q Vx,Hx,Wx (66),(v) +46: vpsravd Vx,Hx,Wx (66),(v) +47: vpsllvd/q Vx,Hx,Wx (66),(v) +# Skip 0x48-0x57 +58: vpbroadcastd Vx,Wx (66),(v) +59: vpbroadcastq Vx,Wx (66),(v) +5a: vbroadcasti128 Vqq,Mdq (66),(v) +# Skip 0x5b-0x77 +78: vpbroadcastb Vx,Wx (66),(v) +79: vpbroadcastw Vx,Wx (66),(v) +# Skip 0x7a-0x7f +80: INVEPT Gy,Mdq (66) +81: INVPID Gy,Mdq (66) +82: INVPCID Gy,Mdq (66) +8c: vpmaskmovd/q Vx,Hx,Mx (66),(v) +8e: vpmaskmovd/q Mx,Vx,Hx (66),(v) +# 0x0f 0x38 0x90-0xbf (FMA) +90: vgatherdd/q Vx,Hx,Wx (66),(v) +91: vgatherqd/q Vx,Hx,Wx (66),(v) +92: vgatherdps/d Vx,Hx,Wx (66),(v) +93: vgatherqps/d Vx,Hx,Wx (66),(v) +94: +95: +96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v) +97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v) +98: vfmadd132ps/d Vx,Hx,Wx (66),(v) +99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1) +9a: vfmsub132ps/d Vx,Hx,Wx (66),(v) +9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v) +9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1) +9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v) +9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v) +a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v) +a8: vfmadd213ps/d Vx,Hx,Wx (66),(v) +a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1) +aa: vfmsub213ps/d Vx,Hx,Wx (66),(v) +ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) +ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) +ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) +af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) +b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) +b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) +b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1) +ba: vfmsub231ps/d Vx,Hx,Wx (66),(v) +bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1) +bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v) +bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) +be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) +bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) +# 0x0f 0x38 0xc0-0xff +db: VAESIMC Vdq,Wdq (66),(v1) +dc: VAESENC Vdq,Hdq,Wdq (66),(v1) +dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) +de: VAESDEC Vdq,Hdq,Wdq (66),(v1) +df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) +f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) +f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) +f2: ANDN Gy,By,Ey (v) +f3: Grp17 (1A) +f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) +f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) +f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) +EndTable + +Table: 3-byte opcode 2 (0x0f 0x3a) +Referrer: 3-byte escape 2 +AVXcode: 3 +# 0x0f 0x3a 0x00-0xff +00: vpermq Vqq,Wqq,Ib (66),(v) +01: vpermpd Vqq,Wqq,Ib (66),(v) +02: vpblendd Vx,Hx,Wx,Ib (66),(v) +03: +04: vpermilps Vx,Wx,Ib (66),(v) +05: vpermilpd Vx,Wx,Ib (66),(v) +06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v) +07: +08: vroundps Vx,Wx,Ib (66) +09: vroundpd Vx,Wx,Ib (66) +0a: vroundss Vss,Wss,Ib (66),(v1) +0b: vroundsd Vsd,Wsd,Ib (66),(v1) +0c: vblendps Vx,Hx,Wx,Ib (66) +0d: vblendpd Vx,Hx,Wx,Ib (66) +0e: vpblendw Vx,Hx,Wx,Ib (66),(v1) +0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1) +14: vpextrb Rd/Mb,Vdq,Ib (66),(v1) +15: vpextrw Rd/Mw,Vdq,Ib (66),(v1) +16: vpextrd/q Ey,Vdq,Ib (66),(v1) +17: vextractps Ed,Vdq,Ib (66),(v1) +18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v) +19: vextractf128 Wdq,Vqq,Ib (66),(v) +1d: vcvtps2ph Wx,Vx,Ib (66),(v) +20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1) +21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1) +22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1) +38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v) +39: vextracti128 Wdq,Vqq,Ib (66),(v) +40: vdpps Vx,Hx,Wx,Ib (66) +41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1) +42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1) +44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1) +46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v) +4a: vblendvps Vx,Hx,Wx,Lx (66),(v) +4b: vblendvpd Vx,Hx,Wx,Lx (66),(v) +4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1) +60: vpcmpestrm Vdq,Wdq,Ib (66),(v1) +61: vpcmpestri Vdq,Wdq,Ib (66),(v1) +62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) +63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) +f0: RORX Gy,Ey,Ib (F2),(v) +EndTable + +GrpTable: Grp1 +0: ADD +1: OR +2: ADC +3: SBB +4: AND +5: SUB +6: XOR +7: CMP +EndTable + +GrpTable: Grp1A +0: POP +EndTable + +GrpTable: Grp2 +0: ROL +1: ROR +2: RCL +3: RCR +4: SHL/SAL +5: SHR +6: +7: SAR +EndTable + +GrpTable: Grp3_1 +0: TEST Eb,Ib +1: +2: NOT Eb +3: NEG Eb +4: MUL AL,Eb +5: IMUL AL,Eb +6: DIV AL,Eb +7: IDIV AL,Eb +EndTable + +GrpTable: Grp3_2 +0: TEST Ev,Iz +1: +2: NOT Ev +3: NEG Ev +4: MUL rAX,Ev +5: IMUL rAX,Ev +6: DIV rAX,Ev +7: IDIV rAX,Ev +EndTable + +GrpTable: Grp4 +0: INC Eb +1: DEC Eb +EndTable + +GrpTable: Grp5 +0: INC Ev +1: DEC Ev +2: CALLN Ev (f64) +3: CALLF Ep +4: JMPN Ev (f64) +5: JMPF Mp +6: PUSH Ev (d64) +7: +EndTable + +GrpTable: Grp6 +0: SLDT Rv/Mw +1: STR Rv/Mw +2: LLDT Ew +3: LTR Ew +4: VERR Ew +5: VERW Ew +EndTable + +GrpTable: Grp7 +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) +3: LIDT Ms +4: SMSW Mw/Rv +5: +6: LMSW Ew +7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) +EndTable + +GrpTable: Grp8 +4: BT +5: BTS +6: BTR +7: BTC +EndTable + +GrpTable: Grp9 +1: CMPXCHG8B/16B Mq/Mdq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) +7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) +EndTable + +GrpTable: Grp10 +EndTable + +# Grp11A and Grp11B are expressed as Grp11 in Intel SDM +GrpTable: Grp11A +0: MOV Eb,Ib +7: XABORT Ib (000),(11B) +EndTable + +GrpTable: Grp11B +0: MOV Eb,Iz +7: XBEGIN Jz (000),(11B) +EndTable + +GrpTable: Grp12 +2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1) +4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1) +6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1) +EndTable + +GrpTable: Grp13 +2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1) +4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1) +6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1) +EndTable + +GrpTable: Grp14 +2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1) +3: vpsrldq Hx,Ux,Ib (66),(11B),(v1) +6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1) +7: vpslldq Hx,Ux,Ib (66),(11B),(v1) +EndTable + +GrpTable: Grp15 +0: fxsave | RDFSBASE Ry (F3),(11B) +1: fxstor | RDGSBASE Ry (F3),(11B) +2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B) +3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) +4: XSAVE +5: XRSTOR | lfence (11B) +6: XSAVEOPT | mfence (11B) +7: clflush | sfence (11B) +EndTable + +GrpTable: Grp16 +0: prefetch NTA +1: prefetch T0 +2: prefetch T1 +3: prefetch T2 +EndTable + +GrpTable: Grp17 +1: BLSR By,Ey (v) +2: BLSMSK By,Ey (v) +3: BLSI By,Ey (v) +EndTable + +# AMD's Prefetch Group +GrpTable: GrpP +0: PREFETCH +1: PREFETCHW +EndTable + +GrpTable: GrpPDLK +0: MONTMUL +1: XSHA1 +2: XSHA2 +EndTable + +GrpTable: GrpRNG +0: xstore-rng +1: xcrypt-ecb +2: xcrypt-cbc +4: xcrypt-cfb +5: xcrypt-ofb +EndTable |
