diff options
Diffstat (limited to 'arch/x86/lib/copy_user_64.S')
| -rw-r--r-- | arch/x86/lib/copy_user_64.S | 490 |
1 files changed, 215 insertions, 275 deletions
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 70bebd31040..dee945d5559 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -1,8 +1,10 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. +/* + * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com> + * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ + * + * Functions to copy from and to user space. + */ #include <linux/linkage.h> #include <asm/dwarf2.h> @@ -13,67 +15,94 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/cpufeature.h> +#include <asm/alternative-asm.h> +#include <asm/asm.h> +#include <asm/smap.h> - .macro ALTERNATIVE_JUMP feature,orig,alt +/* + * By placing feature2 after feature1 in altinstructions section, we logically + * implement: + * If CPU has feature2, jmp to alt2 is used + * else if CPU has feature1, jmp to alt1 is used + * else jmp to orig is used. + */ + .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: .byte 0xe9 /* 32bit jump */ .long \orig-1f /* by default jump to orig */ 1: .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt1-1b /* offset */ /* or alternatively to alt1 */ +3: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt2-1b /* offset */ /* or alternatively to alt2 */ .previous + .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .byte \feature /* when feature is set */ - .byte 5 - .byte 5 + altinstruction_entry 0b,2b,\feature1,5,5 + altinstruction_entry 0b,3b,\feature2,5,5 .previous .endm -/* Standard copy_to_user with segment limit checking */ -ENTRY(copy_to_user) + .macro ALIGN_DESTINATION +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) +#endif + .endm + +/* Standard copy_to_user with segment limit checking */ +ENTRY(_copy_to_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx - jc bad_to_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_to_user - xorl %eax,%eax /* clear zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + jc bad_to_user + cmpq TI_addr_limit(%rax),%rcx + ja bad_to_user + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC +ENDPROC(_copy_to_user) -ENTRY(copy_user_generic) - CFI_STARTPROC - movl $1,%ecx /* set zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -ENTRY(__copy_from_user_inatomic) - CFI_STARTPROC - xorl %ecx,%ecx /* clear zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -/* Standard copy_from_user with segment limit checking */ -ENTRY(copy_from_user) +/* Standard copy_from_user with segment limit checking */ +ENTRY(_copy_from_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx - jc bad_from_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_from_user - movl $1,%ecx /* set zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + jc bad_from_user + cmpq TI_addr_limit(%rax),%rcx + ja bad_from_user + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC -ENDPROC(copy_from_user) - +ENDPROC(_copy_from_user) + .section .fixup,"ax" /* must zero dest */ +ENTRY(bad_from_user) bad_from_user: CFI_STARTPROC movl %edx,%ecx @@ -81,274 +110,185 @@ bad_from_user: rep stosb bad_to_user: - movl %edx,%eax + movl %edx,%eax ret CFI_ENDPROC -END(bad_from_user) +ENDPROC(bad_from_user) .previous - - + /* * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro code for rep movsq - * - * Input: + * This version is for CPUs like P4 that don't have efficient micro + * code for rep movsq + * + * Input: * rdi destination * rsi source * rdx count - * ecx zero flag -- if true zero destination on error * - * Output: + * Output: * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_unrolled) CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - xorl %eax,%eax /*zero for the exception handler */ - -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movq %r11,(%rdi) -.Ld2: movq %r8,1*8(%rdi) -.Ld3: movq %r9,2*8(%rdi) -.Ld4: movq %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movq %r11,4*8(%rdi) -.Ld6: movq %r8,5*8(%rdi) -.Ld7: movq %r9,6*8(%rdi) -.Ld8: movq %r10,7*8(%rdi) - - decq %rdx - + ASM_STAC + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movq %r8,(%rdi) +6: movq %r9,1*8(%rdi) +7: movq %r10,2*8(%rdi) +8: movq %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movq %r8,4*8(%rdi) +14: movq %r9,5*8(%rdi) +15: movq %r10,6*8(%rdi) +16: movq %r11,7*8(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movq %r8,(%rdi) decl %ecx - leaq 8(%rdi),%rdi + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 +19: movq %r8,(%rdi) leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi - incq %rsi + leaq 8(%rdi),%rdi decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret - CFI_RESTORE_STATE - -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) + jnz 18b +20: andl %edx,%edx + jz 23f + movl %edx,%ecx +21: movb (%rsi),%al +22: movb %al,(%rdi) incq %rsi incq %rdi decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif + jnz 21b +23: xor %eax,%eax + ASM_CLAC + ret - /* table sorted by exception address */ - .section __ex_table,"a" - .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero + .section .fixup,"ax" +30: shll $6,%ecx + addl %ecx,%edx + jmp 60f +40: leal (%rdx,%rcx,8),%edx + jmp 60f +50: movl %ecx,%edx +60: jmp copy_user_handle_tail /* ecx is zerorest also */ .previous - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende + _ASM_EXTABLE(1b,30b) + _ASM_EXTABLE(2b,30b) + _ASM_EXTABLE(3b,30b) + _ASM_EXTABLE(4b,30b) + _ASM_EXTABLE(5b,30b) + _ASM_EXTABLE(6b,30b) + _ASM_EXTABLE(7b,30b) + _ASM_EXTABLE(8b,30b) + _ASM_EXTABLE(9b,30b) + _ASM_EXTABLE(10b,30b) + _ASM_EXTABLE(11b,30b) + _ASM_EXTABLE(12b,30b) + _ASM_EXTABLE(13b,30b) + _ASM_EXTABLE(14b,30b) + _ASM_EXTABLE(15b,30b) + _ASM_EXTABLE(16b,30b) + _ASM_EXTABLE(18b,40b) + _ASM_EXTABLE(19b,40b) + _ASM_EXTABLE(21b,50b) + _ASM_EXTABLE(22b,50b) CFI_ENDPROC -ENDPROC(copy_user_generic) - +ENDPROC(copy_user_generic_unrolled) - /* Some CPUs run faster using the string copy instructions. - This is also a lot simpler. Use them when possible. - Patch in jmps to this code instead of copying it fully - to avoid unwanted aliasing in the exception tables. */ - - /* rdi destination - * rsi source - * rdx count - * ecx zero flag - * - * Output: - * eax uncopied bytes or 0 if successfull. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. - */ +/* Some CPUs run faster using the string copy instructions. + * This is also a lot simpler. Use them when possible. + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ ENTRY(copy_user_generic_string) CFI_STARTPROC - movl %ecx,%r8d /* save zero flag */ + ASM_STAC + cmpl $8,%edx + jb 2f /* less than 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION movl %edx,%ecx shrl $3,%ecx - andl $7,%edx - jz 10f -1: rep - movsq - movl %edx,%ecx -2: rep + andl $7,%edx +1: rep + movsq +2: movl %edx,%ecx +3: rep movsb -9: movl %ecx,%eax + xorl %eax,%eax + ASM_CLAC ret - /* multiple of 8 byte */ -10: rep - movsq - xor %eax,%eax - ret + .section .fixup,"ax" +11: leal (%rdx,%rcx,8),%ecx +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous - /* exception handling */ -3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ - jmp 6f -5: movl %ecx,%eax /* exception on byte loop */ - /* eax: left over bytes */ -6: testl %r8d,%r8d /* zero flag set? */ - jz 7f - movl %eax,%ecx /* initialize x86 loop counter */ - push %rax - xorl %eax,%eax -8: rep - stosb /* zero the rest */ -11: pop %rax -7: ret + _ASM_EXTABLE(1b,11b) + _ASM_EXTABLE(3b,12b) CFI_ENDPROC -END(copy_user_generic_c) +ENDPROC(copy_user_generic_string) + +/* + * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. + * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(copy_user_enhanced_fast_string) + CFI_STARTPROC + ASM_STAC + movl %edx,%ecx +1: rep + movsb + xorl %eax,%eax + ASM_CLAC + ret - .section __ex_table,"a" - .quad 1b,3b - .quad 2b,5b - .quad 8b,11b - .quad 10b,3b + .section .fixup,"ax" +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail .previous + + _ASM_EXTABLE(1b,12b) + CFI_ENDPROC +ENDPROC(copy_user_enhanced_fast_string) |
