diff options
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
| -rw-r--r-- | arch/x86/lib/memcpy_64.S | 283 |
1 files changed, 179 insertions, 104 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index c22981fa2f3..56313a32618 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,131 +1,206 @@ /* Copyright 2002 Andi Kleen */ #include <linux/linkage.h> -#include <asm/dwarf2.h> + #include <asm/cpufeature.h> +#include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * memcpy - Copy a memory block. * - * Input: - * rdi destination - * rsi source - * rdx count - * + * Input: + * rdi destination + * rsi source + * rdx count + * * Output: * rax original destination - */ + */ - ALIGN -memcpy_c: - CFI_STARTPROC - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx +/* + * memcpy_c() - fast string ops (REP MOVSQ) based variant. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c: + movq %rdi, %rax + movq %rdx, %rcx + shrq $3, %rcx + andl $7, %edx rep movsq - movl %edx,%ecx + movl %edx, %ecx rep movsb ret - CFI_ENDPROC -ENDPROC(memcpy_c) +.Lmemcpy_e: + .previous + +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c_e: + movq %rdi, %rax + movq %rdx, %rcx + rep movsb + ret +.Lmemcpy_e_e: + .previous ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - movq %rdi,%rax - - movl %edx,%ecx - shrl $6,%ecx - jz .Lhandle_tail - + movq %rdi, %rax + + cmpq $0x20, %rdx + jb .Lhandle_tail + + /* + * We check whether memory false dependence could occur, + * then jump to corresponding copy mode. + */ + cmp %dil, %sil + jl .Lcopy_backward + subq $0x20, %rdx +.Lcopy_forward_loop: + subq $0x20, %rdx + + /* + * Move in blocks of 4x8 bytes: + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq 2*8(%rsi), %r10 + movq 3*8(%rsi), %r11 + leaq 4*8(%rsi), %rsi + + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae .Lcopy_forward_loop + addl $0x20, %edx + jmp .Lhandle_tail + +.Lcopy_backward: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * At most 3 ALU operations in one cycle, + * so append NOPS in the same 16 bytes trunk. + */ .p2align 4 -.Lloop_64: - decl %ecx - - movq (%rsi),%r11 - movq 8(%rsi),%r8 - - movq %r11,(%rdi) - movq %r8,1*8(%rdi) - - movq 2*8(%rsi),%r9 - movq 3*8(%rsi),%r10 - - movq %r9,2*8(%rdi) - movq %r10,3*8(%rdi) - - movq 4*8(%rsi),%r11 - movq 5*8(%rsi),%r8 - - movq %r11,4*8(%rdi) - movq %r8,5*8(%rdi) - - movq 6*8(%rsi),%r9 - movq 7*8(%rsi),%r10 - - movq %r9,6*8(%rdi) - movq %r10,7*8(%rdi) - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - jnz .Lloop_64 - +.Lcopy_backward_loop: + subq $0x20, %rdx + movq -1*8(%rsi), %r8 + movq -2*8(%rsi), %r9 + movq -3*8(%rsi), %r10 + movq -4*8(%rsi), %r11 + leaq -4*8(%rsi), %rsi + movq %r8, -1*8(%rdi) + movq %r9, -2*8(%rdi) + movq %r10, -3*8(%rdi) + movq %r11, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae .Lcopy_backward_loop + + /* + * Calculate copy position to head. + */ + addl $0x20, %edx + subq %rdx, %rsi + subq %rdx, %rdi .Lhandle_tail: - movl %edx,%ecx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 + cmpl $16, %edx + jb .Lless_16bytes + + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq -2*8(%rsi, %rdx), %r10 + movq -1*8(%rsi, %rdx), %r11 + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, -2*8(%rdi, %rdx) + movq %r11, -1*8(%rdi, %rdx) + retq .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi),%r8 - movq %r8,(%rdi) - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende +.Lless_16bytes: + cmpl $8, %edx + jb .Lless_8bytes + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r8 + movq -1*8(%rsi, %rdx), %r9 + movq %r8, 0*8(%rdi) + movq %r9, -1*8(%rdi, %rdx) + retq .p2align 4 -.Lloop_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - -.Lende: - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret -.Lfinal: +.Lless_8bytes: + cmpl $4, %edx + jb .Lless_3bytes + + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %ecx + movl -4(%rsi, %rdx), %r8d + movl %ecx, (%rdi) + movl %r8d, -4(%rdi, %rdx) + retq + .p2align 4 +.Lless_3bytes: + subl $1, %edx + jb .Lend + /* + * Move data from 1 bytes to 3 bytes. + */ + movzbl (%rsi), %ecx + jz .Lstore_1byte + movzbq 1(%rsi), %r8 + movzbq (%rsi, %rdx), %r9 + movb %r8b, 1(%rdi) + movb %r9b, (%rdi, %rdx) +.Lstore_1byte: + movb %cl, (%rdi) + +.Lend: + retq CFI_ENDPROC ENDPROC(memcpy) ENDPROC(__memcpy) - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad memcpy - .quad 1b - .byte X86_FEATURE_REP_GOOD - /* Replace only beginning, memcpy is used to apply alternatives, so it - * is silly to overwrite itself with nops - reboot is only outcome... */ - .byte 2b - 1b - .byte 2b - 1b + /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + * + * Replace only beginning, memcpy is used to apply alternatives, + * so it is silly to overwrite itself with nops - reboot is the + * only outcome... + */ + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous |
