diff options
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
| -rw-r--r-- | arch/x86/lib/memcpy_64.S | 219 |
1 files changed, 141 insertions, 78 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index f82e884928a..56313a32618 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,6 +4,7 @@ #include <asm/cpufeature.h> #include <asm/dwarf2.h> +#include <asm/alternative-asm.h> /* * memcpy - Copy a memory block. @@ -26,9 +27,8 @@ .section .altinstr_replacement, "ax", @progbits .Lmemcpy_c: movq %rdi, %rax - - movl %edx, %ecx - shrl $3, %ecx + movq %rdx, %rcx + shrq $3, %rcx andl $7, %edx rep movsq movl %edx, %ecx @@ -37,107 +37,170 @@ .Lmemcpy_e: .previous +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c_e: + movq %rdi, %rax + movq %rdx, %rcx + rep movsb + ret +.Lmemcpy_e_e: + .previous + ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC + movq %rdi, %rax + + cmpq $0x20, %rdx + jb .Lhandle_tail /* - * Put the number of full 64-byte blocks into %ecx. - * Tail portion is handled at the end: + * We check whether memory false dependence could occur, + * then jump to corresponding copy mode. */ - movq %rdi, %rax - movl %edx, %ecx - shrl $6, %ecx - jz .Lhandle_tail + cmp %dil, %sil + jl .Lcopy_backward + subq $0x20, %rdx +.Lcopy_forward_loop: + subq $0x20, %rdx - .p2align 4 -.Lloop_64: /* - * We decrement the loop index here - and the zero-flag is - * checked at the end of the loop (instructions inbetween do - * not change the zero flag): + * Move in blocks of 4x8 bytes: */ - decl %ecx - + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq 2*8(%rsi), %r10 + movq 3*8(%rsi), %r11 + leaq 4*8(%rsi), %rsi + + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae .Lcopy_forward_loop + addl $0x20, %edx + jmp .Lhandle_tail + +.Lcopy_backward: /* - * Move in blocks of 4x16 bytes: + * Calculate copy position to tail. */ - movq 0*8(%rsi), %r11 - movq 1*8(%rsi), %r8 - movq %r11, 0*8(%rdi) - movq %r8, 1*8(%rdi) - - movq 2*8(%rsi), %r9 - movq 3*8(%rsi), %r10 - movq %r9, 2*8(%rdi) - movq %r10, 3*8(%rdi) - - movq 4*8(%rsi), %r11 - movq 5*8(%rsi), %r8 - movq %r11, 4*8(%rdi) - movq %r8, 5*8(%rdi) - - movq 6*8(%rsi), %r9 - movq 7*8(%rsi), %r10 - movq %r9, 6*8(%rdi) - movq %r10, 7*8(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jnz .Lloop_64 + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * At most 3 ALU operations in one cycle, + * so append NOPS in the same 16 bytes trunk. + */ + .p2align 4 +.Lcopy_backward_loop: + subq $0x20, %rdx + movq -1*8(%rsi), %r8 + movq -2*8(%rsi), %r9 + movq -3*8(%rsi), %r10 + movq -4*8(%rsi), %r11 + leaq -4*8(%rsi), %rsi + movq %r8, -1*8(%rdi) + movq %r9, -2*8(%rdi) + movq %r10, -3*8(%rdi) + movq %r11, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae .Lcopy_backward_loop + /* + * Calculate copy position to head. + */ + addl $0x20, %edx + subq %rdx, %rsi + subq %rdx, %rdi .Lhandle_tail: - movl %edx, %ecx - andl $63, %ecx - shrl $3, %ecx - jz .Lhandle_7 + cmpl $16, %edx + jb .Lless_16bytes + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq -2*8(%rsi, %rdx), %r10 + movq -1*8(%rsi, %rdx), %r11 + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, -2*8(%rdi, %rdx) + movq %r11, -1*8(%rdi, %rdx) + retq .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi), %r8 - movq %r8, (%rdi) - leaq 8(%rdi), %rdi - leaq 8(%rsi), %rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx, %ecx - andl $7, %ecx - jz .Lend +.Lless_16bytes: + cmpl $8, %edx + jb .Lless_8bytes + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r8 + movq -1*8(%rsi, %rdx), %r9 + movq %r8, 0*8(%rdi) + movq %r9, -1*8(%rdi, %rdx) + retq + .p2align 4 +.Lless_8bytes: + cmpl $4, %edx + jb .Lless_3bytes + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %ecx + movl -4(%rsi, %rdx), %r8d + movl %ecx, (%rdi) + movl %r8d, -4(%rdi, %rdx) + retq .p2align 4 -.Lloop_1: - movb (%rsi), %r8b - movb %r8b, (%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 +.Lless_3bytes: + subl $1, %edx + jb .Lend + /* + * Move data from 1 bytes to 3 bytes. + */ + movzbl (%rsi), %ecx + jz .Lstore_1byte + movzbq 1(%rsi), %r8 + movzbq (%rsi, %rdx), %r9 + movb %r8b, 1(%rdi) + movb %r9b, (%rdi, %rdx) +.Lstore_1byte: + movb %cl, (%rdi) .Lend: - ret + retq CFI_ENDPROC ENDPROC(memcpy) ENDPROC(__memcpy) /* - * Some CPUs run faster using the string copy instructions. - * It is also a lot simpler. Use this when possible: - */ - - .section .altinstructions, "a" - .align 8 - .quad memcpy - .quad .Lmemcpy_c - .byte X86_FEATURE_REP_GOOD - - /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + * * Replace only beginning, memcpy is used to apply alternatives, * so it is silly to overwrite itself with nops - reboot is the * only outcome... */ - .byte .Lmemcpy_e - .Lmemcpy_c - .byte .Lmemcpy_e - .Lmemcpy_c + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous |
