aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/lib/memcpy_64.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
-rw-r--r--arch/x86/lib/memcpy_64.S219
1 files changed, 141 insertions, 78 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f82e884928a..56313a32618 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* memcpy - Copy a memory block.
@@ -26,9 +27,8 @@
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
movq %rdi, %rax
-
- movl %edx, %ecx
- shrl $3, %ecx
+ movq %rdx, %rcx
+ shrq $3, %rcx
andl $7, %edx
rep movsq
movl %edx, %ecx
@@ -37,107 +37,170 @@
.Lmemcpy_e:
.previous
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ rep movsb
+ ret
+.Lmemcpy_e_e:
+ .previous
+
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
+ movq %rdi, %rax
+
+ cmpq $0x20, %rdx
+ jb .Lhandle_tail
/*
- * Put the number of full 64-byte blocks into %ecx.
- * Tail portion is handled at the end:
+ * We check whether memory false dependence could occur,
+ * then jump to corresponding copy mode.
*/
- movq %rdi, %rax
- movl %edx, %ecx
- shrl $6, %ecx
- jz .Lhandle_tail
+ cmp %dil, %sil
+ jl .Lcopy_backward
+ subq $0x20, %rdx
+.Lcopy_forward_loop:
+ subq $0x20, %rdx
- .p2align 4
-.Lloop_64:
/*
- * We decrement the loop index here - and the zero-flag is
- * checked at the end of the loop (instructions inbetween do
- * not change the zero flag):
+ * Move in blocks of 4x8 bytes:
*/
- decl %ecx
-
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq 2*8(%rsi), %r10
+ movq 3*8(%rsi), %r11
+ leaq 4*8(%rsi), %rsi
+
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, 2*8(%rdi)
+ movq %r11, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae .Lcopy_forward_loop
+ addl $0x20, %edx
+ jmp .Lhandle_tail
+
+.Lcopy_backward:
/*
- * Move in blocks of 4x16 bytes:
+ * Calculate copy position to tail.
*/
- movq 0*8(%rsi), %r11
- movq 1*8(%rsi), %r8
- movq %r11, 0*8(%rdi)
- movq %r8, 1*8(%rdi)
-
- movq 2*8(%rsi), %r9
- movq 3*8(%rsi), %r10
- movq %r9, 2*8(%rdi)
- movq %r10, 3*8(%rdi)
-
- movq 4*8(%rsi), %r11
- movq 5*8(%rsi), %r8
- movq %r11, 4*8(%rdi)
- movq %r8, 5*8(%rdi)
-
- movq 6*8(%rsi), %r9
- movq 7*8(%rsi), %r10
- movq %r9, 6*8(%rdi)
- movq %r10, 7*8(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jnz .Lloop_64
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * At most 3 ALU operations in one cycle,
+ * so append NOPS in the same 16 bytes trunk.
+ */
+ .p2align 4
+.Lcopy_backward_loop:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r8
+ movq -2*8(%rsi), %r9
+ movq -3*8(%rsi), %r10
+ movq -4*8(%rsi), %r11
+ leaq -4*8(%rsi), %rsi
+ movq %r8, -1*8(%rdi)
+ movq %r9, -2*8(%rdi)
+ movq %r10, -3*8(%rdi)
+ movq %r11, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae .Lcopy_backward_loop
+ /*
+ * Calculate copy position to head.
+ */
+ addl $0x20, %edx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
.Lhandle_tail:
- movl %edx, %ecx
- andl $63, %ecx
- shrl $3, %ecx
- jz .Lhandle_7
+ cmpl $16, %edx
+ jb .Lless_16bytes
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq -2*8(%rsi, %rdx), %r10
+ movq -1*8(%rsi, %rdx), %r11
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, -2*8(%rdi, %rdx)
+ movq %r11, -1*8(%rdi, %rdx)
+ retq
.p2align 4
-.Lloop_8:
- decl %ecx
- movq (%rsi), %r8
- movq %r8, (%rdi)
- leaq 8(%rdi), %rdi
- leaq 8(%rsi), %rsi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %edx, %ecx
- andl $7, %ecx
- jz .Lend
+.Lless_16bytes:
+ cmpl $8, %edx
+ jb .Lless_8bytes
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq -1*8(%rsi, %rdx), %r9
+ movq %r8, 0*8(%rdi)
+ movq %r9, -1*8(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_8bytes:
+ cmpl $4, %edx
+ jb .Lless_3bytes
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %ecx
+ movl -4(%rsi, %rdx), %r8d
+ movl %ecx, (%rdi)
+ movl %r8d, -4(%rdi, %rdx)
+ retq
.p2align 4
-.Lloop_1:
- movb (%rsi), %r8b
- movb %r8b, (%rdi)
- incq %rdi
- incq %rsi
- decl %ecx
- jnz .Lloop_1
+.Lless_3bytes:
+ subl $1, %edx
+ jb .Lend
+ /*
+ * Move data from 1 bytes to 3 bytes.
+ */
+ movzbl (%rsi), %ecx
+ jz .Lstore_1byte
+ movzbq 1(%rsi), %r8
+ movzbq (%rsi, %rdx), %r9
+ movb %r8b, 1(%rdi)
+ movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+ movb %cl, (%rdi)
.Lend:
- ret
+ retq
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
/*
- * Some CPUs run faster using the string copy instructions.
- * It is also a lot simpler. Use this when possible:
- */
-
- .section .altinstructions, "a"
- .align 8
- .quad memcpy
- .quad .Lmemcpy_c
- .byte X86_FEATURE_REP_GOOD
-
- /*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use fast string copy
+ * memcpy_c() when possible. This is faster and code is simpler than
+ * original memcpy().
+ * Otherwise, original memcpy() is used.
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ *
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
- .byte .Lmemcpy_e - .Lmemcpy_c
- .byte .Lmemcpy_e - .Lmemcpy_c
+ .section .altinstructions, "a"
+ altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+ altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous