1 files changed, 141 insertions, 78 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f82e884928a..56313a32618 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.
@@ -26,9 +27,8 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemcpy_c:
 	movq %rdi, %rax
-
-	movl %edx, %ecx
-	shrl $3, %ecx
+	movq %rdx, %rcx
+	shrq $3, %rcx
 	andl $7, %edx
 	rep movsq
 	movl %edx, %ecx
@@ -37,107 +37,170 @@
 .Lmemcpy_e:
 	.previous
 
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+	movq %rdx, %rcx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
+
+	cmpq $0x20, %rdx
+	jb .Lhandle_tail
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * We check whether memory false dependence could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl   $6, %ecx
-	jz .Lhandle_tail
+	cmp  %dil, %sil
+	jl .Lcopy_backward
+	subq $0x20, %rdx
+.Lcopy_forward_loop:
+	subq $0x20,	%rdx
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * Move in blocks of 4x8 bytes:
 	 */
-	decl %ecx
-
+	movq 0*8(%rsi),	%r8
+	movq 1*8(%rsi),	%r9
+	movq 2*8(%rsi),	%r10
+	movq 3*8(%rsi),	%r11
+	leaq 4*8(%rsi),	%rsi
+
+	movq %r8,	0*8(%rdi)
+	movq %r9,	1*8(%rdi)
+	movq %r10,	2*8(%rdi)
+	movq %r11,	3*8(%rdi)
+	leaq 4*8(%rdi),	%rdi
+	jae  .Lcopy_forward_loop
+	addl $0x20,	%edx
+	jmp  .Lhandle_tail
+
+.Lcopy_backward:
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Calculate copy position to tail.
 	 */
-	movq 0*8(%rsi),		%r11
-	movq 1*8(%rsi),		%r8
-	movq %r11,		0*8(%rdi)
-	movq %r8,		1*8(%rdi)
-
-	movq 2*8(%rsi),		%r9
-	movq 3*8(%rsi),		%r10
-	movq %r9,		2*8(%rdi)
-	movq %r10,		3*8(%rdi)
-
-	movq 4*8(%rsi),		%r11
-	movq 5*8(%rsi),		%r8
-	movq %r11,		4*8(%rdi)
-	movq %r8,		5*8(%rdi)
-
-	movq 6*8(%rsi),		%r9
-	movq 7*8(%rsi),		%r10
-	movq %r9,		6*8(%rdi)
-	movq %r10,		7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz  .Lloop_64
+	addq %rdx,	%rsi
+	addq %rdx,	%rdi
+	subq $0x20,	%rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16 bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20,	%rdx
+	movq -1*8(%rsi),	%r8
+	movq -2*8(%rsi),	%r9
+	movq -3*8(%rsi),	%r10
+	movq -4*8(%rsi),	%r11
+	leaq -4*8(%rsi),	%rsi
+	movq %r8,		-1*8(%rdi)
+	movq %r9,		-2*8(%rdi)
+	movq %r10,		-3*8(%rdi)
+	movq %r11,		-4*8(%rdi)
+	leaq -4*8(%rdi),	%rdi
+	jae  .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addl $0x20,	%edx
+	subq %rdx,	%rsi
+	subq %rdx,	%rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl  $63, %ecx
-	shrl   $3, %ecx
-	jz   .Lhandle_7
+	cmpl $16,	%edx
+	jb   .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi),	%r9
+	movq -2*8(%rsi, %rdx),	%r10
+	movq -1*8(%rsi, %rdx),	%r11
+	movq %r8,	0*8(%rdi)
+	movq %r9,	1*8(%rdi)
+	movq %r10,	-2*8(%rdi, %rdx)
+	movq %r11,	-1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi),		%r8
-	movq %r8,		(%rdi)
-	leaq 8(%rdi),		%rdi
-	leaq 8(%rsi),		%rsi
-	jnz  .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpl $8,	%edx
+	jb   .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi),	%r8
+	movq -1*8(%rsi, %rdx),	%r9
+	movq %r8,	0*8(%rdi)
+	movq %r9,	-1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpl $4,	%edx
+	jb   .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_1:
-	movb (%rsi), %r8b
-	movb %r8b, (%rdi)
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
+.Lless_3bytes:
+	subl $1, %edx
+	jb .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
+	movzbl (%rsi), %ecx
+	jz .Lstore_1byte
+	movzbq 1(%rsi), %r8
+	movzbq (%rsi, %rdx), %r9
+	movb %r8b, 1(%rdi)
+	movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+	movb %cl, (%rdi)
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 	/*
-	 * Some CPUs run faster using the string copy instructions.
-	 * It is also a lot simpler. Use this when possible:
-	 */
-
-	.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad .Lmemcpy_c
-	.byte X86_FEATURE_REP_GOOD
-
-	/*
+	 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+	 * If the feature is supported, memcpy_c_e() is the first choice.
+	 * If enhanced rep movsb copy is not available, use fast string copy
+	 * memcpy_c() when possible. This is faster and code is simpler than
+	 * original memcpy().
+	 * Otherwise, original memcpy() is used.
+	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
+         * feature to implement the right patch order.
+	 *
 	 * Replace only beginning, memcpy is used to apply alternatives,
 	 * so it is silly to overwrite itself with nops - reboot is the
 	 * only outcome...
 	 */
-	.byte .Lmemcpy_e - .Lmemcpy_c
-	.byte .Lmemcpy_e - .Lmemcpy_c
+	.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous