Diffstat (limited to 'arch/powerpc/lib/copyuser_power7.S')
-rw-r--r-- arch/powerpc/lib/copyuser_power7.S | 133
1 file changed, 56 insertions(+), 77 deletions(-)
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index f9ede7c6606..c46c876ac96 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,6 +19,14 @@
*/
#include <asm/ppc_asm.h>
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB) lvsr VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
+#endif
+
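
The new LVS/VPERM macros are needed because vperm indexes the 32-byte concatenation of its two source vectors with big-endian byte numbering, while lvx on a little-endian kernel presents the loaded quadword with its elements in the opposite order; getting the same unaligned-extract behaviour on LE therefore takes lvsr in place of lvsl and the two vperm inputs swapped. A scalar C sketch of the idiom itself, for illustration only (unaligned_load16 is an invented name, not kernel code):

	#include <stdint.h>
	#include <string.h>

	/*
	 * Scalar model of the aligned-load-plus-permute idiom: two
	 * aligned 16-byte loads bracket the unaligned address, and the
	 * permute picks the 16 bytes that span the boundary. A real
	 * implementation would special-case sh == 0, as the assembly
	 * does by taking the aligned path.
	 */
	static void unaligned_load16(const uint8_t *src, uint8_t out[16])
	{
		const uint8_t *base =
			(const uint8_t *)((uintptr_t)src & ~(uintptr_t)15);
		unsigned int sh = (uintptr_t)src & 15; /* misalignment, 0..15 */
		uint8_t lo[16], hi[16];

		memcpy(lo, base, 16);		/* first lvx */
		memcpy(hi, base + 16, 16);	/* second lvx */

		/* what vperm computes with an lvsl (BE) or lvsr+swap (LE)
		 * control vector */
		for (unsigned int i = 0; i < 16; i++)
			out[i] = (sh + i < 16) ? lo[sh + i] : hi[sh + i - 16];
	}
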
.macro err1
100:
.section __ex_table,"a"
@@ -58,7 +66,7 @@
ld r15,STK_REG(R15)(r1)
ld r14,STK_REG(R14)(r1)
.Ldo_err3:
- bl .exit_vmx_usercopy
+ bl exit_vmx_usercopy
ld r0,STACKFRAMESIZE+16(r1)
mtlr r0
b .Lexit
@@ -77,9 +85,9 @@
.Lexit:
addi r1,r1,STACKFRAMESIZE
.Ldo_err1:
- ld r3,48(r1)
- ld r4,56(r1)
- ld r5,64(r1)
+ ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+ ld r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+ ld r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
b __copy_tofrom_user_base
@@ -88,18 +96,18 @@ _GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
cmpldi r5,16
cmpldi cr1,r5,4096
- std r3,48(r1)
- std r4,56(r1)
- std r5,64(r1)
+ std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+ std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+ std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
blt .Lshort_copy
bgt cr1,.Lvmx_copy
#else
cmpldi r5,16
- std r3,48(r1)
- std r4,56(r1)
- std r5,64(r1)
+ std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+ std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+ std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
blt .Lshort_copy
#endif
@@ -287,12 +295,12 @@ err1; stb r0,0(r3)
mflr r0
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
- bl .enter_vmx_usercopy
- cmpwi r3,0
+ bl enter_vmx_usercopy
+ cmpwi cr1,r3,0
ld r0,STACKFRAMESIZE+16(r1)
- ld r3,STACKFRAMESIZE+48(r1)
- ld r4,STACKFRAMESIZE+56(r1)
- ld r5,STACKFRAMESIZE+64(r1)
+ ld r3,STK_REG(R31)(r1)
+ ld r4,STK_REG(R30)(r1)
+ ld r5,STK_REG(R29)(r1)
mtlr r0
/*
@@ -318,46 +326,17 @@ err1; stb r0,0(r3)
.machine push
.machine "power4"
- dcbt r0,r6,0b01000
- dcbt r0,r7,0b01010
- dcbtst r0,r9,0b01000
- dcbtst r0,r10,0b01010
- eieio
- dcbt r0,r8,0b01010 /* GO */
-.machine pop
-
- /*
- * We prefetch both the source and destination using enhanced touch
- * instructions. We use a stream ID of 0 for the load side and
- * 1 for the store side.
- */
- clrrdi r6,r4,7
- clrrdi r9,r3,7
- ori r9,r9,1 /* stream=1 */
-
- srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
- cmpldi cr1,r7,0x3FF
- ble cr1,1f
- li r7,0x3FF
-1: lis r0,0x0E00 /* depth=7 */
- sldi r7,r7,7
- or r7,r7,r0
- ori r10,r7,1 /* stream=1 */
-
- lis r8,0x8000 /* GO=1 */
- clrldi r8,r8,32
-
-.machine push
-.machine "power4"
- dcbt r0,r6,0b01000
- dcbt r0,r7,0b01010
- dcbtst r0,r9,0b01000
- dcbtst r0,r10,0b01010
+ /* setup read stream 0 */
+ dcbt r0,r6,0b01000 /* addr from */
+ dcbt r0,r7,0b01010 /* length and depth from */
+ /* setup write stream 1 */
+ dcbtst r0,r9,0b01000 /* addr to */
+ dcbtst r0,r10,0b01010 /* length and depth to */
eieio
- dcbt r0,r8,0b01010 /* GO */
+ dcbt r0,r8,0b01010 /* all streams GO */
.machine pop
- beq .Lunwind_stack_nonvmx_copy
+ beq cr1,.Lunwind_stack_nonvmx_copy
/*
* If source and destination are not relatively aligned we use a
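
The enhanced-touch sequence in the hunk above programs a read stream (ID 0) on the source and a write stream (ID 1) on the destination, then sets every armed stream going with one final dcbt. A C sketch of how the operand words are composed, with the field positions read straight off the instruction sequence (the struct and function names are invented for illustration):

	#include <stdint.h>

	/*
	 * Sketch of the dcbt/dcbtst enhanced-touch operands:
	 * TH=0b01000 carries the stream address, TH=0b01010 the
	 * control word.
	 */
	struct stream_touch {
		uint64_t addr;	/* 128-byte-aligned address | stream ID */
		uint64_t ctrl;	/* (cacheline count << 7) | depth | ID */
	};

	static struct stream_touch setup_stream(const void *p, uint64_t len,
						uint64_t id)
	{
		uint64_t lines = len >> 7; /* length in 128-byte cachelines */

		if (lines > 0x3FF)	/* cap, as the removed code did */
			lines = 0x3FF;

		return (struct stream_touch){
			.addr = ((uint64_t)(uintptr_t)p & ~(uint64_t)127) | id,
			.ctrl = (lines << 7) | 0x0E000000 | id, /* depth=7 */
		};
	}

	/* the final dcbt with 0x80000000 (GO=1) starts every armed stream */
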
@@ -535,7 +514,7 @@ err3; lbz r0,0(r4)
err3; stb r0,0(r3)
15: addi r1,r1,STACKFRAMESIZE
- b .exit_vmx_usercopy /* tail call optimise */
+ b exit_vmx_usercopy /* tail call optimise */
.Lvmx_unaligned_copy:
/* Get the destination 16B aligned */
@@ -581,13 +560,13 @@ err3; stw r7,4(r3)
li r10,32
li r11,48
- lvsl vr16,0,r4 /* Setup permute control vector */
+ LVS(vr16,0,r4) /* Setup permute control vector */
err3; lvx vr0,0,r4
addi r4,r4,16
bf cr7*4+3,5f
err3; lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
err3; stvx vr8,r0,r3
addi r3,r3,16
@@ -595,9 +574,9 @@ err3; stvx vr8,r0,r3
5: bf cr7*4+2,6f
err3; lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9
- vperm vr9,vr1,vr0,vr16
+ VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
@@ -605,13 +584,13 @@ err3; stvx vr9,r3,r9
6: bf cr7*4+1,7f
err3; lvx vr3,r0,r4
- vperm vr8,vr0,vr3,vr16
+ VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9
- vperm vr9,vr3,vr2,vr16
+ VPERM(vr9,vr3,vr2,vr16)
err3; lvx vr1,r4,r10
- vperm vr10,vr2,vr1,vr16
+ VPERM(vr10,vr2,vr1,vr16)
err3; lvx vr0,r4,r11
- vperm vr11,vr1,vr0,vr16
+ VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
@@ -640,21 +619,21 @@ err3; stvx vr11,r3,r11
.align 5
8:
err4; lvx vr7,r0,r4
- vperm vr8,vr0,vr7,vr16
+ VPERM(vr8,vr0,vr7,vr16)
err4; lvx vr6,r4,r9
- vperm vr9,vr7,vr6,vr16
+ VPERM(vr9,vr7,vr6,vr16)
err4; lvx vr5,r4,r10
- vperm vr10,vr6,vr5,vr16
+ VPERM(vr10,vr6,vr5,vr16)
err4; lvx vr4,r4,r11
- vperm vr11,vr5,vr4,vr16
+ VPERM(vr11,vr5,vr4,vr16)
err4; lvx vr3,r4,r12
- vperm vr12,vr4,vr3,vr16
+ VPERM(vr12,vr4,vr3,vr16)
err4; lvx vr2,r4,r14
- vperm vr13,vr3,vr2,vr16
+ VPERM(vr13,vr3,vr2,vr16)
err4; lvx vr1,r4,r15
- vperm vr14,vr2,vr1,vr16
+ VPERM(vr14,vr2,vr1,vr16)
err4; lvx vr0,r4,r16
- vperm vr15,vr1,vr0,vr16
+ VPERM(vr15,vr1,vr0,vr16)
addi r4,r4,128
err4; stvx vr8,r0,r3
err4; stvx vr9,r3,r9
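
The main loop at label 8 above is software-pipelined: eight aligned lvx loads feed eight VPERMs per iteration, and vr0 carries the tail of the previous load into the next iteration, so every 16-byte store costs exactly one aligned load. A scalar C model of the same carry trick (invented name, illustration only; the assembly reaches this loop only with a nonzero source misalignment and an already 16-byte-aligned destination):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/*
	 * Scalar model of the pipelined copy: one aligned source load
	 * per 16-byte store, with the previous load carried across
	 * iterations the way vr0 is carried in the assembly.
	 */
	static void unaligned_copy16(uint8_t *dst, const uint8_t *src,
				     size_t n16)
	{
		const uint8_t *base =
			(const uint8_t *)((uintptr_t)src & ~(uintptr_t)15);
		unsigned int sh = (uintptr_t)src & 15;
		uint8_t prev[16], next[16];

		memcpy(prev, base, 16);			/* priming load */
		for (size_t b = 0; b < n16; b++) {
			memcpy(next, base + 16 * (b + 1), 16);	/* one lvx */
			for (unsigned int i = 0; i < 16; i++)	/* one vperm */
				dst[16 * b + i] = (sh + i < 16) ?
					prev[sh + i] : next[sh + i - 16];
			memcpy(prev, next, 16);		/* the vr0 carry */
		}
	}
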
@@ -678,13 +657,13 @@ err4; stvx vr15,r3,r16
bf cr7*4+1,9f
err3; lvx vr3,r0,r4
- vperm vr8,vr0,vr3,vr16
+ VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9
- vperm vr9,vr3,vr2,vr16
+ VPERM(vr9,vr3,vr2,vr16)
err3; lvx vr1,r4,r10
- vperm vr10,vr2,vr1,vr16
+ VPERM(vr10,vr2,vr1,vr16)
err3; lvx vr0,r4,r11
- vperm vr11,vr1,vr0,vr16
+ VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
@@ -694,9 +673,9 @@ err3; stvx vr11,r3,r11
9: bf cr7*4+2,10f
err3; lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9
- vperm vr9,vr1,vr0,vr16
+ VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
@@ -704,7 +683,7 @@ err3; stvx vr9,r3,r9
10: bf cr7*4+3,11f
err3; lvx vr1,r0,r4
- vperm vr8,vr0,vr1,vr16
+ VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
err3; stvx vr8,r0,r3
addi r3,r3,16
@@ -738,5 +717,5 @@ err3; lbz r0,0(r4)
err3; stb r0,0(r3)
15: addi r1,r1,STACKFRAMESIZE
- b .exit_vmx_usercopy /* tail call optimise */
+ b exit_vmx_usercopy /* tail call optimise */
#endif /* CONFIG_ALTIVEC */