Diffstat (limited to 'arch/powerpc/lib/memcpy_power7.S')
 -rw-r--r--  arch/powerpc/lib/memcpy_power7.S  81
 1 file changed, 45 insertions(+), 36 deletions(-)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index 0663630baf3..2ff5c142f87 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -20,18 +20,27 @@
 #include <asm/ppc_asm.h>
 
 _GLOBAL(memcpy_power7)
+
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
 #ifdef CONFIG_ALTIVEC
 	cmpldi	r5,16
 	cmpldi	cr1,r5,4096
-	std	r3,48(r1)
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
 	blt	.Lshort_copy
 	bgt	cr1,.Lvmx_copy
 #else
 	cmpldi	r5,16
-	std	r3,48(r1)
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
 	blt	.Lshort_copy
 #endif
@@ -207,7 +216,7 @@ _GLOBAL(memcpy_power7)
 	lbz	r0,0(r4)
 	stb	r0,0(r3)
-15:	ld	r3,48(r1)
+15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
 	blr
 
 .Lunwind_stack_nonvmx_copy:
@@ -217,16 +226,16 @@ _GLOBAL(memcpy_power7)
 #ifdef CONFIG_ALTIVEC
 .Lvmx_copy:
 	mflr	r0
-	std	r4,56(r1)
-	std	r5,64(r1)
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	.enter_vmx_copy
+	bl	enter_vmx_copy
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
-	ld	r3,STACKFRAMESIZE+48(r1)
-	ld	r4,STACKFRAMESIZE+56(r1)
-	ld	r5,STACKFRAMESIZE+64(r1)
+	ld	r3,STK_REG(R31)(r1)
+	ld	r4,STK_REG(R30)(r1)
+	ld	r5,STK_REG(R29)(r1)
 	mtlr	r0
 
 	/*
@@ -438,8 +447,8 @@ _GLOBAL(memcpy_power7)
 	stb	r0,0(r3)
 
15:	addi	r1,r1,STACKFRAMESIZE
-	ld	r3,48(r1)
-	b	.exit_vmx_copy		/* tail call optimise */
+	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	b	exit_vmx_copy		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -485,13 +494,13 @@ _GLOBAL(memcpy_power7)
 	li	r10,32
 	li	r11,48
 
-	lvsl	vr16,0,r4	/* Setup permute control vector */
+	LVS(vr16,0,r4)		/* Setup permute control vector */
 	lvx	vr0,0,r4
 	addi	r4,r4,16
 
 	bf	cr7*4+3,5f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 	stvx	vr8,r0,r3
 	addi	r3,r3,16
@@ -499,9 +508,9 @@ _GLOBAL(memcpy_power7)
 
5:	bf	cr7*4+2,6f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -509,13 +518,13 @@ _GLOBAL(memcpy_power7)
 
6:	bf	cr7*4+1,7f
 	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -544,21 +553,21 @@ _GLOBAL(memcpy_power7)
 	.align	5
8:
 	lvx	vr7,r0,r4
-	vperm	vr8,vr0,vr7,vr16
+	VPERM(vr8,vr0,vr7,vr16)
 	lvx	vr6,r4,r9
-	vperm	vr9,vr7,vr6,vr16
+	VPERM(vr9,vr7,vr6,vr16)
 	lvx	vr5,r4,r10
-	vperm	vr10,vr6,vr5,vr16
+	VPERM(vr10,vr6,vr5,vr16)
 	lvx	vr4,r4,r11
-	vperm	vr11,vr5,vr4,vr16
+	VPERM(vr11,vr5,vr4,vr16)
 	lvx	vr3,r4,r12
-	vperm	vr12,vr4,vr3,vr16
+	VPERM(vr12,vr4,vr3,vr16)
 	lvx	vr2,r4,r14
-	vperm	vr13,vr3,vr2,vr16
+	VPERM(vr13,vr3,vr2,vr16)
 	lvx	vr1,r4,r15
-	vperm	vr14,vr2,vr1,vr16
+	VPERM(vr14,vr2,vr1,vr16)
 	lvx	vr0,r4,r16
-	vperm	vr15,vr1,vr0,vr16
+	VPERM(vr15,vr1,vr0,vr16)
 	addi	r4,r4,128
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -582,13 +591,13 @@ _GLOBAL(memcpy_power7)
 
 	bf	cr7*4+1,9f
 	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -598,9 +607,9 @@ _GLOBAL(memcpy_power7)
 
9:	bf	cr7*4+2,10f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -608,7 +617,7 @@ _GLOBAL(memcpy_power7)
 
10:	bf	cr7*4+3,11f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 	stvx	vr8,r0,r3
 	addi	r3,r3,16
@@ -642,6 +651,6 @@ _GLOBAL(memcpy_power7)
 	stb	r0,0(r3)
 
15:	addi	r1,r1,STACKFRAMESIZE
-	ld	r3,48(r1)
-	b	.exit_vmx_copy		/* tail call optimise */
+	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	b	exit_vmx_copy		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
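A note on the LVS/VPERM hunk at the top of the diff: the unaligned copy loop only ever issues aligned lvx loads, then merges each pair of adjacent vectors through vperm (steered by the lvsl/lvsr control vector) to synthesize the bytes of the misaligned source window. The merge direction flips with byte order, which is why the little-endian variant uses lvsr and swaps the vperm source operands. The C program below is a minimal scalar sketch of the same realignment idea, not kernel code; the merge() helper and buffer layout are illustrative only.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/*
	 * Merge two adjacent aligned 8-byte words into the 8 bytes that
	 * begin `off` bytes (1..7) into the first word.  On big endian the
	 * byte at the lowest address sits in the high-order bits, so the
	 * shift directions flip with byte order -- the scalar analogue of
	 * swapping lvsl/lvsr and the vperm operand order in the diff above.
	 */
	static uint64_t merge(uint64_t lo_word, uint64_t hi_word, unsigned off)
	{
	#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
		return (lo_word << (8 * off)) | (hi_word >> (8 * (8 - off)));
	#else
		return (lo_word >> (8 * off)) | (hi_word << (8 * (8 - off)));
	#endif
	}

	int main(void)
	{
		uint8_t buf[16] __attribute__((aligned(8)));
		uint64_t lo, hi, merged, expect;
		unsigned off = 3;		/* simulated source misalignment */
		int i;

		for (i = 0; i < 16; i++)
			buf[i] = i;

		memcpy(&lo, buf, 8);		/* two aligned loads ...         */
		memcpy(&hi, buf + 8, 8);
		merged = merge(lo, hi, off);	/* ... yield one unaligned value */

		memcpy(&expect, buf + off, 8);	/* what an unaligned load reads  */
		printf("merge %s\n", merged == expect ? "matches" : "DIFFERS");
		return 0;
	}

With 16-byte vectors, lvsl/lvsr express the equivalent shift amount as a byte-permute control vector, and the two vperm operand orders perform the big- and little-endian merges respectively.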
