Diffstat (limited to 'arch/powerpc/lib')
25 files changed, 2711 insertions, 225 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 889f2bc106d..59fa2de9546 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -4,25 +4,32 @@  subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror -ccflags-$(CONFIG_PPC64)	:= -mno-minimal-toc +ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)  CFLAGS_REMOVE_code-patching.o = -pg  CFLAGS_REMOVE_feature-fixups.o = -pg  obj-y			:= string.o alloc.o \ -			   checksum_$(CONFIG_WORD_SIZE).o crtsavres.o +			   crtsavres.o  obj-$(CONFIG_PPC32)	+= div64.o copy_32.o  obj-$(CONFIG_HAS_IOMEM)	+= devres.o  obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \ -			   memcpy_64.o usercopy_64.o mem_64.o string.o \ -			   checksum_wrappers_64.o -obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o -obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o -obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o +			   usercopy_64.o mem_64.o string.o \ +			   hweight_64.o \ +			   copyuser_power7.o string_64.o copypage_power7.o +ifeq ($(CONFIG_GENERIC_CSUM),) +obj-y			+= checksum_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC64)	+= checksum_wrappers_64.o +endif + +obj-$(CONFIG_PPC64)		+= memcpy_power7.o memcpy_64.o  + +obj-$(CONFIG_PPC_EMULATE_SSTEP)	+= sstep.o ldstfp.o  ifeq ($(CONFIG_PPC64),y)  obj-$(CONFIG_SMP)	+= locks.o +obj-$(CONFIG_ALTIVEC)	+= vmx-helper.o  endif  obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o @@ -30,3 +37,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o  obj-y			+= code-patching.o  obj-y			+= feature-fixups.o  obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o + +obj-$(CONFIG_ALTIVEC)	+= xor_vmx.o +CFLAGS_xor_vmx.o += -maltivec -mabi=altivec diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c index f53e09c7dac..da22c84a8fe 100644 --- a/arch/powerpc/lib/alloc.c +++ b/arch/powerpc/lib/alloc.c @@ -3,16 +3,8 @@  #include <linux/slab.h>  #include <linux/bootmem.h>  #include <linux/string.h> +#include <asm/setup.h> -#include <asm/system.h> - -void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask) -{ -	if (mem_init_done) -		return kmalloc(size, mask); -	else -		return alloc_bootmem(size); -}  void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask)  { diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S index 18245af38ae..57a07206505 100644 --- a/arch/powerpc/lib/checksum_64.S +++ b/arch/powerpc/lib/checksum_64.S @@ -65,9 +65,6 @@ _GLOBAL(csum_tcpudp_magic)  	srwi	r3,r3,16  	blr -#define STACKFRAMESIZE 256 -#define STK_REG(i)	(112 + ((i)-14)*8) -  /*   * Computes the checksum of a memory block at buff, length len,   * and adds in "sum" (32-bit). @@ -114,9 +111,9 @@ _GLOBAL(csum_partial)  	mtctr	r6  	stdu	r1,-STACKFRAMESIZE(r1) -	std	r14,STK_REG(r14)(r1) -	std	r15,STK_REG(r15)(r1) -	std	r16,STK_REG(r16)(r1) +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1)  	ld	r6,0(r3)  	ld	r9,8(r3) @@ -175,9 +172,9 @@ _GLOBAL(csum_partial)  	adde	r0,r0,r15  	adde	r0,r0,r16 -	ld	r14,STK_REG(r14)(r1) -	ld	r15,STK_REG(r15)(r1) -	ld	r16,STK_REG(r16)(r1) +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1)  	addi	r1,r1,STACKFRAMESIZE  	andi.	
r4,r4,63 @@ -229,19 +226,35 @@ _GLOBAL(csum_partial)  	blr -	.macro source +	.macro srcnr  100:  	.section __ex_table,"a"  	.align 3 -	.llong 100b,.Lsrc_error +	.llong 100b,.Lsrc_error_nr  	.previous  	.endm -	.macro dest +	.macro source +150: +	.section __ex_table,"a" +	.align 3 +	.llong 150b,.Lsrc_error +	.previous +	.endm + +	.macro dstnr  200:  	.section __ex_table,"a"  	.align 3 -	.llong 200b,.Ldest_error +	.llong 200b,.Ldest_error_nr +	.previous +	.endm + +	.macro dest +250: +	.section __ex_table,"a" +	.align 3 +	.llong 250b,.Ldest_error  	.previous  	.endm @@ -272,16 +285,16 @@ _GLOBAL(csum_partial_copy_generic)  	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */  	beq	.Lcopy_aligned -	li	r7,4 -	sub	r6,r7,r6 +	li	r9,4 +	sub	r6,r9,r6  	mtctr	r6  1: -source;	lhz	r6,0(r3)		/* align to doubleword */ +srcnr;	lhz	r6,0(r3)		/* align to doubleword */  	subi	r5,r5,2  	addi	r3,r3,2  	adde	r0,r0,r6 -dest;	sth	r6,0(r4) +dstnr;	sth	r6,0(r4)  	addi	r4,r4,2  	bdnz	1b @@ -299,9 +312,9 @@ dest;	sth	r6,0(r4)  	mtctr	r6  	stdu	r1,-STACKFRAMESIZE(r1) -	std	r14,STK_REG(r14)(r1) -	std	r15,STK_REG(r15)(r1) -	std	r16,STK_REG(r16)(r1) +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1)  source;	ld	r6,0(r3)  source;	ld	r9,8(r3) @@ -382,9 +395,9 @@ dest;	std	r16,56(r4)  	adde	r0,r0,r15  	adde	r0,r0,r16 -	ld	r14,STK_REG(r14)(r1) -	ld	r15,STK_REG(r15)(r1) -	ld	r16,STK_REG(r16)(r1) +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1)  	addi	r1,r1,STACKFRAMESIZE  	andi.	r5,r5,63 @@ -395,10 +408,10 @@ dest;	std	r16,56(r4)  	mtctr	r6  3: -source;	ld	r6,0(r3) +srcnr;	ld	r6,0(r3)  	addi	r3,r3,8  	adde	r0,r0,r6 -dest;	std	r6,0(r4) +dstnr;	std	r6,0(r4)  	addi	r4,r4,8  	bdnz	3b @@ -408,10 +421,10 @@ dest;	std	r6,0(r4)  	srdi.	r6,r5,2  	beq	.Lcopy_tail_halfword -source;	lwz	r6,0(r3) +srcnr;	lwz	r6,0(r3)  	addi	r3,r3,4  	adde	r0,r0,r6 -dest;	stw	r6,0(r4) +dstnr;	stw	r6,0(r4)  	addi	r4,r4,4  	subi	r5,r5,4 @@ -419,10 +432,10 @@ dest;	stw	r6,0(r4)  	srdi.	r6,r5,1  	beq	.Lcopy_tail_byte -source;	lhz	r6,0(r3) +srcnr;	lhz	r6,0(r3)  	addi	r3,r3,2  	adde	r0,r0,r6 -dest;	sth	r6,0(r4) +dstnr;	sth	r6,0(r4)  	addi	r4,r4,2  	subi	r5,r5,2 @@ -430,10 +443,10 @@ dest;	sth	r6,0(r4)  	andi.	
r6,r5,1  	beq	.Lcopy_finish -source;	lbz	r6,0(r3) +srcnr;	lbz	r6,0(r3)  	sldi	r9,r6,8			/* Pad the byte out to 16 bits */  	adde	r0,r0,r9 -dest;	stb	r6,0(r4) +dstnr;	stb	r6,0(r4)  .Lcopy_finish:  	addze	r0,r0			/* add in final carry */ @@ -443,6 +456,11 @@ dest;	stb	r6,0(r4)  	blr  .Lsrc_error: +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) +	addi	r1,r1,STACKFRAMESIZE +.Lsrc_error_nr:  	cmpdi	0,r7,0  	beqlr  	li	r6,-EFAULT @@ -450,6 +468,11 @@ dest;	stb	r6,0(r4)  	blr  .Ldest_error: +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) +	addi	r1,r1,STACKFRAMESIZE +.Ldest_error_nr:  	cmpdi	0,r8,0  	beqlr  	li	r6,-EFAULT diff --git a/arch/powerpc/lib/checksum_wrappers_64.c b/arch/powerpc/lib/checksum_wrappers_64.c index 769b817fbb3..08e3a3356c4 100644 --- a/arch/powerpc/lib/checksum_wrappers_64.c +++ b/arch/powerpc/lib/checksum_wrappers_64.c @@ -17,7 +17,7 @@   *   * Author: Anton Blanchard <anton@au.ibm.com>   */ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/compiler.h>  #include <linux/types.h>  #include <asm/checksum.h> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 7c975d43e3f..d5edbeb8eb8 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -13,17 +13,23 @@  #include <linux/mm.h>  #include <asm/page.h>  #include <asm/code-patching.h> +#include <asm/uaccess.h> -void patch_instruction(unsigned int *addr, unsigned int instr) +int patch_instruction(unsigned int *addr, unsigned int instr)  { -	*addr = instr; +	int err; + +	__put_user_size(instr, addr, 4, err); +	if (err) +		return err;  	asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr)); +	return 0;  } -void patch_branch(unsigned int *addr, unsigned long target, int flags) +int patch_branch(unsigned int *addr, unsigned long target, int flags)  { -	patch_instruction(addr, create_branch(addr, target, flags)); +	return patch_instruction(addr, create_branch(addr, target, flags));  }  unsigned int create_branch(const unsigned int *addr, @@ -153,6 +159,21 @@ unsigned int translate_branch(const unsigned int *dest, const unsigned int *src)  	return 0;  } +#ifdef CONFIG_PPC_BOOK3E_64 +void __patch_exception(int exc, unsigned long addr) +{ +	extern unsigned int interrupt_base_book3e; +	unsigned int *ibase = &interrupt_base_book3e; + +	/* Our exceptions vectors start with a NOP and -then- a branch +	 * to deal with single stepping from userspace which stops on +	 * the second instruction. Thus we need to patch the second +	 * instruction of the exception, not the first one +	 */ + +	patch_branch(ibase + (exc / 4) + 1, addr, 0); +} +#endif  #ifdef CONFIG_CODE_PATCHING_SELFTEST diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S index 4d4eeb90048..a3c4dc4defd 100644 --- a/arch/powerpc/lib/copypage_64.S +++ b/arch/powerpc/lib/copypage_64.S @@ -6,6 +6,7 @@   * as published by the Free Software Foundation; either version   * 2 of the License, or (at your option) any later version.   
*/ +#include <asm/page.h>  #include <asm/processor.h>  #include <asm/ppc_asm.h>  #include <asm/asm-offsets.h> @@ -15,9 +16,13 @@ PPC64_CACHES:          .tc             ppc64_caches[TC],ppc64_caches          .section        ".text" - -_GLOBAL(copy_4K_page) -	li	r5,4096		/* 4K page size */ +_GLOBAL_TOC(copy_page) +BEGIN_FTR_SECTION +	lis	r5,PAGE_SIZE@h +FTR_SECTION_ELSE +	b	copypage_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +	ori	r5,r5,PAGE_SIZE@l  BEGIN_FTR_SECTION  	ld      r10,PPC64_CACHES@toc(r2)  	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */ diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S new file mode 100644 index 00000000000..d7dafb3777a --- /dev/null +++ b/arch/powerpc/lib/copypage_power7.S @@ -0,0 +1,168 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/page.h> +#include <asm/ppc_asm.h> + +_GLOBAL(copypage_power7) +	/* +	 * We prefetch both the source and destination using enhanced touch +	 * instructions. We use a stream ID of 0 for the load side and +	 * 1 for the store side. Since source and destination are page +	 * aligned we don't need to clear the bottom 7 bits of either +	 * address. 
+	 */ +	ori	r9,r3,1		/* stream=1 => to */ + +#ifdef CONFIG_PPC_64K_PAGES +	lis	r7,0x0E01	/* depth=7 +				 * units/cachelines=512 */ +#else +	lis	r7,0x0E00	/* depth=7 */ +	ori	r7,r7,0x1000	/* units/cachelines=32 */ +#endif +	ori	r10,r7,1	/* stream=1 */ + +	lis	r8,0x8000	/* GO=1 */ +	clrldi	r8,r8,32 + +.machine push +.machine "power4" +	/* setup read stream 0  */ +	dcbt	r0,r4,0b01000  	/* addr from */ +	dcbt	r0,r7,0b01010   /* length and depth from */ +	/* setup write stream 1 */ +	dcbtst	r0,r9,0b01000   /* addr to */ +	dcbtst	r0,r10,0b01010  /* length and depth to */ +	eieio +	dcbt	r0,r8,0b01010	/* all streams GO */ +.machine pop + +#ifdef CONFIG_ALTIVEC +	mflr	r0 +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1) +	std	r0,16(r1) +	stdu	r1,-STACKFRAMESIZE(r1) +	bl	enter_vmx_copy +	cmpwi	r3,0 +	ld	r0,STACKFRAMESIZE+16(r1) +	ld	r3,STK_REG(R31)(r1) +	ld	r4,STK_REG(R30)(r1) +	mtlr	r0 + +	li	r0,(PAGE_SIZE/128) +	mtctr	r0 + +	beq	.Lnonvmx_copy + +	addi	r1,r1,STACKFRAMESIZE + +	li	r6,16 +	li	r7,32 +	li	r8,48 +	li	r9,64 +	li	r10,80 +	li	r11,96 +	li	r12,112 + +	.align	5 +1:	lvx	vr7,r0,r4 +	lvx	vr6,r4,r6 +	lvx	vr5,r4,r7 +	lvx	vr4,r4,r8 +	lvx	vr3,r4,r9 +	lvx	vr2,r4,r10 +	lvx	vr1,r4,r11 +	lvx	vr0,r4,r12 +	addi	r4,r4,128 +	stvx	vr7,r0,r3 +	stvx	vr6,r3,r6 +	stvx	vr5,r3,r7 +	stvx	vr4,r3,r8 +	stvx	vr3,r3,r9 +	stvx	vr2,r3,r10 +	stvx	vr1,r3,r11 +	stvx	vr0,r3,r12 +	addi	r3,r3,128 +	bdnz	1b + +	b	exit_vmx_copy		/* tail call optimise */ + +#else +	li	r0,(PAGE_SIZE/128) +	mtctr	r0 + +	stdu	r1,-STACKFRAMESIZE(r1) +#endif + +.Lnonvmx_copy: +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1) +	std	r17,STK_REG(R17)(r1) +	std	r18,STK_REG(R18)(r1) +	std	r19,STK_REG(R19)(r1) +	std	r20,STK_REG(R20)(r1) + +1:	ld	r0,0(r4) +	ld	r5,8(r4) +	ld	r6,16(r4) +	ld	r7,24(r4) +	ld	r8,32(r4) +	ld	r9,40(r4) +	ld	r10,48(r4) +	ld	r11,56(r4) +	ld	r12,64(r4) +	ld	r14,72(r4) +	ld	r15,80(r4) +	ld	r16,88(r4) +	ld	r17,96(r4) +	ld	r18,104(r4) +	ld	r19,112(r4) +	ld	r20,120(r4) +	addi	r4,r4,128 +	std	r0,0(r3) +	std	r5,8(r3) +	std	r6,16(r3) +	std	r7,24(r3) +	std	r8,32(r3) +	std	r9,40(r3) +	std	r10,48(r3) +	std	r11,56(r3) +	std	r12,64(r3) +	std	r14,72(r3) +	std	r15,80(r3) +	std	r16,88(r3) +	std	r17,96(r3) +	std	r18,104(r3) +	std	r19,112(r3) +	std	r20,120(r3) +	addi	r3,r3,128 +	bdnz	1b + +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) +	ld	r17,STK_REG(R17)(r1) +	ld	r18,STK_REG(R18)(r1) +	ld	r19,STK_REG(R19)(r1) +	ld	r20,STK_REG(R20)(r1) +	addi	r1,r1,STACKFRAMESIZE +	blr diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index 578b625d6a3..0860ee46013 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -9,8 +9,22 @@  #include <asm/processor.h>  #include <asm/ppc_asm.h> +#ifdef __BIG_ENDIAN__ +#define sLd sld		/* Shift towards low-numbered address. */ +#define sHd srd		/* Shift towards high-numbered address. */ +#else +#define sLd srd		/* Shift towards low-numbered address. */ +#define sHd sld		/* Shift towards high-numbered address. */ +#endif +  	.align	7 -_GLOBAL(__copy_tofrom_user) +_GLOBAL_TOC(__copy_tofrom_user) +BEGIN_FTR_SECTION +	nop +FTR_SECTION_ELSE +	b	__copy_tofrom_user_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +_GLOBAL(__copy_tofrom_user_base)  	/* first check for a whole page copy on a page boundary */  	cmpldi	cr1,r5,16  	cmpdi	cr6,r5,4096 @@ -24,7 +38,7 @@ _GLOBAL(__copy_tofrom_user)  	dcbt	0,r4  	beq	.Lcopy_page_4K  	andi.	
r6,r6,7 -	PPC_MTOCRF	0x01,r5 +	PPC_MTOCRF(0x01,r5)  	blt	cr1,.Lshort_copy  /* Below we want to nop out the bne if we're on a CPU that has the   * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit @@ -112,10 +126,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)  24:	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */  25:	ld	r0,8(r4) -	sld	r6,r9,r10 +	sLd	r6,r9,r10  26:	ldu	r9,16(r4) -	srd	r7,r0,r11 -	sld	r8,r0,r10 +	sHd	r7,r0,r11 +	sLd	r8,r0,r10  	or	r7,r7,r6  	blt	cr6,79f  27:	ld	r0,8(r4) @@ -123,35 +137,35 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)  28:	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */  29:	ldu	r9,8(r4) -	sld	r8,r0,r10 +	sLd	r8,r0,r10  	addi	r3,r3,-8  	blt	cr6,5f  30:	ld	r0,8(r4) -	srd	r12,r9,r11 -	sld	r6,r9,r10 +	sHd	r12,r9,r11 +	sLd	r6,r9,r10  31:	ldu	r9,16(r4)  	or	r12,r8,r12 -	srd	r7,r0,r11 -	sld	r8,r0,r10 +	sHd	r7,r0,r11 +	sLd	r8,r0,r10  	addi	r3,r3,16  	beq	cr6,78f  1:	or	r7,r7,r6  32:	ld	r0,8(r4)  76:	std	r12,8(r3) -2:	srd	r12,r9,r11 -	sld	r6,r9,r10 +2:	sHd	r12,r9,r11 +	sLd	r6,r9,r10  33:	ldu	r9,16(r4)  	or	r12,r8,r12  77:	stdu	r7,16(r3) -	srd	r7,r0,r11 -	sld	r8,r0,r10 +	sHd	r7,r0,r11 +	sLd	r8,r0,r10  	bdnz	1b  78:	std	r12,8(r3)  	or	r7,r7,r6  79:	std	r7,16(r3) -5:	srd	r12,r9,r11 +5:	sHd	r12,r9,r11  	or	r12,r8,r12  80:	std	r12,24(r3)  	bne	6f @@ -159,28 +173,43 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)  	blr  6:	cmpwi	cr1,r5,8  	addi	r3,r3,32 -	sld	r9,r9,r10 +	sLd	r9,r9,r10  	ble	cr1,7f  34:	ld	r0,8(r4) -	srd	r7,r0,r11 +	sHd	r7,r0,r11  	or	r9,r7,r9  7:  	bf	cr7*4+1,1f +#ifdef __BIG_ENDIAN__  	rotldi	r9,r9,32 +#endif  94:	stw	r9,0(r3) +#ifdef __LITTLE_ENDIAN__ +	rotrdi	r9,r9,32 +#endif  	addi	r3,r3,4  1:	bf	cr7*4+2,2f +#ifdef __BIG_ENDIAN__  	rotldi	r9,r9,16 +#endif  95:	sth	r9,0(r3) +#ifdef __LITTLE_ENDIAN__ +	rotrdi	r9,r9,16 +#endif  	addi	r3,r3,2  2:	bf	cr7*4+3,3f +#ifdef __BIG_ENDIAN__  	rotldi	r9,r9,8 +#endif  96:	stb	r9,0(r3) +#ifdef __LITTLE_ENDIAN__ +	rotrdi	r9,r9,8 +#endif  3:	li	r3,0  	blr  .Ldst_unaligned: -	PPC_MTOCRF	0x01,r6		/* put #bytes to 8B bdry into cr7 */ +	PPC_MTOCRF(0x01,r6)		/* put #bytes to 8B bdry into cr7 */  	subf	r5,r6,r5  	li	r7,0  	cmpldi	cr1,r5,16 @@ -195,7 +224,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)  2:	bf	cr7*4+1,3f  37:	lwzx	r0,r7,r4  83:	stwx	r0,r7,r3 -3:	PPC_MTOCRF	0x01,r5 +3:	PPC_MTOCRF(0x01,r5)  	add	r4,r6,r4  	add	r3,r6,r3  	b	.Ldst_aligned diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S new file mode 100644 index 00000000000..c46c876ac96 --- /dev/null +++ b/arch/powerpc/lib/copyuser_power7.S @@ -0,0 +1,721 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2011 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC +#endif + +	.macro err1 +100: +	.section __ex_table,"a" +	.align 3 +	.llong 100b,.Ldo_err1 +	.previous +	.endm + +	.macro err2 +200: +	.section __ex_table,"a" +	.align 3 +	.llong 200b,.Ldo_err2 +	.previous +	.endm + +#ifdef CONFIG_ALTIVEC +	.macro err3 +300: +	.section __ex_table,"a" +	.align 3 +	.llong 300b,.Ldo_err3 +	.previous +	.endm + +	.macro err4 +400: +	.section __ex_table,"a" +	.align 3 +	.llong 400b,.Ldo_err4 +	.previous +	.endm + + +.Ldo_err4: +	ld	r16,STK_REG(R16)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r14,STK_REG(R14)(r1) +.Ldo_err3: +	bl	exit_vmx_usercopy +	ld	r0,STACKFRAMESIZE+16(r1) +	mtlr	r0 +	b	.Lexit +#endif /* CONFIG_ALTIVEC */ + +.Ldo_err2: +	ld	r22,STK_REG(R22)(r1) +	ld	r21,STK_REG(R21)(r1) +	ld	r20,STK_REG(R20)(r1) +	ld	r19,STK_REG(R19)(r1) +	ld	r18,STK_REG(R18)(r1) +	ld	r17,STK_REG(R17)(r1) +	ld	r16,STK_REG(R16)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r14,STK_REG(R14)(r1) +.Lexit: +	addi	r1,r1,STACKFRAMESIZE +.Ldo_err1: +	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) +	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1) +	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1) +	b	__copy_tofrom_user_base + + +_GLOBAL(__copy_tofrom_user_power7) +#ifdef CONFIG_ALTIVEC +	cmpldi	r5,16 +	cmpldi	cr1,r5,4096 + +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1) +	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + +	blt	.Lshort_copy +	bgt	cr1,.Lvmx_copy +#else +	cmpldi	r5,16 + +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1) +	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + +	blt	.Lshort_copy +#endif + +.Lnonvmx_copy: +	/* Get the source 8B aligned */ +	neg	r6,r4 +	mtocrf	0x01,r6 +	clrldi	r6,r6,(64-3) + +	bf	cr7*4+3,1f +err1;	lbz	r0,0(r4) +	addi	r4,r4,1 +err1;	stb	r0,0(r3) +	addi	r3,r3,1 + +1:	bf	cr7*4+2,2f +err1;	lhz	r0,0(r4) +	addi	r4,r4,2 +err1;	sth	r0,0(r3) +	addi	r3,r3,2 + +2:	bf	cr7*4+1,3f +err1;	lwz	r0,0(r4) +	addi	r4,r4,4 +err1;	stw	r0,0(r3) +	addi	r3,r3,4 + +3:	sub	r5,r5,r6 +	cmpldi	r5,128 +	blt	5f + +	mflr	r0 +	stdu	r1,-STACKFRAMESIZE(r1) +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1) +	std	r17,STK_REG(R17)(r1) +	std	r18,STK_REG(R18)(r1) +	std	r19,STK_REG(R19)(r1) +	std	r20,STK_REG(R20)(r1) +	std	r21,STK_REG(R21)(r1) +	std	r22,STK_REG(R22)(r1) +	std	r0,STACKFRAMESIZE+16(r1) + +	srdi	r6,r5,7 +	mtctr	r6 + +	/* Now do cacheline (128B) sized loads and stores. 
*/ +	.align	5 +4: +err2;	ld	r0,0(r4) +err2;	ld	r6,8(r4) +err2;	ld	r7,16(r4) +err2;	ld	r8,24(r4) +err2;	ld	r9,32(r4) +err2;	ld	r10,40(r4) +err2;	ld	r11,48(r4) +err2;	ld	r12,56(r4) +err2;	ld	r14,64(r4) +err2;	ld	r15,72(r4) +err2;	ld	r16,80(r4) +err2;	ld	r17,88(r4) +err2;	ld	r18,96(r4) +err2;	ld	r19,104(r4) +err2;	ld	r20,112(r4) +err2;	ld	r21,120(r4) +	addi	r4,r4,128 +err2;	std	r0,0(r3) +err2;	std	r6,8(r3) +err2;	std	r7,16(r3) +err2;	std	r8,24(r3) +err2;	std	r9,32(r3) +err2;	std	r10,40(r3) +err2;	std	r11,48(r3) +err2;	std	r12,56(r3) +err2;	std	r14,64(r3) +err2;	std	r15,72(r3) +err2;	std	r16,80(r3) +err2;	std	r17,88(r3) +err2;	std	r18,96(r3) +err2;	std	r19,104(r3) +err2;	std	r20,112(r3) +err2;	std	r21,120(r3) +	addi	r3,r3,128 +	bdnz	4b + +	clrldi	r5,r5,(64-7) + +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) +	ld	r17,STK_REG(R17)(r1) +	ld	r18,STK_REG(R18)(r1) +	ld	r19,STK_REG(R19)(r1) +	ld	r20,STK_REG(R20)(r1) +	ld	r21,STK_REG(R21)(r1) +	ld	r22,STK_REG(R22)(r1) +	addi	r1,r1,STACKFRAMESIZE + +	/* Up to 127B to go */ +5:	srdi	r6,r5,4 +	mtocrf	0x01,r6 + +6:	bf	cr7*4+1,7f +err1;	ld	r0,0(r4) +err1;	ld	r6,8(r4) +err1;	ld	r7,16(r4) +err1;	ld	r8,24(r4) +err1;	ld	r9,32(r4) +err1;	ld	r10,40(r4) +err1;	ld	r11,48(r4) +err1;	ld	r12,56(r4) +	addi	r4,r4,64 +err1;	std	r0,0(r3) +err1;	std	r6,8(r3) +err1;	std	r7,16(r3) +err1;	std	r8,24(r3) +err1;	std	r9,32(r3) +err1;	std	r10,40(r3) +err1;	std	r11,48(r3) +err1;	std	r12,56(r3) +	addi	r3,r3,64 + +	/* Up to 63B to go */ +7:	bf	cr7*4+2,8f +err1;	ld	r0,0(r4) +err1;	ld	r6,8(r4) +err1;	ld	r7,16(r4) +err1;	ld	r8,24(r4) +	addi	r4,r4,32 +err1;	std	r0,0(r3) +err1;	std	r6,8(r3) +err1;	std	r7,16(r3) +err1;	std	r8,24(r3) +	addi	r3,r3,32 + +	/* Up to 31B to go */ +8:	bf	cr7*4+3,9f +err1;	ld	r0,0(r4) +err1;	ld	r6,8(r4) +	addi	r4,r4,16 +err1;	std	r0,0(r3) +err1;	std	r6,8(r3) +	addi	r3,r3,16 + +9:	clrldi	r5,r5,(64-4) + +	/* Up to 15B to go */ +.Lshort_copy: +	mtocrf	0x01,r5 +	bf	cr7*4+0,12f +err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */ +err1;	lwz	r6,4(r4) +	addi	r4,r4,8 +err1;	stw	r0,0(r3) +err1;	stw	r6,4(r3) +	addi	r3,r3,8 + +12:	bf	cr7*4+1,13f +err1;	lwz	r0,0(r4) +	addi	r4,r4,4 +err1;	stw	r0,0(r3) +	addi	r3,r3,4 + +13:	bf	cr7*4+2,14f +err1;	lhz	r0,0(r4) +	addi	r4,r4,2 +err1;	sth	r0,0(r3) +	addi	r3,r3,2 + +14:	bf	cr7*4+3,15f +err1;	lbz	r0,0(r4) +err1;	stb	r0,0(r3) + +15:	li	r3,0 +	blr + +.Lunwind_stack_nonvmx_copy: +	addi	r1,r1,STACKFRAMESIZE +	b	.Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: +	mflr	r0 +	std	r0,16(r1) +	stdu	r1,-STACKFRAMESIZE(r1) +	bl	enter_vmx_usercopy +	cmpwi	cr1,r3,0 +	ld	r0,STACKFRAMESIZE+16(r1) +	ld	r3,STK_REG(R31)(r1) +	ld	r4,STK_REG(R30)(r1) +	ld	r5,STK_REG(R29)(r1) +	mtlr	r0 + +	/* +	 * We prefetch both the source and destination using enhanced touch +	 * instructions. We use a stream ID of 0 for the load side and +	 * 1 for the store side. 
+	 */ +	clrrdi	r6,r4,7 +	clrrdi	r9,r3,7 +	ori	r9,r9,1		/* stream=1 */ + +	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */ +	cmpldi	r7,0x3FF +	ble	1f +	li	r7,0x3FF +1:	lis	r0,0x0E00	/* depth=7 */ +	sldi	r7,r7,7 +	or	r7,r7,r0 +	ori	r10,r7,1	/* stream=1 */ + +	lis	r8,0x8000	/* GO=1 */ +	clrldi	r8,r8,32 + +.machine push +.machine "power4" +	/* setup read stream 0 */ +	dcbt	r0,r6,0b01000   /* addr from */ +	dcbt	r0,r7,0b01010   /* length and depth from */ +	/* setup write stream 1 */ +	dcbtst	r0,r9,0b01000   /* addr to */ +	dcbtst	r0,r10,0b01010  /* length and depth to */ +	eieio +	dcbt	r0,r8,0b01010	/* all streams GO */ +.machine pop + +	beq	cr1,.Lunwind_stack_nonvmx_copy + +	/* +	 * If source and destination are not relatively aligned we use a +	 * slower permute loop. +	 */ +	xor	r6,r4,r3 +	rldicl.	r6,r6,0,(64-4) +	bne	.Lvmx_unaligned_copy + +	/* Get the destination 16B aligned */ +	neg	r6,r3 +	mtocrf	0x01,r6 +	clrldi	r6,r6,(64-4) + +	bf	cr7*4+3,1f +err3;	lbz	r0,0(r4) +	addi	r4,r4,1 +err3;	stb	r0,0(r3) +	addi	r3,r3,1 + +1:	bf	cr7*4+2,2f +err3;	lhz	r0,0(r4) +	addi	r4,r4,2 +err3;	sth	r0,0(r3) +	addi	r3,r3,2 + +2:	bf	cr7*4+1,3f +err3;	lwz	r0,0(r4) +	addi	r4,r4,4 +err3;	stw	r0,0(r3) +	addi	r3,r3,4 + +3:	bf	cr7*4+0,4f +err3;	ld	r0,0(r4) +	addi	r4,r4,8 +err3;	std	r0,0(r3) +	addi	r3,r3,8 + +4:	sub	r5,r5,r6 + +	/* Get the desination 128B aligned */ +	neg	r6,r3 +	srdi	r7,r6,4 +	mtocrf	0x01,r7 +	clrldi	r6,r6,(64-7) + +	li	r9,16 +	li	r10,32 +	li	r11,48 + +	bf	cr7*4+3,5f +err3;	lvx	vr1,r0,r4 +	addi	r4,r4,16 +err3;	stvx	vr1,r0,r3 +	addi	r3,r3,16 + +5:	bf	cr7*4+2,6f +err3;	lvx	vr1,r0,r4 +err3;	lvx	vr0,r4,r9 +	addi	r4,r4,32 +err3;	stvx	vr1,r0,r3 +err3;	stvx	vr0,r3,r9 +	addi	r3,r3,32 + +6:	bf	cr7*4+1,7f +err3;	lvx	vr3,r0,r4 +err3;	lvx	vr2,r4,r9 +err3;	lvx	vr1,r4,r10 +err3;	lvx	vr0,r4,r11 +	addi	r4,r4,64 +err3;	stvx	vr3,r0,r3 +err3;	stvx	vr2,r3,r9 +err3;	stvx	vr1,r3,r10 +err3;	stvx	vr0,r3,r11 +	addi	r3,r3,64 + +7:	sub	r5,r5,r6 +	srdi	r6,r5,7 + +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1) + +	li	r12,64 +	li	r14,80 +	li	r15,96 +	li	r16,112 + +	mtctr	r6 + +	/* +	 * Now do cacheline sized loads and stores. By this stage the +	 * cacheline stores are also cacheline aligned. 
+	 */ +	.align	5 +8: +err4;	lvx	vr7,r0,r4 +err4;	lvx	vr6,r4,r9 +err4;	lvx	vr5,r4,r10 +err4;	lvx	vr4,r4,r11 +err4;	lvx	vr3,r4,r12 +err4;	lvx	vr2,r4,r14 +err4;	lvx	vr1,r4,r15 +err4;	lvx	vr0,r4,r16 +	addi	r4,r4,128 +err4;	stvx	vr7,r0,r3 +err4;	stvx	vr6,r3,r9 +err4;	stvx	vr5,r3,r10 +err4;	stvx	vr4,r3,r11 +err4;	stvx	vr3,r3,r12 +err4;	stvx	vr2,r3,r14 +err4;	stvx	vr1,r3,r15 +err4;	stvx	vr0,r3,r16 +	addi	r3,r3,128 +	bdnz	8b + +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) + +	/* Up to 127B to go */ +	clrldi	r5,r5,(64-7) +	srdi	r6,r5,4 +	mtocrf	0x01,r6 + +	bf	cr7*4+1,9f +err3;	lvx	vr3,r0,r4 +err3;	lvx	vr2,r4,r9 +err3;	lvx	vr1,r4,r10 +err3;	lvx	vr0,r4,r11 +	addi	r4,r4,64 +err3;	stvx	vr3,r0,r3 +err3;	stvx	vr2,r3,r9 +err3;	stvx	vr1,r3,r10 +err3;	stvx	vr0,r3,r11 +	addi	r3,r3,64 + +9:	bf	cr7*4+2,10f +err3;	lvx	vr1,r0,r4 +err3;	lvx	vr0,r4,r9 +	addi	r4,r4,32 +err3;	stvx	vr1,r0,r3 +err3;	stvx	vr0,r3,r9 +	addi	r3,r3,32 + +10:	bf	cr7*4+3,11f +err3;	lvx	vr1,r0,r4 +	addi	r4,r4,16 +err3;	stvx	vr1,r0,r3 +	addi	r3,r3,16 + +	/* Up to 15B to go */ +11:	clrldi	r5,r5,(64-4) +	mtocrf	0x01,r5 +	bf	cr7*4+0,12f +err3;	ld	r0,0(r4) +	addi	r4,r4,8 +err3;	std	r0,0(r3) +	addi	r3,r3,8 + +12:	bf	cr7*4+1,13f +err3;	lwz	r0,0(r4) +	addi	r4,r4,4 +err3;	stw	r0,0(r3) +	addi	r3,r3,4 + +13:	bf	cr7*4+2,14f +err3;	lhz	r0,0(r4) +	addi	r4,r4,2 +err3;	sth	r0,0(r3) +	addi	r3,r3,2 + +14:	bf	cr7*4+3,15f +err3;	lbz	r0,0(r4) +err3;	stb	r0,0(r3) + +15:	addi	r1,r1,STACKFRAMESIZE +	b	exit_vmx_usercopy	/* tail call optimise */ + +.Lvmx_unaligned_copy: +	/* Get the destination 16B aligned */ +	neg	r6,r3 +	mtocrf	0x01,r6 +	clrldi	r6,r6,(64-4) + +	bf	cr7*4+3,1f +err3;	lbz	r0,0(r4) +	addi	r4,r4,1 +err3;	stb	r0,0(r3) +	addi	r3,r3,1 + +1:	bf	cr7*4+2,2f +err3;	lhz	r0,0(r4) +	addi	r4,r4,2 +err3;	sth	r0,0(r3) +	addi	r3,r3,2 + +2:	bf	cr7*4+1,3f +err3;	lwz	r0,0(r4) +	addi	r4,r4,4 +err3;	stw	r0,0(r3) +	addi	r3,r3,4 + +3:	bf	cr7*4+0,4f +err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */ +err3;	lwz	r7,4(r4) +	addi	r4,r4,8 +err3;	stw	r0,0(r3) +err3;	stw	r7,4(r3) +	addi	r3,r3,8 + +4:	sub	r5,r5,r6 + +	/* Get the desination 128B aligned */ +	neg	r6,r3 +	srdi	r7,r6,4 +	mtocrf	0x01,r7 +	clrldi	r6,r6,(64-7) + +	li	r9,16 +	li	r10,32 +	li	r11,48 + +	LVS(vr16,0,r4)		/* Setup permute control vector */ +err3;	lvx	vr0,0,r4 +	addi	r4,r4,16 + +	bf	cr7*4+3,5f +err3;	lvx	vr1,r0,r4 +	VPERM(vr8,vr0,vr1,vr16) +	addi	r4,r4,16 +err3;	stvx	vr8,r0,r3 +	addi	r3,r3,16 +	vor	vr0,vr1,vr1 + +5:	bf	cr7*4+2,6f +err3;	lvx	vr1,r0,r4 +	VPERM(vr8,vr0,vr1,vr16) +err3;	lvx	vr0,r4,r9 +	VPERM(vr9,vr1,vr0,vr16) +	addi	r4,r4,32 +err3;	stvx	vr8,r0,r3 +err3;	stvx	vr9,r3,r9 +	addi	r3,r3,32 + +6:	bf	cr7*4+1,7f +err3;	lvx	vr3,r0,r4 +	VPERM(vr8,vr0,vr3,vr16) +err3;	lvx	vr2,r4,r9 +	VPERM(vr9,vr3,vr2,vr16) +err3;	lvx	vr1,r4,r10 +	VPERM(vr10,vr2,vr1,vr16) +err3;	lvx	vr0,r4,r11 +	VPERM(vr11,vr1,vr0,vr16) +	addi	r4,r4,64 +err3;	stvx	vr8,r0,r3 +err3;	stvx	vr9,r3,r9 +err3;	stvx	vr10,r3,r10 +err3;	stvx	vr11,r3,r11 +	addi	r3,r3,64 + +7:	sub	r5,r5,r6 +	srdi	r6,r5,7 + +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1) + +	li	r12,64 +	li	r14,80 +	li	r15,96 +	li	r16,112 + +	mtctr	r6 + +	/* +	 * Now do cacheline sized loads and stores. By this stage the +	 * cacheline stores are also cacheline aligned. 
+	 */ +	.align	5 +8: +err4;	lvx	vr7,r0,r4 +	VPERM(vr8,vr0,vr7,vr16) +err4;	lvx	vr6,r4,r9 +	VPERM(vr9,vr7,vr6,vr16) +err4;	lvx	vr5,r4,r10 +	VPERM(vr10,vr6,vr5,vr16) +err4;	lvx	vr4,r4,r11 +	VPERM(vr11,vr5,vr4,vr16) +err4;	lvx	vr3,r4,r12 +	VPERM(vr12,vr4,vr3,vr16) +err4;	lvx	vr2,r4,r14 +	VPERM(vr13,vr3,vr2,vr16) +err4;	lvx	vr1,r4,r15 +	VPERM(vr14,vr2,vr1,vr16) +err4;	lvx	vr0,r4,r16 +	VPERM(vr15,vr1,vr0,vr16) +	addi	r4,r4,128 +err4;	stvx	vr8,r0,r3 +err4;	stvx	vr9,r3,r9 +err4;	stvx	vr10,r3,r10 +err4;	stvx	vr11,r3,r11 +err4;	stvx	vr12,r3,r12 +err4;	stvx	vr13,r3,r14 +err4;	stvx	vr14,r3,r15 +err4;	stvx	vr15,r3,r16 +	addi	r3,r3,128 +	bdnz	8b + +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) + +	/* Up to 127B to go */ +	clrldi	r5,r5,(64-7) +	srdi	r6,r5,4 +	mtocrf	0x01,r6 + +	bf	cr7*4+1,9f +err3;	lvx	vr3,r0,r4 +	VPERM(vr8,vr0,vr3,vr16) +err3;	lvx	vr2,r4,r9 +	VPERM(vr9,vr3,vr2,vr16) +err3;	lvx	vr1,r4,r10 +	VPERM(vr10,vr2,vr1,vr16) +err3;	lvx	vr0,r4,r11 +	VPERM(vr11,vr1,vr0,vr16) +	addi	r4,r4,64 +err3;	stvx	vr8,r0,r3 +err3;	stvx	vr9,r3,r9 +err3;	stvx	vr10,r3,r10 +err3;	stvx	vr11,r3,r11 +	addi	r3,r3,64 + +9:	bf	cr7*4+2,10f +err3;	lvx	vr1,r0,r4 +	VPERM(vr8,vr0,vr1,vr16) +err3;	lvx	vr0,r4,r9 +	VPERM(vr9,vr1,vr0,vr16) +	addi	r4,r4,32 +err3;	stvx	vr8,r0,r3 +err3;	stvx	vr9,r3,r9 +	addi	r3,r3,32 + +10:	bf	cr7*4+3,11f +err3;	lvx	vr1,r0,r4 +	VPERM(vr8,vr0,vr1,vr16) +	addi	r4,r4,16 +err3;	stvx	vr8,r0,r3 +	addi	r3,r3,16 + +	/* Up to 15B to go */ +11:	clrldi	r5,r5,(64-4) +	addi	r4,r4,-16	/* Unwind the +16 load offset */ +	mtocrf	0x01,r5 +	bf	cr7*4+0,12f +err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */ +err3;	lwz	r6,4(r4) +	addi	r4,r4,8 +err3;	stw	r0,0(r3) +err3;	stw	r6,4(r3) +	addi	r3,r3,8 + +12:	bf	cr7*4+1,13f +err3;	lwz	r0,0(r4) +	addi	r4,r4,4 +err3;	stw	r0,0(r3) +	addi	r3,r3,4 + +13:	bf	cr7*4+2,14f +err3;	lhz	r0,0(r4) +	addi	r4,r4,2 +err3;	sth	r0,0(r3) +	addi	r3,r3,2 + +14:	bf	cr7*4+3,15f +err3;	lbz	r0,0(r4) +err3;	stb	r0,0(r3) + +15:	addi	r1,r1,STACKFRAMESIZE +	b	exit_vmx_usercopy	/* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S index 1c893f05d22..a5b30c71a8d 100644 --- a/arch/powerpc/lib/crtsavres.S +++ b/arch/powerpc/lib/crtsavres.S @@ -41,12 +41,13 @@  #include <asm/ppc_asm.h>  	.file	"crtsavres.S" -	.section ".text"  #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE  #ifndef CONFIG_PPC64 +	.section ".text" +  /* Routines for saving integer registers, called by the compiler.  */  /* Called with r11 pointing to the stack header word of the caller of the */  /* function, just beyond the end of the integer save area.  */ @@ -230,8 +231,91 @@ _GLOBAL(_rest32gpr_31_x)  	mr	1,11  	blr +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area.  
*/ + +_GLOBAL(_savevr_20) +	li	r11,-192 +	stvx	vr20,r11,r0 +_GLOBAL(_savevr_21) +	li	r11,-176 +	stvx	vr21,r11,r0 +_GLOBAL(_savevr_22) +	li	r11,-160 +	stvx	vr22,r11,r0 +_GLOBAL(_savevr_23) +	li	r11,-144 +	stvx	vr23,r11,r0 +_GLOBAL(_savevr_24) +	li	r11,-128 +	stvx	vr24,r11,r0 +_GLOBAL(_savevr_25) +	li	r11,-112 +	stvx	vr25,r11,r0 +_GLOBAL(_savevr_26) +	li	r11,-96 +	stvx	vr26,r11,r0 +_GLOBAL(_savevr_27) +	li	r11,-80 +	stvx	vr27,r11,r0 +_GLOBAL(_savevr_28) +	li	r11,-64 +	stvx	vr28,r11,r0 +_GLOBAL(_savevr_29) +	li	r11,-48 +	stvx	vr29,r11,r0 +_GLOBAL(_savevr_30) +	li	r11,-32 +	stvx	vr30,r11,r0 +_GLOBAL(_savevr_31) +	li	r11,-16 +	stvx	vr31,r11,r0 +	blr + +_GLOBAL(_restvr_20) +	li	r11,-192 +	lvx	vr20,r11,r0 +_GLOBAL(_restvr_21) +	li	r11,-176 +	lvx	vr21,r11,r0 +_GLOBAL(_restvr_22) +	li	r11,-160 +	lvx	vr22,r11,r0 +_GLOBAL(_restvr_23) +	li	r11,-144 +	lvx	vr23,r11,r0 +_GLOBAL(_restvr_24) +	li	r11,-128 +	lvx	vr24,r11,r0 +_GLOBAL(_restvr_25) +	li	r11,-112 +	lvx	vr25,r11,r0 +_GLOBAL(_restvr_26) +	li	r11,-96 +	lvx	vr26,r11,r0 +_GLOBAL(_restvr_27) +	li	r11,-80 +	lvx	vr27,r11,r0 +_GLOBAL(_restvr_28) +	li	r11,-64 +	lvx	vr28,r11,r0 +_GLOBAL(_restvr_29) +	li	r11,-48 +	lvx	vr29,r11,r0 +_GLOBAL(_restvr_30) +	li	r11,-32 +	lvx	vr30,r11,r0 +_GLOBAL(_restvr_31) +	li	r11,-16 +	lvx	vr31,r11,r0 +	blr + +#endif /* CONFIG_ALTIVEC */ +  #else /* CONFIG_PPC64 */ +	.section ".text.save.restore","ax",@progbits +  .globl	_savegpr0_14  _savegpr0_14:  	std	r14,-144(r1) @@ -353,6 +437,111 @@ _restgpr0_31:  	mtlr	r0  	blr +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area.  */ + +.globl	_savevr_20 +_savevr_20: +	li	r12,-192 +	stvx	vr20,r12,r0 +.globl	_savevr_21 +_savevr_21: +	li	r12,-176 +	stvx	vr21,r12,r0 +.globl	_savevr_22 +_savevr_22: +	li	r12,-160 +	stvx	vr22,r12,r0 +.globl	_savevr_23 +_savevr_23: +	li	r12,-144 +	stvx	vr23,r12,r0 +.globl	_savevr_24 +_savevr_24: +	li	r12,-128 +	stvx	vr24,r12,r0 +.globl	_savevr_25 +_savevr_25: +	li	r12,-112 +	stvx	vr25,r12,r0 +.globl	_savevr_26 +_savevr_26: +	li	r12,-96 +	stvx	vr26,r12,r0 +.globl	_savevr_27 +_savevr_27: +	li	r12,-80 +	stvx	vr27,r12,r0 +.globl	_savevr_28 +_savevr_28: +	li	r12,-64 +	stvx	vr28,r12,r0 +.globl	_savevr_29 +_savevr_29: +	li	r12,-48 +	stvx	vr29,r12,r0 +.globl	_savevr_30 +_savevr_30: +	li	r12,-32 +	stvx	vr30,r12,r0 +.globl	_savevr_31 +_savevr_31: +	li	r12,-16 +	stvx	vr31,r12,r0 +	blr + +.globl	_restvr_20 +_restvr_20: +	li	r12,-192 +	lvx	vr20,r12,r0 +.globl	_restvr_21 +_restvr_21: +	li	r12,-176 +	lvx	vr21,r12,r0 +.globl	_restvr_22 +_restvr_22: +	li	r12,-160 +	lvx	vr22,r12,r0 +.globl	_restvr_23 +_restvr_23: +	li	r12,-144 +	lvx	vr23,r12,r0 +.globl	_restvr_24 +_restvr_24: +	li	r12,-128 +	lvx	vr24,r12,r0 +.globl	_restvr_25 +_restvr_25: +	li	r12,-112 +	lvx	vr25,r12,r0 +.globl	_restvr_26 +_restvr_26: +	li	r12,-96 +	lvx	vr26,r12,r0 +.globl	_restvr_27 +_restvr_27: +	li	r12,-80 +	lvx	vr27,r12,r0 +.globl	_restvr_28 +_restvr_28: +	li	r12,-64 +	lvx	vr28,r12,r0 +.globl	_restvr_29 +_restvr_29: +	li	r12,-48 +	lvx	vr29,r12,r0 +.globl	_restvr_30 +_restvr_30: +	li	r12,-32 +	lvx	vr30,r12,r0 +.globl	_restvr_31 +_restvr_31: +	li	r12,-16 +	lvx	vr31,r12,r0 +	blr + +#endif /* CONFIG_ALTIVEC */ +  #endif /* CONFIG_PPC64 */  #endif diff --git a/arch/powerpc/lib/devres.c b/arch/powerpc/lib/devres.c index deac4d30daf..8df55fc3aad 100644 --- a/arch/powerpc/lib/devres.c +++ b/arch/powerpc/lib/devres.c @@ -9,11 +9,11 @@  #include <linux/device.h>	/* devres_*(), devm_ioremap_release() */  #include <linux/gfp.h> -#include <linux/io.h>		/* ioremap_flags() */ 
-#include <linux/module.h>	/* EXPORT_SYMBOL() */ +#include <linux/io.h>		/* ioremap_prot() */ +#include <linux/export.h>	/* EXPORT_SYMBOL() */  /** - * devm_ioremap_prot - Managed ioremap_flags() + * devm_ioremap_prot - Managed ioremap_prot()   * @dev: Generic device to remap IO address for   * @offset: BUS offset to map   * @size: Size of map @@ -31,7 +31,7 @@ void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset,  	if (!ptr)  		return NULL; -	addr = ioremap_flags(offset, size, flags); +	addr = ioremap_prot(offset, size, flags);  	if (addr) {  		*ptr = addr;  		devres_add(dev, ptr); diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S index cb737484c5a..f4613118132 100644 --- a/arch/powerpc/lib/feature-fixups-test.S +++ b/arch/powerpc/lib/feature-fixups-test.S @@ -172,6 +172,25 @@ globl(ftr_fixup_test6_expected)  3:	or	3,3,3 +#if 0 +/* Test that if we have a larger else case the assembler spots it and + * reports an error. #if 0'ed so as not to break the build normally. + */ +ftr_fixup_test7: +	or	1,1,1 +BEGIN_FTR_SECTION +	or	2,2,2 +	or	2,2,2 +	or	2,2,2 +FTR_SECTION_ELSE +	or	3,3,3 +	or	3,3,3 +	or	3,3,3 +	or	3,3,3 +ALT_FTR_SECTION_END(0, 1) +	or	1,1,1 +#endif +  #define	MAKE_MACRO_TEST(TYPE)						\  globl(ftr_fixup_test_ ##TYPE##_macros)					\  	or	1,1,1;							\ diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 0d08d017139..7a8a7487cee 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -18,6 +18,8 @@  #include <linux/init.h>  #include <asm/cputable.h>  #include <asm/code-patching.h> +#include <asm/page.h> +#include <asm/sections.h>  struct fixup_entry { @@ -128,6 +130,27 @@ void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)  	}  } +void do_final_fixups(void) +{ +#if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE) +	int *src, *dest; +	unsigned long length; + +	if (PHYSICAL_START == 0) +		return; + +	src = (int *)(KERNELBASE + PHYSICAL_START); +	dest = (int *)KERNELBASE; +	length = (__end_interrupts - _stext) / sizeof(int); + +	while (length--) { +		patch_instruction(dest, *src); +		src++; +		dest++; +	} +#endif +} +  #ifdef CONFIG_FTR_FIXUP_SELFTEST  #define check(x)	\ diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S new file mode 100644 index 00000000000..19e66001a4f --- /dev/null +++ b/arch/powerpc/lib/hweight_64.S @@ -0,0 +1,110 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> + +/* Note: This code relies on -mminimal-toc */ + +_GLOBAL(__arch_hweight8) +BEGIN_FTR_SECTION +	b __sw_hweight8 +	nop +	nop +FTR_SECTION_ELSE +	PPC_POPCNTB(R3,R3) +	clrldi	r3,r3,64-8 +	blr +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight16) +BEGIN_FTR_SECTION +	b __sw_hweight16 +	nop +	nop +	nop +	nop +FTR_SECTION_ELSE +  BEGIN_FTR_SECTION_NESTED(50) +	PPC_POPCNTB(R3,R3) +	srdi	r4,r3,8 +	add	r3,r4,r3 +	clrldi	r3,r3,64-8 +	blr +  FTR_SECTION_ELSE_NESTED(50) +	clrlwi  r3,r3,16 +	PPC_POPCNTW(R3,R3) +	clrldi	r3,r3,64-8 +	blr +  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight32) +BEGIN_FTR_SECTION +	b __sw_hweight32 +	nop +	nop +	nop +	nop +	nop +	nop +FTR_SECTION_ELSE +  BEGIN_FTR_SECTION_NESTED(51) +	PPC_POPCNTB(R3,R3) +	srdi	r4,r3,16 +	add	r3,r4,r3 +	srdi	r4,r3,8 +	add	r3,r4,r3 +	clrldi	r3,r3,64-8 +	blr +  FTR_SECTION_ELSE_NESTED(51) +	PPC_POPCNTW(R3,R3) +	clrldi	r3,r3,64-8 +	blr +  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight64) +BEGIN_FTR_SECTION +	b __sw_hweight64 +	nop +	nop +	nop +	nop +	nop +	nop +	nop +	nop +FTR_SECTION_ELSE +  BEGIN_FTR_SECTION_NESTED(52) +	PPC_POPCNTB(R3,R3) +	srdi	r4,r3,32 +	add	r3,r4,r3 +	srdi	r4,r3,16 +	add	r3,r4,r3 +	srdi	r4,r3,8 +	add	r3,r4,r3 +	clrldi	r3,r3,64-8 +	blr +  FTR_SECTION_ELSE_NESTED(52) +	PPC_POPCNTD(R3,R3) +	clrldi	r3,r3,64-8 +	blr +  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S index 6a85380520b..85aec08ab23 100644 --- a/arch/powerpc/lib/ldstfp.S +++ b/arch/powerpc/lib/ldstfp.S @@ -330,13 +330,13 @@ _GLOBAL(do_lxvd2x)  	MTMSRD(r7)  	isync  	beq	cr7,1f -	STXVD2X(0,r1,r8) +	STXVD2X(0,R1,R8)  1:	li	r9,-EFAULT -2:	LXVD2X(0,0,r4) +2:	LXVD2X(0,R0,R4)  	li	r9,0  3:	beq	cr7,4f  	bl	put_vsr -	LXVD2X(0,r1,r8) +	LXVD2X(0,R1,R8)  4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)  	mtlr	r0  	MTMSRD(r6) @@ -358,13 +358,13 @@ _GLOBAL(do_stxvd2x)  	MTMSRD(r7)  	isync  	beq	cr7,1f -	STXVD2X(0,r1,r8) +	STXVD2X(0,R1,R8)  	bl	get_vsr  1:	li	r9,-EFAULT -2:	STXVD2X(0,0,r4) +2:	STXVD2X(0,R0,R4)  	li	r9,0  3:	beq	cr7,4f -	LXVD2X(0,r1,r8) +	LXVD2X(0,R1,R8)  4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)  	mtlr	r0  	MTMSRD(r6) diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 9b8182e8216..0c9c8d7d073 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -14,16 +14,14 @@  #include <linux/kernel.h>  #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/stringify.h>  #include <linux/smp.h>  /* waiting for a spinlock... 
*/ -#if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES) +#if defined(CONFIG_PPC_SPLPAR)  #include <asm/hvcall.h> -#include <asm/iseries/hv_call.h>  #include <asm/smp.h> -#include <asm/firmware.h>  void __spin_yield(arch_spinlock_t *lock)  { @@ -34,20 +32,14 @@ void __spin_yield(arch_spinlock_t *lock)  		return;  	holder_cpu = lock_value & 0xffff;  	BUG_ON(holder_cpu >= NR_CPUS); -	yield_count = lppaca_of(holder_cpu).yield_count; +	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);  	if ((yield_count & 1) == 0)  		return;		/* virtual cpu is currently running */  	rmb();  	if (lock->slock != lock_value)  		return;		/* something has changed */ -	if (firmware_has_feature(FW_FEATURE_ISERIES)) -		HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc, -			((u64)holder_cpu << 32) | yield_count); -#ifdef CONFIG_PPC_SPLPAR -	else -		plpar_hcall_norets(H_CONFER, -			get_hard_smp_processor_id(holder_cpu), yield_count); -#endif +	plpar_hcall_norets(H_CONFER, +		get_hard_smp_processor_id(holder_cpu), yield_count);  }  /* @@ -65,20 +57,14 @@ void __rw_yield(arch_rwlock_t *rw)  		return;		/* no write lock at present */  	holder_cpu = lock_value & 0xffff;  	BUG_ON(holder_cpu >= NR_CPUS); -	yield_count = lppaca_of(holder_cpu).yield_count; +	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);  	if ((yield_count & 1) == 0)  		return;		/* virtual cpu is currently running */  	rmb();  	if (rw->lock != lock_value)  		return;		/* something has changed */ -	if (firmware_has_feature(FW_FEATURE_ISERIES)) -		HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc, -			((u64)holder_cpu << 32) | yield_count); -#ifdef CONFIG_PPC_SPLPAR -	else -		plpar_hcall_norets(H_CONFER, -			get_hard_smp_processor_id(holder_cpu), yield_count); -#endif +	plpar_hcall_norets(H_CONFER, +		get_hard_smp_processor_id(holder_cpu), yield_count);  }  #endif diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S index 11ce045e21f..43435c6892f 100644 --- a/arch/powerpc/lib/mem_64.S +++ b/arch/powerpc/lib/mem_64.S @@ -19,7 +19,7 @@ _GLOBAL(memset)  	rlwimi	r4,r4,16,0,15  	cmplw	cr1,r5,r0		/* do we get that far? */  	rldimi	r4,r4,32,0 -	PPC_MTOCRF	1,r0 +	PPC_MTOCRF(1,r0)  	mr	r6,r3  	blt	cr1,8f  	beq+	3f			/* if already 8-byte aligned */ @@ -49,7 +49,7 @@ _GLOBAL(memset)  	bdnz	4b  5:	srwi.	r0,r5,3  	clrlwi	r5,r5,29 -	PPC_MTOCRF	1,r0 +	PPC_MTOCRF(1,r0)  	beq	8f  	bf	29,6f  	std	r4,0(r6) @@ -65,7 +65,7 @@ _GLOBAL(memset)  	std	r4,0(r6)  	addi	r6,r6,8  8:	cmpwi	r5,0 -	PPC_MTOCRF	1,r5 +	PPC_MTOCRF(1,r5)  	beqlr+  	bf	29,9f  	stw	r4,0(r6) @@ -77,10 +77,10 @@ _GLOBAL(memset)  	stb	r4,0(r6)  	blr -_GLOBAL(memmove) +_GLOBAL_TOC(memmove)  	cmplw	0,r3,r4 -	bgt	.backwards_memcpy -	b	.memcpy +	bgt	backwards_memcpy +	b	memcpy  _GLOBAL(backwards_memcpy)  	rlwinm.	
r7,r5,32-3,3,31		/* r0 = r5 >> 3 */ diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S index e178922b2c2..32a06ec395d 100644 --- a/arch/powerpc/lib/memcpy_64.S +++ b/arch/powerpc/lib/memcpy_64.S @@ -10,9 +10,30 @@  #include <asm/ppc_asm.h>  	.align	7 -_GLOBAL(memcpy) -	std	r3,48(r1)	/* save destination pointer for return value */ -	PPC_MTOCRF	0x01,r5 +_GLOBAL_TOC(memcpy) +BEGIN_FTR_SECTION +#ifdef __LITTLE_ENDIAN__ +	cmpdi	cr7,r5,0 +#else +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */ +#endif +FTR_SECTION_ELSE +#ifndef SELFTEST +	b	memcpy_power7 +#endif +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +#ifdef __LITTLE_ENDIAN__ +	/* dumb little-endian memcpy that will get replaced at runtime */ +	addi r9,r3,-1 +	addi r4,r4,-1 +	beqlr cr7 +	mtctr r5 +1:	lbzu r10,1(r4) +	stbu r10,1(r9) +	bdnz 1b +	blr +#else +	PPC_MTOCRF(0x01,r5)  	cmpldi	cr1,r5,16  	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry  	andi.	r6,r6,7 @@ -67,7 +88,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)  2:	bf	cr7*4+3,3f  	lbz	r9,8(r4)  	stb	r9,0(r3) -3:	ld	r3,48(r1)	/* return dest pointer */ +3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */  	blr  .Lsrc_unaligned: @@ -150,11 +171,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)  2:	bf	cr7*4+3,3f  	rotldi	r9,r9,8  	stb	r9,0(r3) -3:	ld	r3,48(r1)	/* return dest pointer */ +3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */  	blr  .Ldst_unaligned: -	PPC_MTOCRF	0x01,r6		# put #bytes to 8B bdry into cr7 +	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7  	subf	r5,r6,r5  	li	r7,0  	cmpldi	cr1,r5,16 @@ -169,7 +190,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)  2:	bf	cr7*4+1,3f  	lwzx	r0,r7,r4  	stwx	r0,r7,r3 -3:	PPC_MTOCRF	0x01,r5 +3:	PPC_MTOCRF(0x01,r5)  	add	r4,r6,r4  	add	r3,r6,r3  	b	.Ldst_aligned @@ -195,5 +216,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)  3:	bf	cr7*4+3,4f  	lbz	r0,0(r4)  	stb	r0,0(r3) -4:	ld	r3,48(r1)	/* return dest pointer */ +4:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */  	blr +#endif diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S new file mode 100644 index 00000000000..2ff5c142f87 --- /dev/null +++ b/arch/powerpc/lib/memcpy_power7.S @@ -0,0 +1,656 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +_GLOBAL(memcpy_power7) + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC +#endif + +#ifdef CONFIG_ALTIVEC +	cmpldi	r5,16 +	cmpldi	cr1,r5,4096 + +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + +	blt	.Lshort_copy +	bgt	cr1,.Lvmx_copy +#else +	cmpldi	r5,16 + +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + +	blt	.Lshort_copy +#endif + +.Lnonvmx_copy: +	/* Get the source 8B aligned */ +	neg	r6,r4 +	mtocrf	0x01,r6 +	clrldi	r6,r6,(64-3) + +	bf	cr7*4+3,1f +	lbz	r0,0(r4) +	addi	r4,r4,1 +	stb	r0,0(r3) +	addi	r3,r3,1 + +1:	bf	cr7*4+2,2f +	lhz	r0,0(r4) +	addi	r4,r4,2 +	sth	r0,0(r3) +	addi	r3,r3,2 + +2:	bf	cr7*4+1,3f +	lwz	r0,0(r4) +	addi	r4,r4,4 +	stw	r0,0(r3) +	addi	r3,r3,4 + +3:	sub	r5,r5,r6 +	cmpldi	r5,128 +	blt	5f + +	mflr	r0 +	stdu	r1,-STACKFRAMESIZE(r1) +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1) +	std	r17,STK_REG(R17)(r1) +	std	r18,STK_REG(R18)(r1) +	std	r19,STK_REG(R19)(r1) +	std	r20,STK_REG(R20)(r1) +	std	r21,STK_REG(R21)(r1) +	std	r22,STK_REG(R22)(r1) +	std	r0,STACKFRAMESIZE+16(r1) + +	srdi	r6,r5,7 +	mtctr	r6 + +	/* Now do cacheline (128B) sized loads and stores. */ +	.align	5 +4: +	ld	r0,0(r4) +	ld	r6,8(r4) +	ld	r7,16(r4) +	ld	r8,24(r4) +	ld	r9,32(r4) +	ld	r10,40(r4) +	ld	r11,48(r4) +	ld	r12,56(r4) +	ld	r14,64(r4) +	ld	r15,72(r4) +	ld	r16,80(r4) +	ld	r17,88(r4) +	ld	r18,96(r4) +	ld	r19,104(r4) +	ld	r20,112(r4) +	ld	r21,120(r4) +	addi	r4,r4,128 +	std	r0,0(r3) +	std	r6,8(r3) +	std	r7,16(r3) +	std	r8,24(r3) +	std	r9,32(r3) +	std	r10,40(r3) +	std	r11,48(r3) +	std	r12,56(r3) +	std	r14,64(r3) +	std	r15,72(r3) +	std	r16,80(r3) +	std	r17,88(r3) +	std	r18,96(r3) +	std	r19,104(r3) +	std	r20,112(r3) +	std	r21,120(r3) +	addi	r3,r3,128 +	bdnz	4b + +	clrldi	r5,r5,(64-7) + +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) +	ld	r17,STK_REG(R17)(r1) +	ld	r18,STK_REG(R18)(r1) +	ld	r19,STK_REG(R19)(r1) +	ld	r20,STK_REG(R20)(r1) +	ld	r21,STK_REG(R21)(r1) +	ld	r22,STK_REG(R22)(r1) +	addi	r1,r1,STACKFRAMESIZE + +	/* Up to 127B to go */ +5:	srdi	r6,r5,4 +	mtocrf	0x01,r6 + +6:	bf	cr7*4+1,7f +	ld	r0,0(r4) +	ld	r6,8(r4) +	ld	r7,16(r4) +	ld	r8,24(r4) +	ld	r9,32(r4) +	ld	r10,40(r4) +	ld	r11,48(r4) +	ld	r12,56(r4) +	addi	r4,r4,64 +	std	r0,0(r3) +	std	r6,8(r3) +	std	r7,16(r3) +	std	r8,24(r3) +	std	r9,32(r3) +	std	r10,40(r3) +	std	r11,48(r3) +	std	r12,56(r3) +	addi	r3,r3,64 + +	/* Up to 63B to go */ +7:	bf	cr7*4+2,8f +	ld	r0,0(r4) +	ld	r6,8(r4) +	ld	r7,16(r4) +	ld	r8,24(r4) +	addi	r4,r4,32 +	std	r0,0(r3) +	std	r6,8(r3) +	std	r7,16(r3) +	std	r8,24(r3) +	addi	r3,r3,32 + +	/* Up to 31B to go */ +8:	bf	cr7*4+3,9f +	ld	r0,0(r4) +	ld	r6,8(r4) +	addi	r4,r4,16 +	std	r0,0(r3) +	std	r6,8(r3) +	addi	r3,r3,16 + +9:	clrldi	r5,r5,(64-4) + +	/* Up to 15B to go */ +.Lshort_copy: +	mtocrf	0x01,r5 +	bf	cr7*4+0,12f +	lwz	r0,0(r4)	/* Less chance of a reject with word ops */ +	lwz	r6,4(r4) +	addi	r4,r4,8 +	stw	r0,0(r3) +	stw	r6,4(r3) +	addi	r3,r3,8 + +12:	bf	cr7*4+1,13f +	lwz	r0,0(r4) +	addi	r4,r4,4 +	stw	r0,0(r3) +	addi	r3,r3,4 + +13:	bf	cr7*4+2,14f +	lhz	r0,0(r4) +	addi	r4,r4,2 +	sth	r0,0(r3) +	addi	r3,r3,2 + +14:	bf	cr7*4+3,15f +	lbz	r0,0(r4) +	stb	r0,0(r3) + +15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) +	blr + +.Lunwind_stack_nonvmx_copy: +	addi	r1,r1,STACKFRAMESIZE +	b	.Lnonvmx_copy + 
+#ifdef CONFIG_ALTIVEC +.Lvmx_copy: +	mflr	r0 +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1) +	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1) +	std	r0,16(r1) +	stdu	r1,-STACKFRAMESIZE(r1) +	bl	enter_vmx_copy +	cmpwi	cr1,r3,0 +	ld	r0,STACKFRAMESIZE+16(r1) +	ld	r3,STK_REG(R31)(r1) +	ld	r4,STK_REG(R30)(r1) +	ld	r5,STK_REG(R29)(r1) +	mtlr	r0 + +	/* +	 * We prefetch both the source and destination using enhanced touch +	 * instructions. We use a stream ID of 0 for the load side and +	 * 1 for the store side. +	 */ +	clrrdi	r6,r4,7 +	clrrdi	r9,r3,7 +	ori	r9,r9,1		/* stream=1 */ + +	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */ +	cmpldi	r7,0x3FF +	ble	1f +	li	r7,0x3FF +1:	lis	r0,0x0E00	/* depth=7 */ +	sldi	r7,r7,7 +	or	r7,r7,r0 +	ori	r10,r7,1	/* stream=1 */ + +	lis	r8,0x8000	/* GO=1 */ +	clrldi	r8,r8,32 + +.machine push +.machine "power4" +	dcbt	r0,r6,0b01000 +	dcbt	r0,r7,0b01010 +	dcbtst	r0,r9,0b01000 +	dcbtst	r0,r10,0b01010 +	eieio +	dcbt	r0,r8,0b01010	/* GO */ +.machine pop + +	beq	cr1,.Lunwind_stack_nonvmx_copy + +	/* +	 * If source and destination are not relatively aligned we use a +	 * slower permute loop. +	 */ +	xor	r6,r4,r3 +	rldicl.	r6,r6,0,(64-4) +	bne	.Lvmx_unaligned_copy + +	/* Get the destination 16B aligned */ +	neg	r6,r3 +	mtocrf	0x01,r6 +	clrldi	r6,r6,(64-4) + +	bf	cr7*4+3,1f +	lbz	r0,0(r4) +	addi	r4,r4,1 +	stb	r0,0(r3) +	addi	r3,r3,1 + +1:	bf	cr7*4+2,2f +	lhz	r0,0(r4) +	addi	r4,r4,2 +	sth	r0,0(r3) +	addi	r3,r3,2 + +2:	bf	cr7*4+1,3f +	lwz	r0,0(r4) +	addi	r4,r4,4 +	stw	r0,0(r3) +	addi	r3,r3,4 + +3:	bf	cr7*4+0,4f +	ld	r0,0(r4) +	addi	r4,r4,8 +	std	r0,0(r3) +	addi	r3,r3,8 + +4:	sub	r5,r5,r6 + +	/* Get the desination 128B aligned */ +	neg	r6,r3 +	srdi	r7,r6,4 +	mtocrf	0x01,r7 +	clrldi	r6,r6,(64-7) + +	li	r9,16 +	li	r10,32 +	li	r11,48 + +	bf	cr7*4+3,5f +	lvx	vr1,r0,r4 +	addi	r4,r4,16 +	stvx	vr1,r0,r3 +	addi	r3,r3,16 + +5:	bf	cr7*4+2,6f +	lvx	vr1,r0,r4 +	lvx	vr0,r4,r9 +	addi	r4,r4,32 +	stvx	vr1,r0,r3 +	stvx	vr0,r3,r9 +	addi	r3,r3,32 + +6:	bf	cr7*4+1,7f +	lvx	vr3,r0,r4 +	lvx	vr2,r4,r9 +	lvx	vr1,r4,r10 +	lvx	vr0,r4,r11 +	addi	r4,r4,64 +	stvx	vr3,r0,r3 +	stvx	vr2,r3,r9 +	stvx	vr1,r3,r10 +	stvx	vr0,r3,r11 +	addi	r3,r3,64 + +7:	sub	r5,r5,r6 +	srdi	r6,r5,7 + +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1) + +	li	r12,64 +	li	r14,80 +	li	r15,96 +	li	r16,112 + +	mtctr	r6 + +	/* +	 * Now do cacheline sized loads and stores. By this stage the +	 * cacheline stores are also cacheline aligned. 
+	 */ +	.align	5 +8: +	lvx	vr7,r0,r4 +	lvx	vr6,r4,r9 +	lvx	vr5,r4,r10 +	lvx	vr4,r4,r11 +	lvx	vr3,r4,r12 +	lvx	vr2,r4,r14 +	lvx	vr1,r4,r15 +	lvx	vr0,r4,r16 +	addi	r4,r4,128 +	stvx	vr7,r0,r3 +	stvx	vr6,r3,r9 +	stvx	vr5,r3,r10 +	stvx	vr4,r3,r11 +	stvx	vr3,r3,r12 +	stvx	vr2,r3,r14 +	stvx	vr1,r3,r15 +	stvx	vr0,r3,r16 +	addi	r3,r3,128 +	bdnz	8b + +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) + +	/* Up to 127B to go */ +	clrldi	r5,r5,(64-7) +	srdi	r6,r5,4 +	mtocrf	0x01,r6 + +	bf	cr7*4+1,9f +	lvx	vr3,r0,r4 +	lvx	vr2,r4,r9 +	lvx	vr1,r4,r10 +	lvx	vr0,r4,r11 +	addi	r4,r4,64 +	stvx	vr3,r0,r3 +	stvx	vr2,r3,r9 +	stvx	vr1,r3,r10 +	stvx	vr0,r3,r11 +	addi	r3,r3,64 + +9:	bf	cr7*4+2,10f +	lvx	vr1,r0,r4 +	lvx	vr0,r4,r9 +	addi	r4,r4,32 +	stvx	vr1,r0,r3 +	stvx	vr0,r3,r9 +	addi	r3,r3,32 + +10:	bf	cr7*4+3,11f +	lvx	vr1,r0,r4 +	addi	r4,r4,16 +	stvx	vr1,r0,r3 +	addi	r3,r3,16 + +	/* Up to 15B to go */ +11:	clrldi	r5,r5,(64-4) +	mtocrf	0x01,r5 +	bf	cr7*4+0,12f +	ld	r0,0(r4) +	addi	r4,r4,8 +	std	r0,0(r3) +	addi	r3,r3,8 + +12:	bf	cr7*4+1,13f +	lwz	r0,0(r4) +	addi	r4,r4,4 +	stw	r0,0(r3) +	addi	r3,r3,4 + +13:	bf	cr7*4+2,14f +	lhz	r0,0(r4) +	addi	r4,r4,2 +	sth	r0,0(r3) +	addi	r3,r3,2 + +14:	bf	cr7*4+3,15f +	lbz	r0,0(r4) +	stb	r0,0(r3) + +15:	addi	r1,r1,STACKFRAMESIZE +	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) +	b	exit_vmx_copy		/* tail call optimise */ + +.Lvmx_unaligned_copy: +	/* Get the destination 16B aligned */ +	neg	r6,r3 +	mtocrf	0x01,r6 +	clrldi	r6,r6,(64-4) + +	bf	cr7*4+3,1f +	lbz	r0,0(r4) +	addi	r4,r4,1 +	stb	r0,0(r3) +	addi	r3,r3,1 + +1:	bf	cr7*4+2,2f +	lhz	r0,0(r4) +	addi	r4,r4,2 +	sth	r0,0(r3) +	addi	r3,r3,2 + +2:	bf	cr7*4+1,3f +	lwz	r0,0(r4) +	addi	r4,r4,4 +	stw	r0,0(r3) +	addi	r3,r3,4 + +3:	bf	cr7*4+0,4f +	lwz	r0,0(r4)	/* Less chance of a reject with word ops */ +	lwz	r7,4(r4) +	addi	r4,r4,8 +	stw	r0,0(r3) +	stw	r7,4(r3) +	addi	r3,r3,8 + +4:	sub	r5,r5,r6 + +	/* Get the desination 128B aligned */ +	neg	r6,r3 +	srdi	r7,r6,4 +	mtocrf	0x01,r7 +	clrldi	r6,r6,(64-7) + +	li	r9,16 +	li	r10,32 +	li	r11,48 + +	LVS(vr16,0,r4)		/* Setup permute control vector */ +	lvx	vr0,0,r4 +	addi	r4,r4,16 + +	bf	cr7*4+3,5f +	lvx	vr1,r0,r4 +	VPERM(vr8,vr0,vr1,vr16) +	addi	r4,r4,16 +	stvx	vr8,r0,r3 +	addi	r3,r3,16 +	vor	vr0,vr1,vr1 + +5:	bf	cr7*4+2,6f +	lvx	vr1,r0,r4 +	VPERM(vr8,vr0,vr1,vr16) +	lvx	vr0,r4,r9 +	VPERM(vr9,vr1,vr0,vr16) +	addi	r4,r4,32 +	stvx	vr8,r0,r3 +	stvx	vr9,r3,r9 +	addi	r3,r3,32 + +6:	bf	cr7*4+1,7f +	lvx	vr3,r0,r4 +	VPERM(vr8,vr0,vr3,vr16) +	lvx	vr2,r4,r9 +	VPERM(vr9,vr3,vr2,vr16) +	lvx	vr1,r4,r10 +	VPERM(vr10,vr2,vr1,vr16) +	lvx	vr0,r4,r11 +	VPERM(vr11,vr1,vr0,vr16) +	addi	r4,r4,64 +	stvx	vr8,r0,r3 +	stvx	vr9,r3,r9 +	stvx	vr10,r3,r10 +	stvx	vr11,r3,r11 +	addi	r3,r3,64 + +7:	sub	r5,r5,r6 +	srdi	r6,r5,7 + +	std	r14,STK_REG(R14)(r1) +	std	r15,STK_REG(R15)(r1) +	std	r16,STK_REG(R16)(r1) + +	li	r12,64 +	li	r14,80 +	li	r15,96 +	li	r16,112 + +	mtctr	r6 + +	/* +	 * Now do cacheline sized loads and stores. By this stage the +	 * cacheline stores are also cacheline aligned. 
+	 */ +	.align	5 +8: +	lvx	vr7,r0,r4 +	VPERM(vr8,vr0,vr7,vr16) +	lvx	vr6,r4,r9 +	VPERM(vr9,vr7,vr6,vr16) +	lvx	vr5,r4,r10 +	VPERM(vr10,vr6,vr5,vr16) +	lvx	vr4,r4,r11 +	VPERM(vr11,vr5,vr4,vr16) +	lvx	vr3,r4,r12 +	VPERM(vr12,vr4,vr3,vr16) +	lvx	vr2,r4,r14 +	VPERM(vr13,vr3,vr2,vr16) +	lvx	vr1,r4,r15 +	VPERM(vr14,vr2,vr1,vr16) +	lvx	vr0,r4,r16 +	VPERM(vr15,vr1,vr0,vr16) +	addi	r4,r4,128 +	stvx	vr8,r0,r3 +	stvx	vr9,r3,r9 +	stvx	vr10,r3,r10 +	stvx	vr11,r3,r11 +	stvx	vr12,r3,r12 +	stvx	vr13,r3,r14 +	stvx	vr14,r3,r15 +	stvx	vr15,r3,r16 +	addi	r3,r3,128 +	bdnz	8b + +	ld	r14,STK_REG(R14)(r1) +	ld	r15,STK_REG(R15)(r1) +	ld	r16,STK_REG(R16)(r1) + +	/* Up to 127B to go */ +	clrldi	r5,r5,(64-7) +	srdi	r6,r5,4 +	mtocrf	0x01,r6 + +	bf	cr7*4+1,9f +	lvx	vr3,r0,r4 +	VPERM(vr8,vr0,vr3,vr16) +	lvx	vr2,r4,r9 +	VPERM(vr9,vr3,vr2,vr16) +	lvx	vr1,r4,r10 +	VPERM(vr10,vr2,vr1,vr16) +	lvx	vr0,r4,r11 +	VPERM(vr11,vr1,vr0,vr16) +	addi	r4,r4,64 +	stvx	vr8,r0,r3 +	stvx	vr9,r3,r9 +	stvx	vr10,r3,r10 +	stvx	vr11,r3,r11 +	addi	r3,r3,64 + +9:	bf	cr7*4+2,10f +	lvx	vr1,r0,r4 +	VPERM(vr8,vr0,vr1,vr16) +	lvx	vr0,r4,r9 +	VPERM(vr9,vr1,vr0,vr16) +	addi	r4,r4,32 +	stvx	vr8,r0,r3 +	stvx	vr9,r3,r9 +	addi	r3,r3,32 + +10:	bf	cr7*4+3,11f +	lvx	vr1,r0,r4 +	VPERM(vr8,vr0,vr1,vr16) +	addi	r4,r4,16 +	stvx	vr8,r0,r3 +	addi	r3,r3,16 + +	/* Up to 15B to go */ +11:	clrldi	r5,r5,(64-4) +	addi	r4,r4,-16	/* Unwind the +16 load offset */ +	mtocrf	0x01,r5 +	bf	cr7*4+0,12f +	lwz	r0,0(r4)	/* Less chance of a reject with word ops */ +	lwz	r6,4(r4) +	addi	r4,r4,8 +	stw	r0,0(r3) +	stw	r6,4(r3) +	addi	r3,r3,8 + +12:	bf	cr7*4+1,13f +	lwz	r0,0(r4) +	addi	r4,r4,4 +	stw	r0,0(r3) +	addi	r3,r3,4 + +13:	bf	cr7*4+2,14f +	lhz	r0,0(r4) +	addi	r4,r4,2 +	sth	r0,0(r3) +	addi	r3,r3,2 + +14:	bf	cr7*4+3,15f +	lbz	r0,0(r4) +	stb	r0,0(r3) + +15:	addi	r1,r1,STACKFRAMESIZE +	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1) +	b	exit_vmx_copy		/* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff --git a/arch/powerpc/lib/rheap.c b/arch/powerpc/lib/rheap.c index 45907c1dae6..a1060a868e6 100644 --- a/arch/powerpc/lib/rheap.c +++ b/arch/powerpc/lib/rheap.c @@ -15,7 +15,7 @@  #include <linux/types.h>  #include <linux/errno.h>  #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/mm.h>  #include <linux/err.h>  #include <linux/slab.h> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index ae5189ab004..5c09f365c84 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -11,6 +11,7 @@  #include <linux/kernel.h>  #include <linux/kprobes.h>  #include <linux/ptrace.h> +#include <linux/prefetch.h>  #include <asm/sstep.h>  #include <asm/processor.h>  #include <asm/uaccess.h> @@ -45,6 +46,18 @@ extern int do_stxvd2x(int rn, unsigned long ea);  #endif  /* + * Emulate the truncation of 64 bit values in 32-bit mode. + */ +static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val) +{ +#ifdef __powerpc64__ +	if ((msr & MSR_64BIT) == 0) +		val &= 0xffffffffUL; +#endif +	return val; +} + +/*   * Determine whether a conditional branch instruction would branch.   
*/  static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs) @@ -87,14 +100,13 @@ static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs  	ea = (signed short) instr;		/* sign-extend */  	if (ra) {  		ea += regs->gpr[ra]; -		if (instr & 0x04000000)		/* update forms */ -			regs->gpr[ra] = ea; +		if (instr & 0x04000000) {		/* update forms */ +			if ((instr>>26) != 47) 		/* stmw is not an update form */ +				regs->gpr[ra] = ea; +		}  	} -#ifdef __powerpc64__ -	if (!(regs->msr & MSR_SF)) -		ea &= 0xffffffffUL; -#endif -	return ea; + +	return truncate_if_32bit(regs->msr, ea);  }  #ifdef __powerpc64__ @@ -113,9 +125,8 @@ static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *reg  		if ((instr & 3) == 1)		/* update forms */  			regs->gpr[ra] = ea;  	} -	if (!(regs->msr & MSR_SF)) -		ea &= 0xffffffffUL; -	return ea; + +	return truncate_if_32bit(regs->msr, ea);  }  #endif /* __powerpc64 */ @@ -136,11 +147,8 @@ static unsigned long __kprobes xform_ea(unsigned int instr, struct pt_regs *regs  		if (do_update)		/* update forms */  			regs->gpr[ra] = ea;  	} -#ifdef __powerpc64__ -	if (!(regs->msr & MSR_SF)) -		ea &= 0xffffffffUL; -#endif -	return ea; + +	return truncate_if_32bit(regs->msr, ea);  }  /* @@ -204,11 +212,19 @@ static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,  {  	int err;  	unsigned long x, b, c; +#ifdef __LITTLE_ENDIAN__ +	int len = nb; /* save a copy of the length for byte reversal */ +#endif  	/* unaligned, do this in pieces */  	x = 0;  	for (; nb > 0; nb -= c) { +#ifdef __LITTLE_ENDIAN__ +		c = 1; +#endif +#ifdef __BIG_ENDIAN__  		c = max_align(ea); +#endif  		if (c > nb)  			c = max_align(nb);  		err = read_mem_aligned(&b, ea, c); @@ -217,7 +233,24 @@ static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,  		x = (x << (8 * c)) + b;  		ea += c;  	} +#ifdef __LITTLE_ENDIAN__ +	switch (len) { +	case 2: +		*dest = byterev_2(x); +		break; +	case 4: +		*dest = byterev_4(x); +		break; +#ifdef __powerpc64__ +	case 8: +		*dest = byterev_8(x); +		break; +#endif +	} +#endif +#ifdef __BIG_ENDIAN__  	*dest = x; +#endif  	return 0;  } @@ -265,15 +298,35 @@ static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,  	int err;  	unsigned long c; +#ifdef __LITTLE_ENDIAN__ +	switch (nb) { +	case 2: +		val = byterev_2(val); +		break; +	case 4: +		val = byterev_4(val); +		break; +#ifdef __powerpc64__ +	case 8: +		val = byterev_8(val); +		break; +#endif +	} +#endif  	/* unaligned or little-endian, do this in pieces */  	for (; nb > 0; nb -= c) { +#ifdef __LITTLE_ENDIAN__ +		c = 1; +#endif +#ifdef __BIG_ENDIAN__  		c = max_align(ea); +#endif  		if (c > nb)  			c = max_align(nb);  		err = write_mem_aligned(val >> (nb - c) * 8, ea, c);  		if (err)  			return err; -		++ea; +		ea += c;  	}  	return 0;  } @@ -302,22 +355,36 @@ static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),  				struct pt_regs *regs)  {  	int err; -	unsigned long val[sizeof(double) / sizeof(long)]; +	union { +		double dbl; +		unsigned long ul[2]; +		struct { +#ifdef __BIG_ENDIAN__ +			unsigned _pad_; +			unsigned word; +#endif +#ifdef __LITTLE_ENDIAN__ +			unsigned word; +			unsigned _pad_; +#endif +		} single; +	} data;  	unsigned long ptr;  	if (!address_ok(regs, ea, nb))  		return -EFAULT;  	if ((ea & 3) == 0)  		return (*func)(rn, ea); -	ptr = (unsigned long) &val[0]; +	ptr = (unsigned long) &data.ul;  	if (sizeof(unsigned long) == 8 || nb == 4) { -		err = 
read_mem_unaligned(&val[0], ea, nb, regs); -		ptr += sizeof(unsigned long) - nb; +		err = read_mem_unaligned(&data.ul[0], ea, nb, regs); +		if (nb == 4) +			ptr = (unsigned long)&(data.single.word);  	} else {  		/* reading a double on 32-bit */ -		err = read_mem_unaligned(&val[0], ea, 4, regs); +		err = read_mem_unaligned(&data.ul[0], ea, 4, regs);  		if (!err) -			err = read_mem_unaligned(&val[1], ea + 4, 4, regs); +			err = read_mem_unaligned(&data.ul[1], ea + 4, 4, regs);  	}  	if (err)  		return err; @@ -329,28 +396,42 @@ static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),  				 struct pt_regs *regs)  {  	int err; -	unsigned long val[sizeof(double) / sizeof(long)]; +	union { +		double dbl; +		unsigned long ul[2]; +		struct { +#ifdef __BIG_ENDIAN__ +			unsigned _pad_; +			unsigned word; +#endif +#ifdef __LITTLE_ENDIAN__ +			unsigned word; +			unsigned _pad_; +#endif +		} single; +	} data;  	unsigned long ptr;  	if (!address_ok(regs, ea, nb))  		return -EFAULT;  	if ((ea & 3) == 0)  		return (*func)(rn, ea); -	ptr = (unsigned long) &val[0]; +	ptr = (unsigned long) &data.ul[0];  	if (sizeof(unsigned long) == 8 || nb == 4) { -		ptr += sizeof(unsigned long) - nb; +		if (nb == 4) +			ptr = (unsigned long)&(data.single.word);  		err = (*func)(rn, ptr);  		if (err)  			return err; -		err = write_mem_unaligned(val[0], ea, nb, regs); +		err = write_mem_unaligned(data.ul[0], ea, nb, regs);  	} else {  		/* writing a double on 32-bit */  		err = (*func)(rn, ptr);  		if (err)  			return err; -		err = write_mem_unaligned(val[0], ea, 4, regs); +		err = write_mem_unaligned(data.ul[0], ea, 4, regs);  		if (!err) -			err = write_mem_unaligned(val[1], ea + 4, 4, regs); +			err = write_mem_unaligned(data.ul[1], ea + 4, 4, regs);  	}  	return err;  } @@ -466,7 +547,7 @@ static void __kprobes set_cr0(struct pt_regs *regs, int rd)  	regs->ccr = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000);  #ifdef __powerpc64__ -	if (!(regs->msr & MSR_SF)) +	if (!(regs->msr & MSR_64BIT))  		val = (int) val;  #endif  	if (val < 0) @@ -487,7 +568,7 @@ static void __kprobes add_with_carry(struct pt_regs *regs, int rd,  		++val;  	regs->gpr[rd] = val;  #ifdef __powerpc64__ -	if (!(regs->msr & MSR_SF)) { +	if (!(regs->msr & MSR_64BIT)) {  		val = (unsigned int) val;  		val1 = (unsigned int) val1;  	} @@ -560,7 +641,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)  	unsigned long int ea;  	unsigned int cr, mb, me, sh;  	int err; -	unsigned long old_ra; +	unsigned long old_ra, val3;  	long ival;  	opcode = instr >> 26; @@ -570,12 +651,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)  		if ((instr & 2) == 0)  			imm += regs->nip;  		regs->nip += 4; -		if ((regs->msr & MSR_SF) == 0) -			regs->nip &= 0xffffffffUL; +		regs->nip = truncate_if_32bit(regs->msr, regs->nip);  		if (instr & 1)  			regs->link = regs->nip;  		if (branch_taken(instr, regs)) -			regs->nip = imm; +			regs->nip = truncate_if_32bit(regs->msr, imm);  		return 1;  #ifdef CONFIG_PPC64  	case 17:	/* sc */ @@ -604,13 +684,9 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)  			imm -= 0x04000000;  		if ((instr & 2) == 0)  			imm += regs->nip; -		if (instr & 1) { -			regs->link = regs->nip + 4; -			if ((regs->msr & MSR_SF) == 0) -				regs->link &= 0xffffffffUL; -		} -		if ((regs->msr & MSR_SF) == 0) -			imm &= 0xffffffffUL; +		if (instr & 1) +			regs->link = truncate_if_32bit(regs->msr, regs->nip + 4); +		imm = truncate_if_32bit(regs->msr, imm);  		regs->nip = 
imm;
 		return 1;
 	case 19:
@@ -618,11 +694,8 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		case 16:	/* bclr */
 		case 528:	/* bcctr */
 			imm = (instr & 0x400)? regs->ctr: regs->link;
-			regs->nip += 4;
-			if ((regs->msr & MSR_SF) == 0) {
-				regs->nip &= 0xffffffffUL;
-				imm &= 0xffffffffUL;
-			}
+			regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
+			imm = truncate_if_32bit(regs->msr, imm);
 			if (instr & 1)
 				regs->link = regs->nip;
 			if (branch_taken(instr, regs))
@@ -1125,7 +1198,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 			sh = regs->gpr[rb] & 0x3f;
 			ival = (signed int) regs->gpr[rd];
 			regs->gpr[ra] = ival >> (sh < 32 ? sh : 31);
-			if (ival < 0 && (sh >= 32 || (ival & ((1 << sh) - 1)) != 0))
+			if (ival < 0 && (sh >= 32 || (ival & ((1ul << sh) - 1)) != 0))
 				regs->xer |= XER_CA;
 			else
 				regs->xer &= ~XER_CA;
@@ -1135,7 +1208,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 			sh = rb;
 			ival = (signed int) regs->gpr[rd];
 			regs->gpr[ra] = ival >> sh;
-			if (ival < 0 && (ival & ((1 << sh) - 1)) != 0)
+			if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0)
 				regs->xer |= XER_CA;
 			else
 				regs->xer &= ~XER_CA;
@@ -1143,7 +1216,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)

 #ifdef __powerpc64__
 		case 27:	/* sld */
-			sh = regs->gpr[rd] & 0x7f;
+			sh = regs->gpr[rb] & 0x7f;
 			if (sh < 64)
 				regs->gpr[ra] = regs->gpr[rd] << sh;
 			else
@@ -1162,7 +1235,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 			sh = regs->gpr[rb] & 0x7f;
 			ival = (signed long int) regs->gpr[rd];
 			regs->gpr[ra] = ival >> (sh < 64 ? sh : 63);
-			if (ival < 0 && (sh >= 64 || (ival & ((1 << sh) - 1)) != 0))
+			if (ival < 0 && (sh >= 64 || (ival & ((1ul << sh) - 1)) != 0))
 				regs->xer |= XER_CA;
 			else
 				regs->xer &= ~XER_CA;
@@ -1173,7 +1246,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 			sh = rb | ((instr & 2) << 4);
 			ival = (signed long int) regs->gpr[rd];
 			regs->gpr[ra] = ival >> sh;
-			if (ival < 0 && (ival & ((1 << sh) - 1)) != 0)
+			if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0)
 				regs->xer |= XER_CA;
 			else
 				regs->xer &= ~XER_CA;
@@ -1397,7 +1470,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 				regs->gpr[rd] = byterev_4(val);
 			goto ldst_done;

-#ifdef CONFIG_PPC_CPU
+#ifdef CONFIG_PPC_FPU
 		case 535:	/* lfsx */
 		case 567:	/* lfsux */
 			if (!(regs->msr & MSR_FP))
@@ -1488,11 +1561,44 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		goto ldst_done;

 	case 36:	/* stw */
-	case 37:	/* stwu */
 		val = regs->gpr[rd];
 		err = write_mem(val, dform_ea(instr, regs), 4, regs);
 		goto ldst_done;

+	case 37:	/* stwu */
+		val = regs->gpr[rd];
+		val3 = dform_ea(instr, regs);
+		/*
+		 * For PPC32 we always use stwu with r1 to change the stack
+		 * pointer, so this emulated store may corrupt the exception
+		 * frame. We therefore provide an exception frame trampoline,
+		 * pushed below the kprobed function's stack. Here we only
+		 * update gpr[1] and skip the real store; the store is done
+		 * safely in the exception return code by checking this flag.
+		 */
+		if ((ra == 1) && !(regs->msr & MSR_PR) \
+			&& (val3 >= (regs->gpr[1] - STACK_INT_FRAME_SIZE))) {
+#ifdef CONFIG_PPC32
+			/*
+			 * Check if we would overflow the kernel stack
+			 */
+			if (val3 - STACK_INT_FRAME_SIZE <= current->thread.ksp_limit) {
+				printk(KERN_CRIT "Can't kprobe this since it would overflow the kernel stack.\n");
+				err = -EINVAL;
+				break;
+			}
+#endif /* CONFIG_PPC32 */
+			/*
+			 * Check if the flag is already set, since that means
+			 * we'd lose the previous value.
+			 */
+			WARN_ON(test_thread_flag(TIF_EMULATE_STACK_STORE));
+			set_thread_flag(TIF_EMULATE_STACK_STORE);
+			err = 0;
+		} else
+			err = write_mem(val, val3, 4, regs);
+		goto ldst_done;
+
 	case 38:	/* stb */
 	case 39:	/* stbu */
 		val = regs->gpr[rd];
@@ -1616,11 +1722,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		return 0;	/* invoke DSI if -EFAULT? */
 	}

 instr_done:
-	regs->nip += 4;
-#ifdef __powerpc64__
-	if ((regs->msr & MSR_SF) == 0)
-		regs->nip &= 0xffffffffUL;
-#endif
+	regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
 	return 1;

 logical_done:
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S index 455881a5563..1b5a0a09d60 100644 --- a/arch/powerpc/lib/string.S +++ b/arch/powerpc/lib/string.S @@ -119,6 +119,7 @@ _GLOBAL(memchr)
 2:	li	r3,0
 	blr

+#ifdef CONFIG_PPC32
 _GLOBAL(__clear_user)
 	addi	r6,r3,-4
 	li	r3,0
@@ -160,48 +161,4 @@ _GLOBAL(__clear_user)
 	PPC_LONG	1b,91b
 	PPC_LONG	8b,92b
 	.text
-
-_GLOBAL(__strncpy_from_user)
-	addi	r6,r3,-1
-	addi	r4,r4,-1
-	cmpwi	0,r5,0
-	beq	2f
-	mtctr	r5
-1:	lbzu	r0,1(r4)
-	cmpwi	0,r0,0
-	stbu	r0,1(r6)
-	bdnzf	2,1b		/* dec ctr, branch if ctr != 0 && !cr0.eq */
-	beq	3f
-2:	addi	r6,r6,1
-3:	subf	r3,r3,r6
-	blr
-99:	li	r3,-EFAULT
-	blr
-
-	.section __ex_table,"a"
-	PPC_LONG	1b,99b
-	.text
-
-/* r3 = str, r4 = len (> 0), r5 = top (highest addr) */
-_GLOBAL(__strnlen_user)
-	addi	r7,r3,-1
-	subf	r6,r7,r5	/* top+1 - str */
-	cmplw	0,r4,r6
-	bge	0f
-	mr	r6,r4
-0:	mtctr	r6		/* ctr = min(len, top - str) */
-1:	lbzu	r0,1(r7)	/* get next byte */
-	cmpwi	0,r0,0
-	bdnzf	2,1b		/* loop if --ctr != 0 && byte != 0 */
-	addi	r7,r7,1
-	subf	r3,r3,r7	/* number of bytes we have looked at */
-	beqlr			/* return if we found a 0 byte */
-	cmpw	0,r3,r4		/* did we look at all len bytes? */
-	blt	99f		/* if not, must have hit top */
-	addi	r3,r4,1		/* return len + 1 to indicate no null found */
-	blr
-99:	li	r3,0		/* bad address, return 0 */
-	blr
-
-	.section __ex_table,"a"
-	PPC_LONG	1b,99b
+#endif
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S new file mode 100644 index 00000000000..7bd9549a90a --- /dev/null +++ b/arch/powerpc/lib/string_64.S @@ -0,0 +1,202 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +	.section	".toc","aw" +PPC64_CACHES: +	.tc		ppc64_caches[TC],ppc64_caches +	.section	".text" + +/** + * __clear_user: - Zero a block of memory in user space, with less checking. + * @to:   Destination address, in user space. + * @n:    Number of bytes to zero. + * + * Zero a block of memory in user space.  Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. + */ + +	.macro err1 +100: +	.section __ex_table,"a" +	.align 3 +	.llong 100b,.Ldo_err1 +	.previous +	.endm + +	.macro err2 +200: +	.section __ex_table,"a" +	.align 3 +	.llong 200b,.Ldo_err2 +	.previous +	.endm + +	.macro err3 +300: +	.section __ex_table,"a" +	.align 3 +	.llong 300b,.Ldo_err3 +	.previous +	.endm + +.Ldo_err1: +	mr	r3,r8 + +.Ldo_err2: +	mtctr	r4 +1: +err3;	stb	r0,0(r3) +	addi	r3,r3,1 +	addi	r4,r4,-1 +	bdnz	1b + +.Ldo_err3: +	mr	r3,r4 +	blr + +_GLOBAL_TOC(__clear_user) +	cmpdi	r4,32 +	neg	r6,r3 +	li	r0,0 +	blt	.Lshort_clear +	mr	r8,r3 +	mtocrf	0x01,r6 +	clrldi	r6,r6,(64-3) + +	/* Get the destination 8 byte aligned */ +	bf	cr7*4+3,1f +err1;	stb	r0,0(r3) +	addi	r3,r3,1 + +1:	bf	cr7*4+2,2f +err1;	sth	r0,0(r3) +	addi	r3,r3,2 + +2:	bf	cr7*4+1,3f +err1;	stw	r0,0(r3) +	addi	r3,r3,4 + +3:	sub	r4,r4,r6 + +	cmpdi	r4,32 +	cmpdi	cr1,r4,512 +	blt	.Lshort_clear +	bgt	cr1,.Llong_clear + +.Lmedium_clear: +	srdi	r6,r4,5 +	mtctr	r6 + +	/* Do 32 byte chunks */ +4: +err2;	std	r0,0(r3) +err2;	std	r0,8(r3) +err2;	std	r0,16(r3) +err2;	std	r0,24(r3) +	addi	r3,r3,32 +	addi	r4,r4,-32 +	bdnz	4b + +.Lshort_clear: +	/* up to 31 bytes to go */ +	cmpdi	r4,16 +	blt	6f +err2;	std	r0,0(r3) +err2;	std	r0,8(r3) +	addi	r3,r3,16 +	addi	r4,r4,-16 + +	/* Up to 15 bytes to go */ +6:	mr	r8,r3 +	clrldi	r4,r4,(64-4) +	mtocrf	0x01,r4 +	bf	cr7*4+0,7f +err1;	std	r0,0(r3) +	addi	r3,r3,8 + +7:	bf	cr7*4+1,8f +err1;	stw	r0,0(r3) +	addi	r3,r3,4 + +8:	bf	cr7*4+2,9f +err1;	sth	r0,0(r3) +	addi	r3,r3,2 + +9:	bf	cr7*4+3,10f +err1;	stb	r0,0(r3) + +10:	li	r3,0 +	blr + +.Llong_clear: +	ld	r5,PPC64_CACHES@toc(r2) + +	bf	cr7*4+0,11f +err2;	std	r0,0(r3) +	addi	r3,r3,8 +	addi	r4,r4,-8 + +	/* Destination is 16 byte aligned, need to get it cacheline aligned */ +11:	lwz	r7,DCACHEL1LOGLINESIZE(r5) +	lwz	r9,DCACHEL1LINESIZE(r5) + +	/* +	 * With worst case alignment the long clear loop takes a minimum +	 * of 1 byte less than 2 cachelines. +	 */ +	sldi	r10,r9,2 +	cmpd	r4,r10 +	blt	.Lmedium_clear + +	neg	r6,r3 +	addi	r10,r9,-1 +	and.	r5,r6,r10 +	beq	13f + +	srdi	r6,r5,4 +	mtctr	r6 +	mr	r8,r3 +12: +err1;	std	r0,0(r3) +err1;	std	r0,8(r3) +	addi	r3,r3,16 +	bdnz	12b + +	sub	r4,r4,r5 + +13:	srd	r6,r4,r7 +	mtctr	r6 +	mr	r8,r3 +14: +err1;	dcbz	r0,r3 +	add	r3,r3,r9 +	bdnz	14b + +	and	r4,r4,r10 + +	cmpdi	r4,32 +	blt	.Lshort_clear +	b	.Lmedium_clear diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c new file mode 100644 index 00000000000..3cf529ceec5 --- /dev/null +++ b/arch/powerpc/lib/vmx-helper.c @@ -0,0 +1,74 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Authors: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> + *          Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <asm/switch_to.h> + +int enter_vmx_usercopy(void) +{ +	if (in_interrupt()) +		return 0; + +	/* This acts as preempt_disable() as well and will make +	 * enable_kernel_altivec(). We need to disable page faults +	 * as they can call schedule and thus make us lose the VMX +	 * context. So on page faults, we just fail which will cause +	 * a fallback to the normal non-vmx copy. +	 */ +	pagefault_disable(); + +	enable_kernel_altivec(); + +	return 1; +} + +/* + * This function must return 0 because we tail call optimise when calling + * from __copy_tofrom_user_power7 which returns 0 on success. + */ +int exit_vmx_usercopy(void) +{ +	pagefault_enable(); +	return 0; +} + +int enter_vmx_copy(void) +{ +	if (in_interrupt()) +		return 0; + +	preempt_disable(); + +	enable_kernel_altivec(); + +	return 1; +} + +/* + * All calls to this function will be optimised into tail calls. We are + * passed a pointer to the destination which we return as required by a + * memcpy implementation. + */ +void *exit_vmx_copy(void *dest) +{ +	preempt_enable(); +	return dest; +} diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c new file mode 100644 index 00000000000..e905f7c2ea7 --- /dev/null +++ b/arch/powerpc/lib/xor_vmx.c @@ -0,0 +1,177 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <altivec.h> + +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <asm/switch_to.h> + +typedef vector signed char unative_t; + +#define DEFINE(V)				\ +	unative_t *V = (unative_t *)V##_in;	\ +	unative_t V##_0, V##_1, V##_2, V##_3 + +#define LOAD(V)			\ +	do {			\ +		V##_0 = V[0];	\ +		V##_1 = V[1];	\ +		V##_2 = V[2];	\ +		V##_3 = V[3];	\ +	} while (0) + +#define STORE(V)		\ +	do {			\ +		V[0] = V##_0;	\ +		V[1] = V##_1;	\ +		V[2] = V##_2;	\ +		V[3] = V##_3;	\ +	} while (0) + +#define XOR(V1, V2)					\ +	do {						\ +		V1##_0 = vec_xor(V1##_0, V2##_0);	\ +		V1##_1 = vec_xor(V1##_1, V2##_1);	\ +		V1##_2 = vec_xor(V1##_2, V2##_2);	\ +		V1##_3 = vec_xor(V1##_3, V2##_3);	\ +	} while (0) + +void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, +		   unsigned long *v2_in) +{ +	DEFINE(v1); +	DEFINE(v2); +	unsigned long lines = bytes / (sizeof(unative_t)) / 4; + +	preempt_disable(); +	enable_kernel_altivec(); + +	do { +		LOAD(v1); +		LOAD(v2); +		XOR(v1, v2); +		STORE(v1); + +		v1 += 4; +		v2 += 4; +	} while (--lines > 0); + +	preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_2); + +void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, +		   unsigned long *v2_in, unsigned long *v3_in) +{ +	DEFINE(v1); +	DEFINE(v2); +	DEFINE(v3); +	unsigned long lines = bytes / (sizeof(unative_t)) / 4; + +	preempt_disable(); +	enable_kernel_altivec(); + +	do { +		LOAD(v1); +		LOAD(v2); +		LOAD(v3); +		XOR(v1, v2); +		XOR(v1, v3); +		STORE(v1); + +		v1 += 4; +		v2 += 4; +		v3 += 4; +	} while (--lines > 0); + +	preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_3); + +void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, +		   unsigned long *v2_in, unsigned long *v3_in, +		   unsigned long *v4_in) +{ +	DEFINE(v1); +	DEFINE(v2); +	DEFINE(v3); +	DEFINE(v4); +	unsigned long lines = bytes / (sizeof(unative_t)) / 4; + +	preempt_disable(); +	enable_kernel_altivec(); + +	do { +		LOAD(v1); +		LOAD(v2); +		LOAD(v3); +		LOAD(v4); +		XOR(v1, v2); +		XOR(v3, v4); +		XOR(v1, v3); +		STORE(v1); + +		v1 += 4; +		v2 += 4; +		v3 += 4; +		v4 += 4; +	} while (--lines > 0); + +	preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_4); + +void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, +		   unsigned long *v2_in, unsigned long *v3_in, +		   unsigned long *v4_in, unsigned long *v5_in) +{ +	DEFINE(v1); +	DEFINE(v2); +	DEFINE(v3); +	DEFINE(v4); +	DEFINE(v5); +	unsigned long lines = bytes / (sizeof(unative_t)) / 4; + +	preempt_disable(); +	enable_kernel_altivec(); + +	do { +		LOAD(v1); +		LOAD(v2); +		LOAD(v3); +		LOAD(v4); +		LOAD(v5); +		XOR(v1, v2); +		XOR(v3, v4); +		XOR(v1, v5); +		XOR(v1, v3); +		STORE(v1); + +		v1 += 4; +		v2 += 4; +		v3 += 4; +		v4 += 4; +		v5 += 4; +	} while (--lines > 0); + +	preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_5);  | 
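The xor_altivec_*() routines added above each consume four vectors (64 bytes) per loop iteration and take care of the Altivec context themselves via preempt_disable()/enable_kernel_altivec(). The following is only a minimal caller sketch, not part of the patch: it assumes 16-byte-aligned buffers whose length is a non-zero multiple of 64 bytes, and the function name xor_vmx_demo and the direct extern declaration are illustrative (in-tree users would normally reach these routines through the architecture's xor template rather than calling them directly).

#include <linux/kernel.h>
#include <linux/string.h>

/* Illustrative declaration only; it mirrors the definition in xor_vmx.c. */
extern void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
			  unsigned long *v2_in);

/* Hypothetical caller: XOR two statically allocated buffers in place. */
static void xor_vmx_demo(void)
{
	/*
	 * The loop handles 4 vectors (64 bytes) per iteration, so the
	 * length must be a non-zero multiple of 64 and the buffers must
	 * be 16-byte aligned for the vector loads and stores.
	 */
	static unsigned long a[128] __aligned(16);
	static unsigned long b[128] __aligned(16);

	memset(a, 0xaa, sizeof(a));
	memset(b, 0x55, sizeof(b));

	xor_altivec_2(sizeof(a), a, b);	/* a[i] ^= b[i] over the whole block */
}

Note that xor_altivec_2() keeps preemption disabled for the whole block, so very large regions are better processed in chunks by the caller.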