Diffstat (limited to 'arch/powerpc/lib')
26 files changed, 5167 insertions, 372 deletions
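A large part of the diff below rewrites csum_partial in checksum_64.S to accumulate the checksum in a 64-bit register and fold it down only at the end (rldicl r4,r0,32,0; add; srdi 32), adding the two 32-bit halves so that the carry out of the low word wraps back into the result. A minimal C sketch of that final fold, purely illustrative and not part of the patch (the helper name is invented):

static inline unsigned int csum_fold64_sketch(unsigned long long sum)
{
	/* swap the 32-bit halves of the accumulator (rldicl r4,r0,32,0) */
	unsigned long long rot = (sum << 32) | (sum >> 32);

	/* adding sum and rot leaves lo + hi plus the carry out of the low
	 * word in the high word; keep that high word (add; srdi 32) */
	return (unsigned int)((sum + rot) >> 32);
}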
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 3040dac18a3..59fa2de9546 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -4,25 +4,32 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror -ifeq ($(CONFIG_PPC64),y) -EXTRA_CFLAGS += -mno-minimal-toc -endif +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = -pg CFLAGS_REMOVE_feature-fixups.o = -pg obj-y := string.o alloc.o \ - checksum_$(CONFIG_WORD_SIZE).o -obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o + crtsavres.o +obj-$(CONFIG_PPC32) += div64.o copy_32.o obj-$(CONFIG_HAS_IOMEM) += devres.o obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ - memcpy_64.o usercopy_64.o mem_64.o string.o -obj-$(CONFIG_XMON) += sstep.o -obj-$(CONFIG_KPROBES) += sstep.o + usercopy_64.o mem_64.o string.o \ + hweight_64.o \ + copyuser_power7.o string_64.o copypage_power7.o +ifeq ($(CONFIG_GENERIC_CSUM),) +obj-y += checksum_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC64) += checksum_wrappers_64.o +endif + +obj-$(CONFIG_PPC64) += memcpy_power7.o memcpy_64.o + +obj-$(CONFIG_PPC_EMULATE_SSTEP) += sstep.o ldstfp.o ifeq ($(CONFIG_PPC64),y) obj-$(CONFIG_SMP) += locks.o +obj-$(CONFIG_ALTIVEC) += vmx-helper.o endif obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o @@ -30,3 +37,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o obj-y += code-patching.o obj-y += feature-fixups.o obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o + +obj-$(CONFIG_ALTIVEC) += xor_vmx.o +CFLAGS_xor_vmx.o += -maltivec -mabi=altivec diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c index f53e09c7dac..da22c84a8fe 100644 --- a/arch/powerpc/lib/alloc.c +++ b/arch/powerpc/lib/alloc.c @@ -3,16 +3,8 @@ #include <linux/slab.h> #include <linux/bootmem.h> #include <linux/string.h> +#include <asm/setup.h> -#include <asm/system.h> - -void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask) -{ - if (mem_init_done) - return kmalloc(size, mask); - else - return alloc_bootmem(size); -} void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask) { diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S index ef96c6c58ef..57a07206505 100644 --- a/arch/powerpc/lib/checksum_64.S +++ b/arch/powerpc/lib/checksum_64.S @@ -69,161 +69,412 @@ _GLOBAL(csum_tcpudp_magic) * Computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit). * - * This code assumes at least halfword alignment, though the length - * can be any number of bytes. The sum is accumulated in r5. - * * csum_partial(r3=buff, r4=len, r5=sum) */ _GLOBAL(csum_partial) - subi r3,r3,8 /* we'll offset by 8 for the loads */ - srdi. r6,r4,3 /* divide by 8 for doubleword count */ - addic r5,r5,0 /* clear carry */ - beq 3f /* if we're doing < 8 bytes */ - andi. r0,r3,2 /* aligned on a word boundary already? */ - beq+ 1f - lhz r6,8(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r4,r4,2 - addc r5,r5,r6 - srdi. r6,r4,3 /* recompute number of doublewords */ - beq 3f /* any left? */ -1: mtctr r6 -2: ldu r6,8(r3) /* main sum loop */ - adde r5,r5,r6 - bdnz 2b - andi. r4,r4,7 /* compute bytes left to sum after doublewords */ -3: cmpwi 0,r4,4 /* is at least a full word left? */ - blt 4f - lwz r6,8(r3) /* sum this word */ + addic r0,r5,0 /* clear carry */ + + srdi. r6,r4,3 /* less than 8 bytes? */ + beq .Lcsum_tail_word + + /* + * If only halfword aligned, align to a double word. 
Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcsum_aligned + + li r7,4 + sub r6,r7,r6 + mtctr r6 + +1: + lhz r6,0(r3) /* align to doubleword */ + subi r4,r4,2 + addi r3,r3,2 + adde r0,r0,r6 + bdnz 1b + +.Lcsum_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r4,7 + beq .Lcsum_tail_doublewords /* len < 128 */ + + srdi r6,r4,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + ld r6,0(r3) + ld r9,8(r3) + + ld r10,16(r3) + ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + + adde r0,r0,r11 + + adde r0,r0,r12 + + adde r0,r0,r14 + + adde r0,r0,r15 + ld r6,0(r3) + ld r9,8(r3) + + adde r0,r0,r16 + ld r10,16(r3) + ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + adde r0,r0,r11 + adde r0,r0,r12 + adde r0,r0,r14 + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r4,r4,63 + +.Lcsum_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r4,3 + beq .Lcsum_tail_word + + mtctr r6 +3: + ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 + bdnz 3b + + andi. r4,r4,7 + +.Lcsum_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r4,2 + beq .Lcsum_tail_halfword + + lwz r6,0(r3) addi r3,r3,4 + adde r0,r0,r6 subi r4,r4,4 - adde r5,r5,r6 -4: cmpwi 0,r4,2 /* is at least a halfword left? */ - blt+ 5f - lhz r6,8(r3) /* sum this halfword */ - addi r3,r3,2 - subi r4,r4,2 - adde r5,r5,r6 -5: cmpwi 0,r4,1 /* is at least a byte left? */ - bne+ 6f - lbz r6,8(r3) /* sum this byte */ - slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */ - adde r5,r5,r6 -6: addze r5,r5 /* add in final carry */ - rldicl r4,r5,32,0 /* fold two 32-bit halves together */ - add r3,r4,r5 - srdi r3,r3,32 - blr + +.Lcsum_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r4,1 + beq .Lcsum_tail_byte + + lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 + subi r4,r4,2 + +.Lcsum_tail_byte: /* Up to 1 byte to go */ + andi. 
r6,r4,1 + beq .Lcsum_finish + + lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 + +.Lcsum_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + + + .macro srcnr +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Lsrc_error_nr + .previous + .endm + + .macro source +150: + .section __ex_table,"a" + .align 3 + .llong 150b,.Lsrc_error + .previous + .endm + + .macro dstnr +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldest_error_nr + .previous + .endm + + .macro dest +250: + .section __ex_table,"a" + .align 3 + .llong 250b,.Ldest_error + .previous + .endm /* * Computes the checksum of a memory block at src, length len, * and adds in "sum" (32-bit), while copying the block to dst. * If an access exception occurs on src or dst, it stores -EFAULT - * to *src_err or *dst_err respectively, and (for an error on - * src) zeroes the rest of dst. - * - * This code needs to be reworked to take advantage of 64 bit sum+copy. - * However, due to tokenring halfword alignment problems this will be very - * tricky. For now we'll leave it until we instrument it somehow. + * to *src_err or *dst_err respectively. The caller must take any action + * required in this case (zeroing memory, recalculating partial checksum etc). * * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) */ _GLOBAL(csum_partial_copy_generic) - addic r0,r6,0 - subi r3,r3,4 - subi r4,r4,4 - srwi. r6,r5,2 - beq 3f /* if we're doing < 4 bytes */ - andi. r9,r4,2 /* Align dst to longword boundary */ - beq+ 1f -81: lhz r6,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 + addic r0,r6,0 /* clear carry */ + + srdi. r6,r5,3 /* less than 8 bytes? */ + beq .Lcopy_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + * + * If the source and destination are relatively unaligned we only + * align the source. This keeps things simple. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcopy_aligned + + li r9,4 + sub r6,r9,r6 + mtctr r6 + +1: +srcnr; lhz r6,0(r3) /* align to doubleword */ subi r5,r5,2 -91: sth r6,4(r4) - addi r4,r4,2 - addc r0,r0,r6 - srwi. r6,r5,2 /* # words to do */ - beq 3f -1: mtctr r6 -82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ -92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ - adde r0,r0,r6 - bdnz 82b - andi. r5,r5,3 -3: cmpwi 0,r5,2 - blt+ 4f -83: lhz r6,4(r3) addi r3,r3,2 - subi r5,r5,2 -93: sth r6,4(r4) + adde r0,r0,r6 +dstnr; sth r6,0(r4) addi r4,r4,2 + bdnz 1b + +.Lcopy_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r5,7 + beq .Lcopy_tail_doublewords /* len < 128 */ + + srdi r6,r5,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + +source; ld r6,0(r3) +source; ld r9,8(r3) + +source; ld r10,16(r3) +source; ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. 
+ */ + .align 5 +2: adde r0,r0,r6 -4: cmpwi 0,r5,1 - bne+ 5f -84: lbz r6,4(r3) -94: stb r6,4(r4) - slwi r6,r6,8 /* Upper byte of word */ +source; ld r12,32(r3) +source; ld r14,40(r3) + + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 +source; ld r6,0(r3) +source; ld r9,8(r3) + + adde r0,r0,r16 +source; ld r10,16(r3) +source; ld r11,24(r3) + bdnz 2b + + adde r0,r0,r6 -5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */ - rldicl r4,r3,32,0 /* fold 64 bit value */ - add r3,r4,r3 - srdi r3,r3,32 - blr +source; ld r12,32(r3) +source; ld r14,40(r3) -/* These shouldn't go in the fixup section, since that would - cause the ex_table addresses to get out of order. */ + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r5,r5,63 + +.Lcopy_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r5,3 + beq .Lcopy_tail_word - .globl src_error_1 -src_error_1: - li r6,0 - subi r5,r5,2 -95: sth r6,4(r4) - addi r4,r4,2 - srwi. r6,r5,2 - beq 3f mtctr r6 - .globl src_error_2 -src_error_2: - li r6,0 -96: stwu r6,4(r4) - bdnz 96b -3: andi. r5,r5,3 - beq src_error - .globl src_error_3 -src_error_3: - li r6,0 - mtctr r5 - addi r4,r4,3 -97: stbu r6,1(r4) - bdnz 97b - .globl src_error -src_error: +3: +srcnr; ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 +dstnr; std r6,0(r4) + addi r4,r4,8 + bdnz 3b + + andi. r5,r5,7 + +.Lcopy_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r5,2 + beq .Lcopy_tail_halfword + +srcnr; lwz r6,0(r3) + addi r3,r3,4 + adde r0,r0,r6 +dstnr; stw r6,0(r4) + addi r4,r4,4 + subi r5,r5,4 + +.Lcopy_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r5,1 + beq .Lcopy_tail_byte + +srcnr; lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 +dstnr; sth r6,0(r4) + addi r4,r4,2 + subi r5,r5,2 + +.Lcopy_tail_byte: /* Up to 1 byte to go */ + andi. 
r6,r5,1 + beq .Lcopy_finish + +srcnr; lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 +dstnr; stb r6,0(r4) + +.Lcopy_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + +.Lsrc_error: + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE +.Lsrc_error_nr: cmpdi 0,r7,0 - beq 1f + beqlr li r6,-EFAULT stw r6,0(r7) -1: addze r3,r0 blr - .globl dst_error -dst_error: +.Ldest_error: + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE +.Ldest_error_nr: cmpdi 0,r8,0 - beq 1f + beqlr li r6,-EFAULT stw r6,0(r8) -1: addze r3,r0 blr - -.section __ex_table,"a" - .align 3 - .llong 81b,src_error_1 - .llong 91b,dst_error - .llong 82b,src_error_2 - .llong 92b,dst_error - .llong 83b,src_error_3 - .llong 93b,dst_error - .llong 84b,src_error_3 - .llong 94b,dst_error - .llong 95b,dst_error - .llong 96b,dst_error - .llong 97b,dst_error diff --git a/arch/powerpc/lib/checksum_wrappers_64.c b/arch/powerpc/lib/checksum_wrappers_64.c new file mode 100644 index 00000000000..08e3a3356c4 --- /dev/null +++ b/arch/powerpc/lib/checksum_wrappers_64.c @@ -0,0 +1,102 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/export.h> +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/checksum.h> +#include <asm/uaccess.h> + +__wsum csum_and_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) { + *err_ptr = -EFAULT; + csum = (__force unsigned int)sum; + goto out; + } + + csum = csum_partial_copy_generic((void __force *)src, dst, + len, sum, err_ptr, NULL); + + if (unlikely(*err_ptr)) { + int missing = __copy_from_user(dst, src, len); + + if (missing) { + memset(dst + len - missing, 0, missing); + *err_ptr = -EFAULT; + } else { + *err_ptr = 0; + } + + csum = csum_partial(dst, len, sum); + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_from_user); + +__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, + __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + goto out; + } + + csum = csum_partial_copy_generic(src, (void __force *)dst, + len, sum, NULL, err_ptr); + + if (unlikely(*err_ptr)) { + csum = csum_partial(src, len, sum); + + if (copy_to_user(dst, src, len)) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + } + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 7c975d43e3f..d5edbeb8eb8 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -13,17 +13,23 @@ #include <linux/mm.h> #include <asm/page.h> #include <asm/code-patching.h> +#include <asm/uaccess.h> -void patch_instruction(unsigned int *addr, unsigned int instr) +int patch_instruction(unsigned int *addr, unsigned int instr) { - *addr = instr; + int err; + + __put_user_size(instr, addr, 4, err); + if (err) + return err; asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr)); + return 0; } -void patch_branch(unsigned int *addr, unsigned long target, int flags) +int patch_branch(unsigned int *addr, unsigned long target, int flags) { - patch_instruction(addr, create_branch(addr, target, flags)); + return patch_instruction(addr, create_branch(addr, target, flags)); } unsigned int create_branch(const unsigned int *addr, @@ -153,6 +159,21 @@ unsigned int translate_branch(const unsigned int *dest, const unsigned int *src) return 0; } +#ifdef CONFIG_PPC_BOOK3E_64 +void __patch_exception(int exc, unsigned long addr) +{ + extern unsigned int interrupt_base_book3e; + unsigned int *ibase = &interrupt_base_book3e; + + /* Our exceptions vectors start with a NOP and -then- a branch + * to deal with single stepping from userspace which stops on + * the second instruction. 
Thus we need to patch the second + * instruction of the exception, not the first one + */ + + patch_branch(ibase + (exc / 4) + 1, addr, 0); +} +#endif #ifdef CONFIG_CODE_PATCHING_SELFTEST diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index 74a7f4130b4..55f19f9fd70 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -62,7 +62,7 @@ .text .stabs "arch/powerpc/lib/",N_SO,0,0,0f - .stabs "copy32.S",N_SO,0,0,0f + .stabs "copy_32.S",N_SO,0,0,0f 0: CACHELINE_BYTES = L1_CACHE_BYTES diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S index e68beac0a17..a3c4dc4defd 100644 --- a/arch/powerpc/lib/copypage_64.S +++ b/arch/powerpc/lib/copypage_64.S @@ -6,6 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include <asm/page.h> #include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> @@ -15,9 +16,13 @@ PPC64_CACHES: .tc ppc64_caches[TC],ppc64_caches .section ".text" - -_GLOBAL(copy_4K_page) - li r5,4096 /* 4K page size */ +_GLOBAL_TOC(copy_page) +BEGIN_FTR_SECTION + lis r5,PAGE_SIZE@h +FTR_SECTION_ELSE + b copypage_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) + ori r5,r5,PAGE_SIZE@l BEGIN_FTR_SECTION ld r10,PPC64_CACHES@toc(r2) lwz r11,DCACHEL1LOGLINESIZE(r10) /* log2 of cache line size */ @@ -43,62 +48,62 @@ END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ) ld r7,16(r4) ldu r8,24(r4) 1: std r5,8(r3) - ld r9,8(r4) std r6,16(r3) + ld r9,8(r4) ld r10,16(r4) std r7,24(r3) - ld r11,24(r4) std r8,32(r3) + ld r11,24(r4) ld r12,32(r4) std r9,40(r3) - ld r5,40(r4) std r10,48(r3) + ld r5,40(r4) ld r6,48(r4) std r11,56(r3) - ld r7,56(r4) std r12,64(r3) + ld r7,56(r4) ld r8,64(r4) std r5,72(r3) - ld r9,72(r4) std r6,80(r3) + ld r9,72(r4) ld r10,80(r4) std r7,88(r3) - ld r11,88(r4) std r8,96(r3) + ld r11,88(r4) ld r12,96(r4) std r9,104(r3) - ld r5,104(r4) std r10,112(r3) + ld r5,104(r4) ld r6,112(r4) std r11,120(r3) - ld r7,120(r4) stdu r12,128(r3) + ld r7,120(r4) ldu r8,128(r4) bdnz 1b std r5,8(r3) - ld r9,8(r4) std r6,16(r3) + ld r9,8(r4) ld r10,16(r4) std r7,24(r3) - ld r11,24(r4) std r8,32(r3) + ld r11,24(r4) ld r12,32(r4) std r9,40(r3) - ld r5,40(r4) std r10,48(r3) + ld r5,40(r4) ld r6,48(r4) std r11,56(r3) - ld r7,56(r4) std r12,64(r3) + ld r7,56(r4) ld r8,64(r4) std r5,72(r3) - ld r9,72(r4) std r6,80(r3) + ld r9,72(r4) ld r10,80(r4) std r7,88(r3) - ld r11,88(r4) std r8,96(r3) + ld r11,88(r4) ld r12,96(r4) std r9,104(r3) std r10,112(r3) diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S new file mode 100644 index 00000000000..d7dafb3777a --- /dev/null +++ b/arch/powerpc/lib/copypage_power7.S @@ -0,0 +1,168 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/page.h> +#include <asm/ppc_asm.h> + +_GLOBAL(copypage_power7) + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. Since source and destination are page + * aligned we don't need to clear the bottom 7 bits of either + * address. + */ + ori r9,r3,1 /* stream=1 => to */ + +#ifdef CONFIG_PPC_64K_PAGES + lis r7,0x0E01 /* depth=7 + * units/cachelines=512 */ +#else + lis r7,0x0E00 /* depth=7 */ + ori r7,r7,0x1000 /* units/cachelines=32 */ +#endif + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + /* setup read stream 0 */ + dcbt r0,r4,0b01000 /* addr from */ + dcbt r0,r7,0b01010 /* length and depth from */ + /* setup write stream 1 */ + dcbtst r0,r9,0b01000 /* addr to */ + dcbtst r0,r10,0b01010 /* length and depth to */ + eieio + dcbt r0,r8,0b01010 /* all streams GO */ +.machine pop + +#ifdef CONFIG_ALTIVEC + mflr r0 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_copy + cmpwi r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + mtlr r0 + + li r0,(PAGE_SIZE/128) + mtctr r0 + + beq .Lnonvmx_copy + + addi r1,r1,STACKFRAMESIZE + + li r6,16 + li r7,32 + li r8,48 + li r9,64 + li r10,80 + li r11,96 + li r12,112 + + .align 5 +1: lvx vr7,r0,r4 + lvx vr6,r4,r6 + lvx vr5,r4,r7 + lvx vr4,r4,r8 + lvx vr3,r4,r9 + lvx vr2,r4,r10 + lvx vr1,r4,r11 + lvx vr0,r4,r12 + addi r4,r4,128 + stvx vr7,r0,r3 + stvx vr6,r3,r6 + stvx vr5,r3,r7 + stvx vr4,r3,r8 + stvx vr3,r3,r9 + stvx vr2,r3,r10 + stvx vr1,r3,r11 + stvx vr0,r3,r12 + addi r3,r3,128 + bdnz 1b + + b exit_vmx_copy /* tail call optimise */ + +#else + li r0,(PAGE_SIZE/128) + mtctr r0 + + stdu r1,-STACKFRAMESIZE(r1) +#endif + +.Lnonvmx_copy: + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + +1: ld r0,0(r4) + ld r5,8(r4) + ld r6,16(r4) + ld r7,24(r4) + ld r8,32(r4) + ld r9,40(r4) + ld r10,48(r4) + ld r11,56(r4) + ld r12,64(r4) + ld r14,72(r4) + ld r15,80(r4) + ld r16,88(r4) + ld r17,96(r4) + ld r18,104(r4) + ld r19,112(r4) + ld r20,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r5,8(r3) + std r6,16(r3) + std r7,24(r3) + std r8,32(r3) + std r9,40(r3) + std r10,48(r3) + std r11,56(r3) + std r12,64(r3) + std r14,72(r3) + std r15,80(r3) + std r16,88(r3) + std r17,96(r3) + std r18,104(r3) + std r19,112(r3) + std r20,120(r3) + addi r3,r3,128 + bdnz 1b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + addi r1,r1,STACKFRAMESIZE + blr diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index 693b14a778f..0860ee46013 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -9,8 +9,22 @@ #include <asm/processor.h> #include <asm/ppc_asm.h> +#ifdef __BIG_ENDIAN__ +#define sLd sld /* Shift towards low-numbered address. */ +#define sHd srd /* Shift towards high-numbered address. */ +#else +#define sLd srd /* Shift towards low-numbered address. */ +#define sHd sld /* Shift towards high-numbered address. 
*/ +#endif + .align 7 -_GLOBAL(__copy_tofrom_user) +_GLOBAL_TOC(__copy_tofrom_user) +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + b __copy_tofrom_user_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +_GLOBAL(__copy_tofrom_user_base) /* first check for a whole page copy on a page boundary */ cmpldi cr1,r5,16 cmpdi cr6,r5,4096 @@ -24,7 +38,7 @@ _GLOBAL(__copy_tofrom_user) dcbt 0,r4 beq .Lcopy_page_4K andi. r6,r6,7 - PPC_MTOCRF 0x01,r5 + PPC_MTOCRF(0x01,r5) blt cr1,.Lshort_copy /* Below we want to nop out the bne if we're on a CPU that has the * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit @@ -44,37 +58,55 @@ BEGIN_FTR_SECTION andi. r0,r4,7 bne .Lsrc_unaligned END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) - srdi r7,r5,4 -20: ld r9,0(r4) - addi r4,r4,-8 - mtctr r7 - andi. r5,r5,7 - bf cr7*4+0,22f - addi r3,r3,8 - addi r4,r4,8 - mr r8,r9 - blt cr1,72f -21: ld r9,8(r4) -70: std r8,8(r3) -22: ldu r8,16(r4) -71: stdu r9,16(r3) + blt cr1,.Ldo_tail /* if < 16 bytes to copy */ + srdi r0,r5,5 + cmpdi cr1,r0,0 +20: ld r7,0(r4) +220: ld r6,8(r4) + addi r4,r4,16 + mtctr r0 + andi. r0,r5,0x10 + beq 22f + addi r3,r3,16 + addi r4,r4,-16 + mr r9,r7 + mr r8,r6 + beq cr1,72f +21: ld r7,16(r4) +221: ld r6,24(r4) + addi r4,r4,32 +70: std r9,0(r3) +270: std r8,8(r3) +22: ld r9,0(r4) +222: ld r8,8(r4) +71: std r7,16(r3) +271: std r6,24(r3) + addi r3,r3,32 bdnz 21b -72: std r8,8(r3) +72: std r9,0(r3) +272: std r8,8(r3) + andi. r5,r5,0xf beq+ 3f - addi r3,r3,16 + addi r4,r4,16 .Ldo_tail: - bf cr7*4+1,1f -23: lwz r9,8(r4) + addi r3,r3,16 + bf cr7*4+0,246f +244: ld r9,0(r4) + addi r4,r4,8 +245: std r9,0(r3) + addi r3,r3,8 +246: bf cr7*4+1,1f +23: lwz r9,0(r4) addi r4,r4,4 73: stw r9,0(r3) addi r3,r3,4 1: bf cr7*4+2,2f -44: lhz r9,8(r4) +44: lhz r9,0(r4) addi r4,r4,2 74: sth r9,0(r3) addi r3,r3,2 2: bf cr7*4+3,3f -45: lbz r9,8(r4) +45: lbz r9,0(r4) 75: stb r9,0(r3) 3: li r3,0 blr @@ -94,10 +126,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */ 25: ld r0,8(r4) - sld r6,r9,r10 + sLd r6,r9,r10 26: ldu r9,16(r4) - srd r7,r0,r11 - sld r8,r0,r10 + sHd r7,r0,r11 + sLd r8,r0,r10 or r7,r7,r6 blt cr6,79f 27: ld r0,8(r4) @@ -105,35 +137,35 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */ 29: ldu r9,8(r4) - sld r8,r0,r10 + sLd r8,r0,r10 addi r3,r3,-8 blt cr6,5f 30: ld r0,8(r4) - srd r12,r9,r11 - sld r6,r9,r10 + sHd r12,r9,r11 + sLd r6,r9,r10 31: ldu r9,16(r4) or r12,r8,r12 - srd r7,r0,r11 - sld r8,r0,r10 + sHd r7,r0,r11 + sLd r8,r0,r10 addi r3,r3,16 beq cr6,78f 1: or r7,r7,r6 32: ld r0,8(r4) 76: std r12,8(r3) -2: srd r12,r9,r11 - sld r6,r9,r10 +2: sHd r12,r9,r11 + sLd r6,r9,r10 33: ldu r9,16(r4) or r12,r8,r12 77: stdu r7,16(r3) - srd r7,r0,r11 - sld r8,r0,r10 + sHd r7,r0,r11 + sLd r8,r0,r10 bdnz 1b 78: std r12,8(r3) or r7,r7,r6 79: std r7,16(r3) -5: srd r12,r9,r11 +5: sHd r12,r9,r11 or r12,r8,r12 80: std r12,24(r3) bne 6f @@ -141,28 +173,43 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) blr 6: cmpwi cr1,r5,8 addi r3,r3,32 - sld r9,r9,r10 + sLd r9,r9,r10 ble cr1,7f 34: ld r0,8(r4) - srd r7,r0,r11 + sHd r7,r0,r11 or r9,r7,r9 7: bf cr7*4+1,1f +#ifdef __BIG_ENDIAN__ rotldi r9,r9,32 +#endif 94: stw r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,32 +#endif addi r3,r3,4 1: bf cr7*4+2,2f +#ifdef __BIG_ENDIAN__ rotldi r9,r9,16 +#endif 95: sth r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,16 +#endif addi r3,r3,2 2: bf cr7*4+3,3f +#ifdef __BIG_ENDIAN__ rotldi r9,r9,8 +#endif 96: stb r9,0(r3) +#ifdef 
__LITTLE_ENDIAN__ + rotrdi r9,r9,8 +#endif 3: li r3,0 blr .Ldst_unaligned: - PPC_MTOCRF 0x01,r6 /* put #bytes to 8B bdry into cr7 */ + PPC_MTOCRF(0x01,r6) /* put #bytes to 8B bdry into cr7 */ subf r5,r6,r5 li r7,0 cmpldi cr1,r5,16 @@ -177,7 +224,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 2: bf cr7*4+1,3f 37: lwzx r0,r7,r4 83: stwx r0,r7,r3 -3: PPC_MTOCRF 0x01,r5 +3: PPC_MTOCRF(0x01,r5) add r4,r6,r4 add r3,r6,r3 b .Ldst_aligned @@ -220,7 +267,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 131: addi r3,r3,8 120: +320: 122: +322: 124: 125: 126: @@ -229,9 +278,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 129: 133: addi r3,r3,8 -121: 132: addi r3,r3,8 +121: +321: +344: 134: 135: 138: @@ -303,18 +354,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 183: add r3,r3,r7 b 1f +371: 180: addi r3,r3,8 171: 177: addi r3,r3,8 -170: -172: +370: +372: 176: 178: addi r3,r3,4 185: addi r3,r3,4 +170: +172: +345: 173: 174: 175: @@ -341,11 +396,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) .section __ex_table,"a" .align 3 .llong 20b,120b + .llong 220b,320b .llong 21b,121b + .llong 221b,321b .llong 70b,170b + .llong 270b,370b .llong 22b,122b + .llong 222b,322b .llong 71b,171b + .llong 271b,371b .llong 72b,172b + .llong 272b,372b + .llong 244b,344b + .llong 245b,345b .llong 23b,123b .llong 73b,173b .llong 44b,144b diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S new file mode 100644 index 00000000000..c46c876ac96 --- /dev/null +++ b/arch/powerpc/lib/copyuser_power7.S @@ -0,0 +1,721 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2011 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB) lvsl VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB) lvsr VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC +#endif + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + +#ifdef CONFIG_ALTIVEC + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + + .macro err4 +400: + .section __ex_table,"a" + .align 3 + .llong 400b,.Ldo_err4 + .previous + .endm + + +.Ldo_err4: + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) +.Ldo_err3: + bl exit_vmx_usercopy + ld r0,STACKFRAMESIZE+16(r1) + mtlr r0 + b .Lexit +#endif /* CONFIG_ALTIVEC */ + +.Ldo_err2: + ld r22,STK_REG(R22)(r1) + ld r21,STK_REG(R21)(r1) + ld r20,STK_REG(R20)(r1) + ld r19,STK_REG(R19)(r1) + ld r18,STK_REG(R18)(r1) + ld r17,STK_REG(R17)(r1) + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) +.Lexit: + addi r1,r1,STACKFRAMESIZE +.Ldo_err1: + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + ld r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + ld r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + b __copy_tofrom_user_base + + +_GLOBAL(__copy_tofrom_user_power7) +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addi r4,r4,1 +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. 
*/ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r7,16(r4) +err2; ld r8,24(r4) +err2; ld r9,32(r4) +err2; ld r10,40(r4) +err2; ld r11,48(r4) +err2; ld r12,56(r4) +err2; ld r14,64(r4) +err2; ld r15,72(r4) +err2; ld r16,80(r4) +err2; ld r17,88(r4) +err2; ld r18,96(r4) +err2; ld r19,104(r4) +err2; ld r20,112(r4) +err2; ld r21,120(r4) + addi r4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r7,16(r3) +err2; std r8,24(r3) +err2; std r9,32(r3) +err2; std r10,40(r3) +err2; std r11,48(r3) +err2; std r12,56(r3) +err2; std r14,64(r3) +err2; std r15,72(r3) +err2; std r16,80(r3) +err2; std r17,88(r3) +err2; std r18,96(r3) +err2; std r19,104(r3) +err2; std r20,112(r3) +err2; std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) +err1; ld r9,32(r4) +err1; ld r10,40(r4) +err1; ld r11,48(r4) +err1; ld r12,56(r4) + addi r4,r4,64 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) +err1; std r9,32(r3) +err1; std r10,40(r3) +err1; std r11,48(r3) +err1; std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) + addi r4,r4,32 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f +err1; ld r0,0(r4) +err1; ld r6,8(r4) + addi r4,r4,16 +err1; std r0,0(r3) +err1; std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err1; lwz r6,4(r4) + addi r4,r4,8 +err1; stw r0,0(r3) +err1; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err1; lbz r0,0(r4) +err1; stb r0,0(r3) + +15: li r3,0 + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_usercopy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. 
+ */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + /* setup read stream 0 */ + dcbt r0,r6,0b01000 /* addr from */ + dcbt r0,r7,0b01010 /* length and depth from */ + /* setup write stream 1 */ + dcbtst r0,r9,0b01000 /* addr to */ + dcbtst r0,r10,0b01010 /* length and depth to */ + eieio + dcbt r0,r8,0b01010 /* all streams GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f +err3; lvx vr1,r0,r4 + addi r4,r4,16 +err3; stvx vr1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f +err3; lvx vr1,r0,r4 +err3; lvx vr0,r4,r9 + addi r4,r4,32 +err3; stvx vr1,r0,r3 +err3; stvx vr0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx vr3,r0,r4 +err3; lvx vr2,r4,r9 +err3; lvx vr1,r4,r10 +err3; lvx vr0,r4,r11 + addi r4,r4,64 +err3; stvx vr3,r0,r3 +err3; stvx vr2,r3,r9 +err3; stvx vr1,r3,r10 +err3; stvx vr0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. 
+ */ + .align 5 +8: +err4; lvx vr7,r0,r4 +err4; lvx vr6,r4,r9 +err4; lvx vr5,r4,r10 +err4; lvx vr4,r4,r11 +err4; lvx vr3,r4,r12 +err4; lvx vr2,r4,r14 +err4; lvx vr1,r4,r15 +err4; lvx vr0,r4,r16 + addi r4,r4,128 +err4; stvx vr7,r0,r3 +err4; stvx vr6,r3,r9 +err4; stvx vr5,r3,r10 +err4; stvx vr4,r3,r11 +err4; stvx vr3,r3,r12 +err4; stvx vr2,r3,r14 +err4; stvx vr1,r3,r15 +err4; stvx vr0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx vr3,r0,r4 +err3; lvx vr2,r4,r9 +err3; lvx vr1,r4,r10 +err3; lvx vr0,r4,r11 + addi r4,r4,64 +err3; stvx vr3,r0,r3 +err3; stvx vr2,r3,r9 +err3; stvx vr1,r3,r10 +err3; stvx vr0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx vr1,r0,r4 +err3; lvx vr0,r4,r9 + addi r4,r4,32 +err3; stvx vr1,r0,r3 +err3; stvx vr0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx vr1,r0,r4 + addi r4,r4,16 +err3; stvx vr1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b exit_vmx_usercopy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r7,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + LVS(vr16,0,r4) /* Setup permute control vector */ +err3; lvx vr0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f +err3; lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + addi r4,r4,16 +err3; stvx vr8,r0,r3 + addi r3,r3,16 + vor vr0,vr1,vr1 + +5: bf cr7*4+2,6f +err3; lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) +err3; lvx vr0,r4,r9 + VPERM(vr9,vr1,vr0,vr16) + addi r4,r4,32 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx vr3,r0,r4 + VPERM(vr8,vr0,vr3,vr16) +err3; lvx vr2,r4,r9 + VPERM(vr9,vr3,vr2,vr16) +err3; lvx vr1,r4,r10 + VPERM(vr10,vr2,vr1,vr16) +err3; lvx vr0,r4,r11 + VPERM(vr11,vr1,vr0,vr16) + addi r4,r4,64 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 +err3; stvx vr10,r3,r10 +err3; stvx vr11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. 
+ */ + .align 5 +8: +err4; lvx vr7,r0,r4 + VPERM(vr8,vr0,vr7,vr16) +err4; lvx vr6,r4,r9 + VPERM(vr9,vr7,vr6,vr16) +err4; lvx vr5,r4,r10 + VPERM(vr10,vr6,vr5,vr16) +err4; lvx vr4,r4,r11 + VPERM(vr11,vr5,vr4,vr16) +err4; lvx vr3,r4,r12 + VPERM(vr12,vr4,vr3,vr16) +err4; lvx vr2,r4,r14 + VPERM(vr13,vr3,vr2,vr16) +err4; lvx vr1,r4,r15 + VPERM(vr14,vr2,vr1,vr16) +err4; lvx vr0,r4,r16 + VPERM(vr15,vr1,vr0,vr16) + addi r4,r4,128 +err4; stvx vr8,r0,r3 +err4; stvx vr9,r3,r9 +err4; stvx vr10,r3,r10 +err4; stvx vr11,r3,r11 +err4; stvx vr12,r3,r12 +err4; stvx vr13,r3,r14 +err4; stvx vr14,r3,r15 +err4; stvx vr15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx vr3,r0,r4 + VPERM(vr8,vr0,vr3,vr16) +err3; lvx vr2,r4,r9 + VPERM(vr9,vr3,vr2,vr16) +err3; lvx vr1,r4,r10 + VPERM(vr10,vr2,vr1,vr16) +err3; lvx vr0,r4,r11 + VPERM(vr11,vr1,vr0,vr16) + addi r4,r4,64 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 +err3; stvx vr10,r3,r10 +err3; stvx vr11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) +err3; lvx vr0,r4,r9 + VPERM(vr9,vr1,vr0,vr16) + addi r4,r4,32 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + addi r4,r4,16 +err3; stvx vr8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r6,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b exit_vmx_usercopy /* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S index 70a9cd8a300..a5b30c71a8d 100644 --- a/arch/powerpc/lib/crtsavres.S +++ b/arch/powerpc/lib/crtsavres.S @@ -6,6 +6,7 @@ * Written By Michael Meissner * * Based on gcc/config/rs6000/crtsavres.asm from gcc + * 64 bit additions from reading the PPC elf64abi document. * * This file is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -40,10 +41,13 @@ #include <asm/ppc_asm.h> .file "crtsavres.S" - .section ".text" #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE +#ifndef CONFIG_PPC64 + + .section ".text" + /* Routines for saving integer registers, called by the compiler. */ /* Called with r11 pointing to the stack header word of the caller of the */ /* function, just beyond the end of the integer save area. */ @@ -226,4 +230,318 @@ _GLOBAL(_rest32gpr_31_x) mtlr 0 mr 1,11 blr + +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area. 
*/ + +_GLOBAL(_savevr_20) + li r11,-192 + stvx vr20,r11,r0 +_GLOBAL(_savevr_21) + li r11,-176 + stvx vr21,r11,r0 +_GLOBAL(_savevr_22) + li r11,-160 + stvx vr22,r11,r0 +_GLOBAL(_savevr_23) + li r11,-144 + stvx vr23,r11,r0 +_GLOBAL(_savevr_24) + li r11,-128 + stvx vr24,r11,r0 +_GLOBAL(_savevr_25) + li r11,-112 + stvx vr25,r11,r0 +_GLOBAL(_savevr_26) + li r11,-96 + stvx vr26,r11,r0 +_GLOBAL(_savevr_27) + li r11,-80 + stvx vr27,r11,r0 +_GLOBAL(_savevr_28) + li r11,-64 + stvx vr28,r11,r0 +_GLOBAL(_savevr_29) + li r11,-48 + stvx vr29,r11,r0 +_GLOBAL(_savevr_30) + li r11,-32 + stvx vr30,r11,r0 +_GLOBAL(_savevr_31) + li r11,-16 + stvx vr31,r11,r0 + blr + +_GLOBAL(_restvr_20) + li r11,-192 + lvx vr20,r11,r0 +_GLOBAL(_restvr_21) + li r11,-176 + lvx vr21,r11,r0 +_GLOBAL(_restvr_22) + li r11,-160 + lvx vr22,r11,r0 +_GLOBAL(_restvr_23) + li r11,-144 + lvx vr23,r11,r0 +_GLOBAL(_restvr_24) + li r11,-128 + lvx vr24,r11,r0 +_GLOBAL(_restvr_25) + li r11,-112 + lvx vr25,r11,r0 +_GLOBAL(_restvr_26) + li r11,-96 + lvx vr26,r11,r0 +_GLOBAL(_restvr_27) + li r11,-80 + lvx vr27,r11,r0 +_GLOBAL(_restvr_28) + li r11,-64 + lvx vr28,r11,r0 +_GLOBAL(_restvr_29) + li r11,-48 + lvx vr29,r11,r0 +_GLOBAL(_restvr_30) + li r11,-32 + lvx vr30,r11,r0 +_GLOBAL(_restvr_31) + li r11,-16 + lvx vr31,r11,r0 + blr + +#endif /* CONFIG_ALTIVEC */ + +#else /* CONFIG_PPC64 */ + + .section ".text.save.restore","ax",@progbits + +.globl _savegpr0_14 +_savegpr0_14: + std r14,-144(r1) +.globl _savegpr0_15 +_savegpr0_15: + std r15,-136(r1) +.globl _savegpr0_16 +_savegpr0_16: + std r16,-128(r1) +.globl _savegpr0_17 +_savegpr0_17: + std r17,-120(r1) +.globl _savegpr0_18 +_savegpr0_18: + std r18,-112(r1) +.globl _savegpr0_19 +_savegpr0_19: + std r19,-104(r1) +.globl _savegpr0_20 +_savegpr0_20: + std r20,-96(r1) +.globl _savegpr0_21 +_savegpr0_21: + std r21,-88(r1) +.globl _savegpr0_22 +_savegpr0_22: + std r22,-80(r1) +.globl _savegpr0_23 +_savegpr0_23: + std r23,-72(r1) +.globl _savegpr0_24 +_savegpr0_24: + std r24,-64(r1) +.globl _savegpr0_25 +_savegpr0_25: + std r25,-56(r1) +.globl _savegpr0_26 +_savegpr0_26: + std r26,-48(r1) +.globl _savegpr0_27 +_savegpr0_27: + std r27,-40(r1) +.globl _savegpr0_28 +_savegpr0_28: + std r28,-32(r1) +.globl _savegpr0_29 +_savegpr0_29: + std r29,-24(r1) +.globl _savegpr0_30 +_savegpr0_30: + std r30,-16(r1) +.globl _savegpr0_31 +_savegpr0_31: + std r31,-8(r1) + std r0,16(r1) + blr + +.globl _restgpr0_14 +_restgpr0_14: + ld r14,-144(r1) +.globl _restgpr0_15 +_restgpr0_15: + ld r15,-136(r1) +.globl _restgpr0_16 +_restgpr0_16: + ld r16,-128(r1) +.globl _restgpr0_17 +_restgpr0_17: + ld r17,-120(r1) +.globl _restgpr0_18 +_restgpr0_18: + ld r18,-112(r1) +.globl _restgpr0_19 +_restgpr0_19: + ld r19,-104(r1) +.globl _restgpr0_20 +_restgpr0_20: + ld r20,-96(r1) +.globl _restgpr0_21 +_restgpr0_21: + ld r21,-88(r1) +.globl _restgpr0_22 +_restgpr0_22: + ld r22,-80(r1) +.globl _restgpr0_23 +_restgpr0_23: + ld r23,-72(r1) +.globl _restgpr0_24 +_restgpr0_24: + ld r24,-64(r1) +.globl _restgpr0_25 +_restgpr0_25: + ld r25,-56(r1) +.globl _restgpr0_26 +_restgpr0_26: + ld r26,-48(r1) +.globl _restgpr0_27 +_restgpr0_27: + ld r27,-40(r1) +.globl _restgpr0_28 +_restgpr0_28: + ld r28,-32(r1) +.globl _restgpr0_29 +_restgpr0_29: + ld r0,16(r1) + ld r29,-24(r1) + mtlr r0 + ld r30,-16(r1) + ld r31,-8(r1) + blr + +.globl _restgpr0_30 +_restgpr0_30: + ld r30,-16(r1) +.globl _restgpr0_31 +_restgpr0_31: + ld r0,16(r1) + ld r31,-8(r1) + mtlr r0 + blr + +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save 
area. */ + +.globl _savevr_20 +_savevr_20: + li r12,-192 + stvx vr20,r12,r0 +.globl _savevr_21 +_savevr_21: + li r12,-176 + stvx vr21,r12,r0 +.globl _savevr_22 +_savevr_22: + li r12,-160 + stvx vr22,r12,r0 +.globl _savevr_23 +_savevr_23: + li r12,-144 + stvx vr23,r12,r0 +.globl _savevr_24 +_savevr_24: + li r12,-128 + stvx vr24,r12,r0 +.globl _savevr_25 +_savevr_25: + li r12,-112 + stvx vr25,r12,r0 +.globl _savevr_26 +_savevr_26: + li r12,-96 + stvx vr26,r12,r0 +.globl _savevr_27 +_savevr_27: + li r12,-80 + stvx vr27,r12,r0 +.globl _savevr_28 +_savevr_28: + li r12,-64 + stvx vr28,r12,r0 +.globl _savevr_29 +_savevr_29: + li r12,-48 + stvx vr29,r12,r0 +.globl _savevr_30 +_savevr_30: + li r12,-32 + stvx vr30,r12,r0 +.globl _savevr_31 +_savevr_31: + li r12,-16 + stvx vr31,r12,r0 + blr + +.globl _restvr_20 +_restvr_20: + li r12,-192 + lvx vr20,r12,r0 +.globl _restvr_21 +_restvr_21: + li r12,-176 + lvx vr21,r12,r0 +.globl _restvr_22 +_restvr_22: + li r12,-160 + lvx vr22,r12,r0 +.globl _restvr_23 +_restvr_23: + li r12,-144 + lvx vr23,r12,r0 +.globl _restvr_24 +_restvr_24: + li r12,-128 + lvx vr24,r12,r0 +.globl _restvr_25 +_restvr_25: + li r12,-112 + lvx vr25,r12,r0 +.globl _restvr_26 +_restvr_26: + li r12,-96 + lvx vr26,r12,r0 +.globl _restvr_27 +_restvr_27: + li r12,-80 + lvx vr27,r12,r0 +.globl _restvr_28 +_restvr_28: + li r12,-64 + lvx vr28,r12,r0 +.globl _restvr_29 +_restvr_29: + li r12,-48 + lvx vr29,r12,r0 +.globl _restvr_30 +_restvr_30: + li r12,-32 + lvx vr30,r12,r0 +.globl _restvr_31 +_restvr_31: + li r12,-16 + lvx vr31,r12,r0 + blr + +#endif /* CONFIG_ALTIVEC */ + +#endif /* CONFIG_PPC64 */ + #endif diff --git a/arch/powerpc/lib/devres.c b/arch/powerpc/lib/devres.c index 292115d98ea..8df55fc3aad 100644 --- a/arch/powerpc/lib/devres.c +++ b/arch/powerpc/lib/devres.c @@ -8,11 +8,12 @@ */ #include <linux/device.h> /* devres_*(), devm_ioremap_release() */ -#include <linux/io.h> /* ioremap_flags() */ -#include <linux/module.h> /* EXPORT_SYMBOL() */ +#include <linux/gfp.h> +#include <linux/io.h> /* ioremap_prot() */ +#include <linux/export.h> /* EXPORT_SYMBOL() */ /** - * devm_ioremap_prot - Managed ioremap_flags() + * devm_ioremap_prot - Managed ioremap_prot() * @dev: Generic device to remap IO address for * @offset: BUS offset to map * @size: Size of map @@ -30,7 +31,7 @@ void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset, if (!ptr) return NULL; - addr = ioremap_flags(offset, size, flags); + addr = ioremap_prot(offset, size, flags); if (addr) { *ptr = addr; devres_add(dev, ptr); diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S index cb737484c5a..f4613118132 100644 --- a/arch/powerpc/lib/feature-fixups-test.S +++ b/arch/powerpc/lib/feature-fixups-test.S @@ -172,6 +172,25 @@ globl(ftr_fixup_test6_expected) 3: or 3,3,3 +#if 0 +/* Test that if we have a larger else case the assembler spots it and + * reports an error. #if 0'ed so as not to break the build normally. + */ +ftr_fixup_test7: + or 1,1,1 +BEGIN_FTR_SECTION + or 2,2,2 + or 2,2,2 + or 2,2,2 +FTR_SECTION_ELSE + or 3,3,3 + or 3,3,3 + or 3,3,3 + or 3,3,3 +ALT_FTR_SECTION_END(0, 1) + or 1,1,1 +#endif + #define MAKE_MACRO_TEST(TYPE) \ globl(ftr_fixup_test_ ##TYPE##_macros) \ or 1,1,1; \ diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 7e8865bcd68..7a8a7487cee 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -12,11 +12,14 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> #include <asm/cputable.h> #include <asm/code-patching.h> +#include <asm/page.h> +#include <asm/sections.h> struct fixup_entry { @@ -112,7 +115,8 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { - unsigned int *start, *end, *dest; + long *start, *end; + unsigned int *dest; if (!(value & CPU_FTR_LWSYNC)) return ; @@ -126,6 +130,27 @@ void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) } } +void do_final_fixups(void) +{ +#if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE) + int *src, *dest; + unsigned long length; + + if (PHYSICAL_START == 0) + return; + + src = (int *)(KERNELBASE + PHYSICAL_START); + dest = (int *)KERNELBASE; + length = (__end_interrupts - _stext) / sizeof(int); + + while (length--) { + patch_instruction(dest, *src); + src++; + dest++; + } +#endif +} + #ifdef CONFIG_FTR_FIXUP_SELFTEST #define check(x) \ @@ -287,8 +312,8 @@ static void test_alternative_case_with_external_branch(void) static void test_cpu_macros(void) { - extern void ftr_fixup_test_FTR_macros; - extern void ftr_fixup_test_FTR_macros_expected; + extern u8 ftr_fixup_test_FTR_macros; + extern u8 ftr_fixup_test_FTR_macros_expected; unsigned long size = &ftr_fixup_test_FTR_macros_expected - &ftr_fixup_test_FTR_macros; @@ -300,8 +325,8 @@ static void test_cpu_macros(void) static void test_fw_macros(void) { #ifdef CONFIG_PPC64 - extern void ftr_fixup_test_FW_FTR_macros; - extern void ftr_fixup_test_FW_FTR_macros_expected; + extern u8 ftr_fixup_test_FW_FTR_macros; + extern u8 ftr_fixup_test_FW_FTR_macros_expected; unsigned long size = &ftr_fixup_test_FW_FTR_macros_expected - &ftr_fixup_test_FW_FTR_macros; @@ -313,10 +338,10 @@ static void test_fw_macros(void) static void test_lwsync_macros(void) { - extern void lwsync_fixup_test; - extern void end_lwsync_fixup_test; - extern void lwsync_fixup_test_expected_LWSYNC; - extern void lwsync_fixup_test_expected_SYNC; + extern u8 lwsync_fixup_test; + extern u8 end_lwsync_fixup_test; + extern u8 lwsync_fixup_test_expected_LWSYNC; + extern u8 lwsync_fixup_test_expected_SYNC; unsigned long size = &end_lwsync_fixup_test - &lwsync_fixup_test; diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S new file mode 100644 index 00000000000..19e66001a4f --- /dev/null +++ b/arch/powerpc/lib/hweight_64.S @@ -0,0 +1,110 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> + +/* Note: This code relies on -mminimal-toc */ + +_GLOBAL(__arch_hweight8) +BEGIN_FTR_SECTION + b __sw_hweight8 + nop + nop +FTR_SECTION_ELSE + PPC_POPCNTB(R3,R3) + clrldi r3,r3,64-8 + blr +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight16) +BEGIN_FTR_SECTION + b __sw_hweight16 + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(50) + PPC_POPCNTB(R3,R3) + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(50) + clrlwi r3,r3,16 + PPC_POPCNTW(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight32) +BEGIN_FTR_SECTION + b __sw_hweight32 + nop + nop + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(51) + PPC_POPCNTB(R3,R3) + srdi r4,r3,16 + add r3,r4,r3 + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(51) + PPC_POPCNTW(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight64) +BEGIN_FTR_SECTION + b __sw_hweight64 + nop + nop + nop + nop + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(52) + PPC_POPCNTB(R3,R3) + srdi r4,r3,32 + add r3,r4,r3 + srdi r4,r3,16 + add r3,r4,r3 + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(52) + PPC_POPCNTD(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S new file mode 100644 index 00000000000..85aec08ab23 --- /dev/null +++ b/arch/powerpc/lib/ldstfp.S @@ -0,0 +1,379 @@ +/* + * Floating-point, VMX/Altivec and VSX loads and stores + * for use in instruction emulation. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> +#include <asm/reg.h> +#include <asm/asm-offsets.h> +#include <linux/errno.h> + +#ifdef CONFIG_PPC_FPU + +#define STKFRM (PPC_MIN_STKFRM + 16) + + .macro extab instr,handler + .section __ex_table,"a" + PPC_LONG \instr,\handler + .previous + .endm + + .macro inst32 op +reg = 0 + .rept 32 +20: \op reg,0,r4 + b 3f + extab 20b,99f +reg = reg + 1 + .endr + .endm + +/* Get the contents of frN into fr0; N is in r3. */ +_GLOBAL(get_fpr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* fr0 is already in fr0 */ + nop +reg = 1 + .rept 31 + fmr fr0,reg + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of fr0 into frN; N is in r3. */ +_GLOBAL(put_fpr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* fr0 is already in fr0 */ + nop +reg = 1 + .rept 31 + fmr reg,fr0 + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load FP reg N from float at *p. N is in r3, p in r4. 
*/ +_GLOBAL(do_lfs) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) +1: li r9,-EFAULT +2: lfs fr0,0(r4) + li r9,0 +3: bl put_fpr + beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Load FP reg N from double at *p. N is in r3, p in r4. */ +_GLOBAL(do_lfd) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) +1: li r9,-EFAULT +2: lfd fr0,0(r4) + li r9,0 +3: beq cr7,4f + bl put_fpr + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store FP reg N to float at *p. N is in r3, p in r4. */ +_GLOBAL(do_stfs) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) + bl get_fpr +1: li r9,-EFAULT +2: stfs fr0,0(r4) + li r9,0 +3: beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store FP reg N to double at *p. N is in r3, p in r4. */ +_GLOBAL(do_stfd) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) + bl get_fpr +1: li r9,-EFAULT +2: stfd fr0,0(r4) + li r9,0 +3: beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +#ifdef CONFIG_ALTIVEC +/* Get the contents of vrN into vr0; N is in r3. */ +_GLOBAL(get_vr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* vr0 is already in vr0 */ + nop +reg = 1 + .rept 31 + vor vr0,reg,reg /* assembler doesn't know vmr? */ + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of vr0 into vrN; N is in r3. */ +_GLOBAL(put_vr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* vr0 is already in vr0 */ + nop +reg = 1 + .rept 31 + vor reg,vr0,vr0 + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load vector reg N from *p. N is in r3, p in r4. */ +_GLOBAL(do_lvx) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VEC@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + stvx vr0,r1,r8 +1: li r9,-EFAULT +2: lvx vr0,0,r4 + li r9,0 +3: beq cr7,4f + bl put_vr + lvx vr0,r1,r8 +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store vector reg N to *p. N is in r3, p in r4. */ +_GLOBAL(do_stvx) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VEC@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + stvx vr0,r1,r8 + bl get_vr +1: li r9,-EFAULT +2: stvx vr0,0,r4 + li r9,0 +3: beq cr7,4f + lvx vr0,r1,r8 +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +/* Get the contents of vsrN into vsr0; N is in r3. 
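Uses a computed branch into a table of xxlor/blr pairs, since the register number is only known at run time.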
*/ +_GLOBAL(get_vsr) + mflr r0 + rlwinm r3,r3,3,0x1f8 + bcl 20,31,1f + blr /* vsr0 is already in vsr0 */ + nop +reg = 1 + .rept 63 + XXLOR(0,reg,reg) + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of vsr0 into vsrN; N is in r3. */ +_GLOBAL(put_vsr) + mflr r0 + rlwinm r3,r3,3,0x1f8 + bcl 20,31,1f + blr /* vr0 is already in vr0 */ + nop +reg = 1 + .rept 63 + XXLOR(reg,0,0) + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load VSX reg N from vector doubleword *p. N is in r3, p in r4. */ +_GLOBAL(do_lxvd2x) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VSX@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + STXVD2X(0,R1,R8) +1: li r9,-EFAULT +2: LXVD2X(0,R0,R4) + li r9,0 +3: beq cr7,4f + bl put_vsr + LXVD2X(0,R1,R8) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store VSX reg N to vector doubleword *p. N is in r3, p in r4. */ +_GLOBAL(do_stxvd2x) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VSX@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + STXVD2X(0,R1,R8) + bl get_vsr +1: li r9,-EFAULT +2: STXVD2X(0,R0,R4) + li r9,0 +3: beq cr7,4f + LXVD2X(0,R1,R8) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +#endif /* CONFIG_VSX */ + +#endif /* CONFIG_PPC_FPU */ diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 58e14fba11b..0c9c8d7d073 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -14,16 +14,14 @@ #include <linux/kernel.h> #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/stringify.h> #include <linux/smp.h> /* waiting for a spinlock... 
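on shared-processor LPARs we ask the hypervisor to give our cycles to the lock holder's virtual CPU via the H_CONFER hcall.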
*/ -#if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES) +#if defined(CONFIG_PPC_SPLPAR) #include <asm/hvcall.h> -#include <asm/iseries/hv_call.h> #include <asm/smp.h> -#include <asm/firmware.h> void __spin_yield(arch_spinlock_t *lock) { @@ -34,20 +32,14 @@ void __spin_yield(arch_spinlock_t *lock) return; holder_cpu = lock_value & 0xffff; BUG_ON(holder_cpu >= NR_CPUS); - yield_count = lppaca[holder_cpu].yield_count; + yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count); if ((yield_count & 1) == 0) return; /* virtual cpu is currently running */ rmb(); if (lock->slock != lock_value) return; /* something has changed */ - if (firmware_has_feature(FW_FEATURE_ISERIES)) - HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc, - ((u64)holder_cpu << 32) | yield_count); -#ifdef CONFIG_PPC_SPLPAR - else - plpar_hcall_norets(H_CONFER, - get_hard_smp_processor_id(holder_cpu), yield_count); -#endif + plpar_hcall_norets(H_CONFER, + get_hard_smp_processor_id(holder_cpu), yield_count); } /* @@ -65,20 +57,14 @@ void __rw_yield(arch_rwlock_t *rw) return; /* no write lock at present */ holder_cpu = lock_value & 0xffff; BUG_ON(holder_cpu >= NR_CPUS); - yield_count = lppaca[holder_cpu].yield_count; + yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count); if ((yield_count & 1) == 0) return; /* virtual cpu is currently running */ rmb(); if (rw->lock != lock_value) return; /* something has changed */ - if (firmware_has_feature(FW_FEATURE_ISERIES)) - HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc, - ((u64)holder_cpu << 32) | yield_count); -#ifdef CONFIG_PPC_SPLPAR - else - plpar_hcall_norets(H_CONFER, - get_hard_smp_processor_id(holder_cpu), yield_count); -#endif + plpar_hcall_norets(H_CONFER, + get_hard_smp_processor_id(holder_cpu), yield_count); } #endif diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S index 11ce045e21f..43435c6892f 100644 --- a/arch/powerpc/lib/mem_64.S +++ b/arch/powerpc/lib/mem_64.S @@ -19,7 +19,7 @@ _GLOBAL(memset) rlwimi r4,r4,16,0,15 cmplw cr1,r5,r0 /* do we get that far? */ rldimi r4,r4,32,0 - PPC_MTOCRF 1,r0 + PPC_MTOCRF(1,r0) mr r6,r3 blt cr1,8f beq+ 3f /* if already 8-byte aligned */ @@ -49,7 +49,7 @@ _GLOBAL(memset) bdnz 4b 5: srwi. r0,r5,3 clrlwi r5,r5,29 - PPC_MTOCRF 1,r0 + PPC_MTOCRF(1,r0) beq 8f bf 29,6f std r4,0(r6) @@ -65,7 +65,7 @@ _GLOBAL(memset) std r4,0(r6) addi r6,r6,8 8: cmpwi r5,0 - PPC_MTOCRF 1,r5 + PPC_MTOCRF(1,r5) beqlr+ bf 29,9f stw r4,0(r6) @@ -77,10 +77,10 @@ _GLOBAL(memset) stb r4,0(r6) blr -_GLOBAL(memmove) +_GLOBAL_TOC(memmove) cmplw 0,r3,r4 - bgt .backwards_memcpy - b .memcpy + bgt backwards_memcpy + b memcpy _GLOBAL(backwards_memcpy) rlwinm. 
r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S index e178922b2c2..32a06ec395d 100644 --- a/arch/powerpc/lib/memcpy_64.S +++ b/arch/powerpc/lib/memcpy_64.S @@ -10,9 +10,30 @@ #include <asm/ppc_asm.h> .align 7 -_GLOBAL(memcpy) - std r3,48(r1) /* save destination pointer for return value */ - PPC_MTOCRF 0x01,r5 +_GLOBAL_TOC(memcpy) +BEGIN_FTR_SECTION +#ifdef __LITTLE_ENDIAN__ + cmpdi cr7,r5,0 +#else + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* save destination pointer for return value */ +#endif +FTR_SECTION_ELSE +#ifndef SELFTEST + b memcpy_power7 +#endif +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +#ifdef __LITTLE_ENDIAN__ + /* dumb little-endian memcpy that will get replaced at runtime */ + addi r9,r3,-1 + addi r4,r4,-1 + beqlr cr7 + mtctr r5 +1: lbzu r10,1(r4) + stbu r10,1(r9) + bdnz 1b + blr +#else + PPC_MTOCRF(0x01,r5) cmpldi cr1,r5,16 neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry andi. r6,r6,7 @@ -67,7 +88,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 2: bf cr7*4+3,3f lbz r9,8(r4) stb r9,0(r3) -3: ld r3,48(r1) /* return dest pointer */ +3: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ blr .Lsrc_unaligned: @@ -150,11 +171,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 2: bf cr7*4+3,3f rotldi r9,r9,8 stb r9,0(r3) -3: ld r3,48(r1) /* return dest pointer */ +3: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ blr .Ldst_unaligned: - PPC_MTOCRF 0x01,r6 # put #bytes to 8B bdry into cr7 + PPC_MTOCRF(0x01,r6) # put #bytes to 8B bdry into cr7 subf r5,r6,r5 li r7,0 cmpldi cr1,r5,16 @@ -169,7 +190,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 2: bf cr7*4+1,3f lwzx r0,r7,r4 stwx r0,r7,r3 -3: PPC_MTOCRF 0x01,r5 +3: PPC_MTOCRF(0x01,r5) add r4,r6,r4 add r3,r6,r3 b .Ldst_aligned @@ -195,5 +216,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 3: bf cr7*4+3,4f lbz r0,0(r4) stb r0,0(r3) -4: ld r3,48(r1) /* return dest pointer */ +4: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ blr +#endif diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S new file mode 100644 index 00000000000..2ff5c142f87 --- /dev/null +++ b/arch/powerpc/lib/memcpy_power7.S @@ -0,0 +1,656 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +_GLOBAL(memcpy_power7) + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB) lvsl VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB) lvsr VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC +#endif + +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + ld r14,64(r4) + ld r15,72(r4) + ld r16,80(r4) + ld r17,88(r4) + ld r18,96(r4) + ld r19,104(r4) + ld r20,112(r4) + ld r21,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + std r14,64(r3) + std r15,72(r3) + std r16,80(r3) + std r17,88(r3) + std r18,96(r3) + std r19,104(r3) + std r20,112(r3) + std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + addi r4,r4,64 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + addi r4,r4,32 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f + ld r0,0(r4) + ld r6,8(r4) + addi r4,r4,16 + std r0,0(r3) + std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + 
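/* The CONFIG_ALTIVEC path below enters VMX via enter_vmx_copy (falling back to the scalar loop above if that returns 0), prefetches source and destination with stream hints, and then copies 128 bytes per iteration with lvx/stvx, switching to a vperm-based loop when the two buffers are not relatively 16-byte aligned. */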
+#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_copy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r6,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f + lvx vr1,r0,r4 + addi r4,r4,16 + stvx vr1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f + lvx vr1,r0,r4 + lvx vr0,r4,r9 + addi r4,r4,32 + stvx vr1,r0,r3 + stvx vr0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx vr3,r0,r4 + lvx vr2,r4,r9 + lvx vr1,r4,r10 + lvx vr0,r4,r11 + addi r4,r4,64 + stvx vr3,r0,r3 + stvx vr2,r3,r9 + stvx vr1,r3,r10 + stvx vr0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. 
+ */ + .align 5 +8: + lvx vr7,r0,r4 + lvx vr6,r4,r9 + lvx vr5,r4,r10 + lvx vr4,r4,r11 + lvx vr3,r4,r12 + lvx vr2,r4,r14 + lvx vr1,r4,r15 + lvx vr0,r4,r16 + addi r4,r4,128 + stvx vr7,r0,r3 + stvx vr6,r3,r9 + stvx vr5,r3,r10 + stvx vr4,r3,r11 + stvx vr3,r3,r12 + stvx vr2,r3,r14 + stvx vr1,r3,r15 + stvx vr0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx vr3,r0,r4 + lvx vr2,r4,r9 + lvx vr1,r4,r10 + lvx vr0,r4,r11 + addi r4,r4,64 + stvx vr3,r0,r3 + stvx vr2,r3,r9 + stvx vr1,r3,r10 + stvx vr0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx vr1,r0,r4 + lvx vr0,r4,r9 + addi r4,r4,32 + stvx vr1,r0,r3 + stvx vr0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx vr1,r0,r4 + addi r4,r4,16 + stvx vr1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + b exit_vmx_copy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r7,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + LVS(vr16,0,r4) /* Setup permute control vector */ + lvx vr0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f + lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + addi r4,r4,16 + stvx vr8,r0,r3 + addi r3,r3,16 + vor vr0,vr1,vr1 + +5: bf cr7*4+2,6f + lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + lvx vr0,r4,r9 + VPERM(vr9,vr1,vr0,vr16) + addi r4,r4,32 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx vr3,r0,r4 + VPERM(vr8,vr0,vr3,vr16) + lvx vr2,r4,r9 + VPERM(vr9,vr3,vr2,vr16) + lvx vr1,r4,r10 + VPERM(vr10,vr2,vr1,vr16) + lvx vr0,r4,r11 + VPERM(vr11,vr1,vr0,vr16) + addi r4,r4,64 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. 
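In this unaligned path each new 16-byte load is merged with the previous one via vperm before it is stored.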
+ */ + .align 5 +8: + lvx vr7,r0,r4 + VPERM(vr8,vr0,vr7,vr16) + lvx vr6,r4,r9 + VPERM(vr9,vr7,vr6,vr16) + lvx vr5,r4,r10 + VPERM(vr10,vr6,vr5,vr16) + lvx vr4,r4,r11 + VPERM(vr11,vr5,vr4,vr16) + lvx vr3,r4,r12 + VPERM(vr12,vr4,vr3,vr16) + lvx vr2,r4,r14 + VPERM(vr13,vr3,vr2,vr16) + lvx vr1,r4,r15 + VPERM(vr14,vr2,vr1,vr16) + lvx vr0,r4,r16 + VPERM(vr15,vr1,vr0,vr16) + addi r4,r4,128 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + stvx vr12,r3,r12 + stvx vr13,r3,r14 + stvx vr14,r3,r15 + stvx vr15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx vr3,r0,r4 + VPERM(vr8,vr0,vr3,vr16) + lvx vr2,r4,r9 + VPERM(vr9,vr3,vr2,vr16) + lvx vr1,r4,r10 + VPERM(vr10,vr2,vr1,vr16) + lvx vr0,r4,r11 + VPERM(vr11,vr1,vr0,vr16) + addi r4,r4,64 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + lvx vr0,r4,r9 + VPERM(vr9,vr1,vr0,vr16) + addi r4,r4,32 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + addi r4,r4,16 + stvx vr8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + b exit_vmx_copy /* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff --git a/arch/powerpc/lib/rheap.c b/arch/powerpc/lib/rheap.c index 45907c1dae6..a1060a868e6 100644 --- a/arch/powerpc/lib/rheap.c +++ b/arch/powerpc/lib/rheap.c @@ -15,7 +15,7 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <linux/err.h> #include <linux/slab.h> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 13b7d54f185..5c09f365c84 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -11,8 +11,11 @@ #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/ptrace.h> +#include <linux/prefetch.h> #include <asm/sstep.h> #include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/cputable.h> extern char system_call_common[]; @@ -23,6 +26,37 @@ extern char system_call_common[]; #define MSR_MASK 0x87c0ffff #endif +/* Bits in XER */ +#define XER_SO 0x80000000U +#define XER_OV 0x40000000U +#define XER_CA 0x20000000U + +#ifdef CONFIG_PPC_FPU +/* + * Functions in ldstfp.S + */ +extern int do_lfs(int rn, unsigned long ea); +extern int do_lfd(int rn, unsigned long ea); +extern int do_stfs(int rn, unsigned long ea); +extern int do_stfd(int rn, unsigned long ea); +extern int do_lvx(int rn, unsigned long ea); +extern int do_stvx(int rn, unsigned long ea); +extern int do_lxvd2x(int rn, unsigned long ea); +extern int do_stxvd2x(int rn, unsigned long ea); +#endif + +/* + * Emulate the truncation of 64 bit values in 32-bit mode. 
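When MSR_64BIT is clear the CPU truncates effective addresses and the NIP to 32 bits, so emulated results are masked the same way.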
+ */ +static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val) +{ +#ifdef __powerpc64__ + if ((msr & MSR_64BIT) == 0) + val &= 0xffffffffUL; +#endif + return val; +} + /* * Determine whether a conditional branch instruction would branch. */ @@ -46,16 +80,569 @@ static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs) return 1; } + +static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb) +{ + if (!user_mode(regs)) + return 1; + return __access_ok(ea, nb, USER_DS); +} + +/* + * Calculate effective address for a D-form instruction + */ +static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) instr; /* sign-extend */ + if (ra) { + ea += regs->gpr[ra]; + if (instr & 0x04000000) { /* update forms */ + if ((instr>>26) != 47) /* stmw is not an update form */ + regs->gpr[ra] = ea; + } + } + + return truncate_if_32bit(regs->msr, ea); +} + +#ifdef __powerpc64__ +/* + * Calculate effective address for a DS-form instruction + */ +static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) (instr & ~3); /* sign-extend */ + if (ra) { + ea += regs->gpr[ra]; + if ((instr & 3) == 1) /* update forms */ + regs->gpr[ra] = ea; + } + + return truncate_if_32bit(regs->msr, ea); +} +#endif /* __powerpc64 */ + +/* + * Calculate effective address for an X-form instruction + */ +static unsigned long __kprobes xform_ea(unsigned int instr, struct pt_regs *regs, + int do_update) +{ + int ra, rb; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + ea = regs->gpr[rb]; + if (ra) { + ea += regs->gpr[ra]; + if (do_update) /* update forms */ + regs->gpr[ra] = ea; + } + + return truncate_if_32bit(regs->msr, ea); +} + +/* + * Return the largest power of 2, not greater than sizeof(unsigned long), + * such that x is a multiple of it. 
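For example, on a 64-bit kernel max_align(0x1006) is 2 and max_align(0x1008) is 8.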
+ */ +static inline unsigned long max_align(unsigned long x) +{ + x |= sizeof(unsigned long); + return x & -x; /* isolates rightmost bit */ +} + + +static inline unsigned long byterev_2(unsigned long x) +{ + return ((x >> 8) & 0xff) | ((x & 0xff) << 8); +} + +static inline unsigned long byterev_4(unsigned long x) +{ + return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) | + ((x & 0xff00) << 8) | ((x & 0xff) << 24); +} + +#ifdef __powerpc64__ +static inline unsigned long byterev_8(unsigned long x) +{ + return (byterev_4(x) << 32) | byterev_4(x >> 32); +} +#endif + +static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea, + int nb) +{ + int err = 0; + unsigned long x = 0; + + switch (nb) { + case 1: + err = __get_user(x, (unsigned char __user *) ea); + break; + case 2: + err = __get_user(x, (unsigned short __user *) ea); + break; + case 4: + err = __get_user(x, (unsigned int __user *) ea); + break; +#ifdef __powerpc64__ + case 8: + err = __get_user(x, (unsigned long __user *) ea); + break; +#endif + } + if (!err) + *dest = x; + return err; +} + +static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea, + int nb, struct pt_regs *regs) +{ + int err; + unsigned long x, b, c; +#ifdef __LITTLE_ENDIAN__ + int len = nb; /* save a copy of the length for byte reversal */ +#endif + + /* unaligned, do this in pieces */ + x = 0; + for (; nb > 0; nb -= c) { +#ifdef __LITTLE_ENDIAN__ + c = 1; +#endif +#ifdef __BIG_ENDIAN__ + c = max_align(ea); +#endif + if (c > nb) + c = max_align(nb); + err = read_mem_aligned(&b, ea, c); + if (err) + return err; + x = (x << (8 * c)) + b; + ea += c; + } +#ifdef __LITTLE_ENDIAN__ + switch (len) { + case 2: + *dest = byterev_2(x); + break; + case 4: + *dest = byterev_4(x); + break; +#ifdef __powerpc64__ + case 8: + *dest = byterev_8(x); + break; +#endif + } +#endif +#ifdef __BIG_ENDIAN__ + *dest = x; +#endif + return 0; +} + +/* + * Read memory at address ea for nb bytes, return 0 for success + * or -EFAULT if an error occurred. 
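Aligned accesses go through a single __get_user(); unaligned accesses are pieced together from smaller naturally-aligned ones (byte by byte on little-endian, with a final byte reverse).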
+ */ +static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb, + struct pt_regs *regs) +{ + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & (nb - 1)) == 0) + return read_mem_aligned(dest, ea, nb); + return read_mem_unaligned(dest, ea, nb, regs); +} + +static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea, + int nb) +{ + int err = 0; + + switch (nb) { + case 1: + err = __put_user(val, (unsigned char __user *) ea); + break; + case 2: + err = __put_user(val, (unsigned short __user *) ea); + break; + case 4: + err = __put_user(val, (unsigned int __user *) ea); + break; +#ifdef __powerpc64__ + case 8: + err = __put_user(val, (unsigned long __user *) ea); + break; +#endif + } + return err; +} + +static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea, + int nb, struct pt_regs *regs) +{ + int err; + unsigned long c; + +#ifdef __LITTLE_ENDIAN__ + switch (nb) { + case 2: + val = byterev_2(val); + break; + case 4: + val = byterev_4(val); + break; +#ifdef __powerpc64__ + case 8: + val = byterev_8(val); + break; +#endif + } +#endif + /* unaligned or little-endian, do this in pieces */ + for (; nb > 0; nb -= c) { +#ifdef __LITTLE_ENDIAN__ + c = 1; +#endif +#ifdef __BIG_ENDIAN__ + c = max_align(ea); +#endif + if (c > nb) + c = max_align(nb); + err = write_mem_aligned(val >> (nb - c) * 8, ea, c); + if (err) + return err; + ea += c; + } + return 0; +} + +/* + * Write memory at address ea for nb bytes, return 0 for success + * or -EFAULT if an error occurred. + */ +static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb, + struct pt_regs *regs) +{ + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & (nb - 1)) == 0) + return write_mem_aligned(val, ea, nb); + return write_mem_unaligned(val, ea, nb, regs); +} + +#ifdef CONFIG_PPC_FPU /* - * Emulate instructions that cause a transfer of control. + * Check the address and alignment, and call func to do the actual + * load or store. 
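Misaligned operands are staged through an aligned buffer on the stack using read_mem_unaligned()/write_mem_unaligned().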
+ */ +static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, int nb, + struct pt_regs *regs) +{ + int err; + union { + double dbl; + unsigned long ul[2]; + struct { +#ifdef __BIG_ENDIAN__ + unsigned _pad_; + unsigned word; +#endif +#ifdef __LITTLE_ENDIAN__ + unsigned word; + unsigned _pad_; +#endif + } single; + } data; + unsigned long ptr; + + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + ptr = (unsigned long) &data.ul; + if (sizeof(unsigned long) == 8 || nb == 4) { + err = read_mem_unaligned(&data.ul[0], ea, nb, regs); + if (nb == 4) + ptr = (unsigned long)&(data.single.word); + } else { + /* reading a double on 32-bit */ + err = read_mem_unaligned(&data.ul[0], ea, 4, regs); + if (!err) + err = read_mem_unaligned(&data.ul[1], ea + 4, 4, regs); + } + if (err) + return err; + return (*func)(rn, ptr); +} + +static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, int nb, + struct pt_regs *regs) +{ + int err; + union { + double dbl; + unsigned long ul[2]; + struct { +#ifdef __BIG_ENDIAN__ + unsigned _pad_; + unsigned word; +#endif +#ifdef __LITTLE_ENDIAN__ + unsigned word; + unsigned _pad_; +#endif + } single; + } data; + unsigned long ptr; + + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + ptr = (unsigned long) &data.ul[0]; + if (sizeof(unsigned long) == 8 || nb == 4) { + if (nb == 4) + ptr = (unsigned long)&(data.single.word); + err = (*func)(rn, ptr); + if (err) + return err; + err = write_mem_unaligned(data.ul[0], ea, nb, regs); + } else { + /* writing a double on 32-bit */ + err = (*func)(rn, ptr); + if (err) + return err; + err = write_mem_unaligned(data.ul[0], ea, 4, regs); + if (!err) + err = write_mem_unaligned(data.ul[1], ea + 4, 4, regs); + } + return err; +} +#endif + +#ifdef CONFIG_ALTIVEC +/* For Altivec/VMX, no need to worry about alignment */ +static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + if (!address_ok(regs, ea & ~0xfUL, 16)) + return -EFAULT; + return (*func)(rn, ea); +} + +static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + if (!address_ok(regs, ea & ~0xfUL, 16)) + return -EFAULT; + return (*func)(rn, ea); +} +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + int err; + unsigned long val[2]; + + if (!address_ok(regs, ea, 16)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + err = read_mem_unaligned(&val[0], ea, 8, regs); + if (!err) + err = read_mem_unaligned(&val[1], ea + 8, 8, regs); + if (!err) + err = (*func)(rn, (unsigned long) &val[0]); + return err; +} + +static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + int err; + unsigned long val[2]; + + if (!address_ok(regs, ea, 16)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + err = (*func)(rn, (unsigned long) &val[0]); + if (err) + return err; + err = write_mem_unaligned(val[0], ea, 8, regs); + if (!err) + err = write_mem_unaligned(val[1], ea + 8, 8, regs); + return err; +} +#endif /* CONFIG_VSX */ + +#define __put_user_asmx(x, addr, err, op, cr) \ + __asm__ __volatile__( \ + "1: " op " %2,0,%3\n" \ + " mfcr %1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%4\n" \ + " 
b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err), "=r" (cr) \ + : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __get_user_asmx(x, addr, err, op) \ + __asm__ __volatile__( \ + "1: "op" %1,0,%2\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err), "=r" (x) \ + : "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __cacheop_user_asmx(addr, err, op) \ + __asm__ __volatile__( \ + "1: "op" 0,%1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err) \ + : "r" (addr), "i" (-EFAULT), "0" (err)) + +static void __kprobes set_cr0(struct pt_regs *regs, int rd) +{ + long val = regs->gpr[rd]; + + regs->ccr = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000); +#ifdef __powerpc64__ + if (!(regs->msr & MSR_64BIT)) + val = (int) val; +#endif + if (val < 0) + regs->ccr |= 0x80000000; + else if (val > 0) + regs->ccr |= 0x40000000; + else + regs->ccr |= 0x20000000; +} + +static void __kprobes add_with_carry(struct pt_regs *regs, int rd, + unsigned long val1, unsigned long val2, + unsigned long carry_in) +{ + unsigned long val = val1 + val2; + + if (carry_in) + ++val; + regs->gpr[rd] = val; +#ifdef __powerpc64__ + if (!(regs->msr & MSR_64BIT)) { + val = (unsigned int) val; + val1 = (unsigned int) val1; + } +#endif + if (val < val1 || (carry_in && val == val1)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; +} + +static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2, + int crfld) +{ + unsigned int crval, shift; + + crval = (regs->xer >> 31) & 1; /* get SO bit */ + if (v1 < v2) + crval |= 8; + else if (v1 > v2) + crval |= 4; + else + crval |= 2; + shift = (7 - crfld) * 4; + regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1, + unsigned long v2, int crfld) +{ + unsigned int crval, shift; + + crval = (regs->xer >> 31) & 1; /* get SO bit */ + if (v1 < v2) + crval |= 8; + else if (v1 > v2) + crval |= 4; + else + crval |= 2; + shift = (7 - crfld) * 4; + regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +/* + * Elements of 32-bit rotate and mask instructions. + */ +#define MASK32(mb, me) ((0xffffffffUL >> (mb)) + \ + ((signed long)-0x80000000L >> (me)) + ((me) >= (mb))) +#ifdef __powerpc64__ +#define MASK64_L(mb) (~0UL >> (mb)) +#define MASK64_R(me) ((signed long)-0x8000000000000000L >> (me)) +#define MASK64(mb, me) (MASK64_L(mb) + MASK64_R(me) + ((me) >= (mb))) +#define DATA32(x) (((x) & 0xffffffffUL) | (((x) & 0xffffffffUL) << 32)) +#else +#define DATA32(x) (x) +#endif +#define ROTATE(x, n) ((n) ? (((x) << (n)) | ((x) >> (8 * sizeof(long) - (n)))) : (x)) + +/* + * Emulate instructions that cause a transfer of control, + * loads and stores, and a few other instructions. * Returns 1 if the step was emulated, 0 if not, * or -1 if the instruction is one that should not be stepped, * such as an rfid, or a mtmsrd that would clear MSR_RI. 
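If 0 is returned, the caller (kprobes, for instance) is expected to single-step the instruction for real.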
*/ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) { - unsigned int opcode, rs, rb, rd, spr; + unsigned int opcode, ra, rb, rd, spr, u; unsigned long int imm; + unsigned long int val, val2; + unsigned long int ea; + unsigned int cr, mb, me, sh; + int err; + unsigned long old_ra, val3; + long ival; opcode = instr >> 26; switch (opcode) { @@ -64,12 +651,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) if ((instr & 2) == 0) imm += regs->nip; regs->nip += 4; - if ((regs->msr & MSR_SF) == 0) - regs->nip &= 0xffffffffUL; + regs->nip = truncate_if_32bit(regs->msr, regs->nip); if (instr & 1) regs->link = regs->nip; if (branch_taken(instr, regs)) - regs->nip = imm; + regs->nip = truncate_if_32bit(regs->msr, imm); return 1; #ifdef CONFIG_PPC64 case 17: /* sc */ @@ -78,7 +664,13 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) * entry code works. If that is changed, this will * need to be changed also. */ + if (regs->gpr[0] == 0x1ebe && + cpu_has_feature(CPU_FTR_REAL_LE)) { + regs->msr ^= MSR_LE; + goto instr_done; + } regs->gpr[9] = regs->gpr[13]; + regs->gpr[10] = MSR_KERNEL; regs->gpr[11] = regs->nip + 4; regs->gpr[12] = regs->msr & MSR_MASK; regs->gpr[13] = (unsigned long) get_paca(); @@ -92,54 +684,250 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) imm -= 0x04000000; if ((instr & 2) == 0) imm += regs->nip; - if (instr & 1) { - regs->link = regs->nip + 4; - if ((regs->msr & MSR_SF) == 0) - regs->link &= 0xffffffffUL; - } - if ((regs->msr & MSR_SF) == 0) - imm &= 0xffffffffUL; + if (instr & 1) + regs->link = truncate_if_32bit(regs->msr, regs->nip + 4); + imm = truncate_if_32bit(regs->msr, imm); regs->nip = imm; return 1; case 19: - switch (instr & 0x7fe) { - case 0x20: /* bclr */ - case 0x420: /* bcctr */ + switch ((instr >> 1) & 0x3ff) { + case 16: /* bclr */ + case 528: /* bcctr */ imm = (instr & 0x400)? 
regs->ctr: regs->link; - regs->nip += 4; - if ((regs->msr & MSR_SF) == 0) { - regs->nip &= 0xffffffffUL; - imm &= 0xffffffffUL; - } + regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); + imm = truncate_if_32bit(regs->msr, imm); if (instr & 1) regs->link = regs->nip; if (branch_taken(instr, regs)) regs->nip = imm; return 1; - case 0x24: /* rfid, scary */ + + case 18: /* rfid, scary */ return -1; + + case 150: /* isync */ + isync(); + goto instr_done; + + case 33: /* crnor */ + case 129: /* crandc */ + case 193: /* crxor */ + case 225: /* crnand */ + case 257: /* crand */ + case 289: /* creqv */ + case 417: /* crorc */ + case 449: /* cror */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + rd = (instr >> 21) & 0x1f; + ra = (regs->ccr >> (31 - ra)) & 1; + rb = (regs->ccr >> (31 - rb)) & 1; + val = (instr >> (6 + ra * 2 + rb)) & 1; + regs->ccr = (regs->ccr & ~(1UL << (31 - rd))) | + (val << (31 - rd)); + goto instr_done; } + break; case 31: - rd = (instr >> 21) & 0x1f; - switch (instr & 0x7fe) { - case 0xa6: /* mfmsr */ + switch ((instr >> 1) & 0x3ff) { + case 598: /* sync */ +#ifdef __powerpc64__ + switch ((instr >> 21) & 3) { + case 1: /* lwsync */ + asm volatile("lwsync" : : : "memory"); + goto instr_done; + case 2: /* ptesync */ + asm volatile("ptesync" : : : "memory"); + goto instr_done; + } +#endif + mb(); + goto instr_done; + + case 854: /* eieio */ + eieio(); + goto instr_done; + } + break; + } + + /* Following cases refer to regs->gpr[], so we need all regs */ + if (!FULL_REGS(regs)) + return 0; + + rd = (instr >> 21) & 0x1f; + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + + switch (opcode) { + case 7: /* mulli */ + regs->gpr[rd] = regs->gpr[ra] * (short) instr; + goto instr_done; + + case 8: /* subfic */ + imm = (short) instr; + add_with_carry(regs, rd, ~regs->gpr[ra], imm, 1); + goto instr_done; + + case 10: /* cmpli */ + imm = (unsigned short) instr; + val = regs->gpr[ra]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) + val = (unsigned int) val; +#endif + do_cmp_unsigned(regs, val, imm, rd >> 2); + goto instr_done; + + case 11: /* cmpi */ + imm = (short) instr; + val = regs->gpr[ra]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) + val = (int) val; +#endif + do_cmp_signed(regs, val, imm, rd >> 2); + goto instr_done; + + case 12: /* addic */ + imm = (short) instr; + add_with_carry(regs, rd, regs->gpr[ra], imm, 0); + goto instr_done; + + case 13: /* addic. 
*/ + imm = (short) instr; + add_with_carry(regs, rd, regs->gpr[ra], imm, 0); + set_cr0(regs, rd); + goto instr_done; + + case 14: /* addi */ + imm = (short) instr; + if (ra) + imm += regs->gpr[ra]; + regs->gpr[rd] = imm; + goto instr_done; + + case 15: /* addis */ + imm = ((short) instr) << 16; + if (ra) + imm += regs->gpr[ra]; + regs->gpr[rd] = imm; + goto instr_done; + + case 20: /* rlwimi */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + val = DATA32(regs->gpr[rd]); + imm = MASK32(mb, me); + regs->gpr[ra] = (regs->gpr[ra] & ~imm) | (ROTATE(val, rb) & imm); + goto logical_done; + + case 21: /* rlwinm */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + val = DATA32(regs->gpr[rd]); + regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me); + goto logical_done; + + case 23: /* rlwnm */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + rb = regs->gpr[rb] & 0x1f; + val = DATA32(regs->gpr[rd]); + regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me); + goto logical_done; + + case 24: /* ori */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] | imm; + goto instr_done; + + case 25: /* oris */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] | (imm << 16); + goto instr_done; + + case 26: /* xori */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] ^ imm; + goto instr_done; + + case 27: /* xoris */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] ^ (imm << 16); + goto instr_done; + + case 28: /* andi. */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] & imm; + set_cr0(regs, ra); + goto instr_done; + + case 29: /* andis. */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] & (imm << 16); + set_cr0(regs, ra); + goto instr_done; + +#ifdef __powerpc64__ + case 30: /* rld* */ + mb = ((instr >> 6) & 0x1f) | (instr & 0x20); + val = regs->gpr[rd]; + if ((instr & 0x10) == 0) { + sh = rb | ((instr & 2) << 4); + val = ROTATE(val, sh); + switch ((instr >> 2) & 3) { + case 0: /* rldicl */ + regs->gpr[ra] = val & MASK64_L(mb); + goto logical_done; + case 1: /* rldicr */ + regs->gpr[ra] = val & MASK64_R(mb); + goto logical_done; + case 2: /* rldic */ + regs->gpr[ra] = val & MASK64(mb, 63 - sh); + goto logical_done; + case 3: /* rldimi */ + imm = MASK64(mb, 63 - sh); + regs->gpr[ra] = (regs->gpr[ra] & ~imm) | + (val & imm); + goto logical_done; + } + } else { + sh = regs->gpr[rb] & 0x3f; + val = ROTATE(val, sh); + switch ((instr >> 1) & 7) { + case 0: /* rldcl */ + regs->gpr[ra] = val & MASK64_L(mb); + goto logical_done; + case 1: /* rldcr */ + regs->gpr[ra] = val & MASK64_R(mb); + goto logical_done; + } + } +#endif + + case 31: + switch ((instr >> 1) & 0x3ff) { + case 83: /* mfmsr */ + if (regs->msr & MSR_PR) + break; regs->gpr[rd] = regs->msr & MSR_MASK; - regs->nip += 4; - if ((regs->msr & MSR_SF) == 0) - regs->nip &= 0xffffffffUL; - return 1; - case 0x124: /* mtmsr */ + goto instr_done; + case 146: /* mtmsr */ + if (regs->msr & MSR_PR) + break; imm = regs->gpr[rd]; if ((imm & MSR_RI) == 0) /* can't step mtmsr that would clear MSR_RI */ return -1; regs->msr = imm; - regs->nip += 4; - return 1; + goto instr_done; #ifdef CONFIG_PPC64 - case 0x164: /* mtmsrd */ + case 178: /* mtmsrd */ /* only MSR_EE and MSR_RI get changed if bit 15 set */ /* mtmsrd doesn't change MSR_HV and MSR_ME */ + if (regs->msr & MSR_PR) + break; imm = (instr & 0x10000)? 
0x8002: 0xefffffffffffefffUL; imm = (regs->msr & MSR_MASK & ~imm) | (regs->gpr[rd] & imm); @@ -147,57 +935,803 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) /* can't step mtmsrd that would clear MSR_RI */ return -1; regs->msr = imm; - regs->nip += 4; - if ((imm & MSR_SF) == 0) - regs->nip &= 0xffffffffUL; - return 1; + goto instr_done; #endif - case 0x26: /* mfcr */ + case 19: /* mfcr */ regs->gpr[rd] = regs->ccr; regs->gpr[rd] &= 0xffffffffUL; - goto mtspr_out; - case 0x2a6: /* mfspr */ + goto instr_done; + + case 144: /* mtcrf */ + imm = 0xf0000000UL; + val = regs->gpr[rd]; + for (sh = 0; sh < 8; ++sh) { + if (instr & (0x80000 >> sh)) + regs->ccr = (regs->ccr & ~imm) | + (val & imm); + imm >>= 4; + } + goto instr_done; + + case 339: /* mfspr */ spr = (instr >> 11) & 0x3ff; switch (spr) { case 0x20: /* mfxer */ regs->gpr[rd] = regs->xer; regs->gpr[rd] &= 0xffffffffUL; - goto mtspr_out; + goto instr_done; case 0x100: /* mflr */ regs->gpr[rd] = regs->link; - goto mtspr_out; + goto instr_done; case 0x120: /* mfctr */ regs->gpr[rd] = regs->ctr; - goto mtspr_out; - } - break; - case 0x378: /* orx */ - if (instr & 1) - break; - rs = (instr >> 21) & 0x1f; - rb = (instr >> 11) & 0x1f; - if (rs == rb) { /* mr */ - rd = (instr >> 16) & 0x1f; - regs->gpr[rd] = regs->gpr[rs]; - goto mtspr_out; + goto instr_done; } break; - case 0x3a6: /* mtspr */ + + case 467: /* mtspr */ spr = (instr >> 11) & 0x3ff; switch (spr) { case 0x20: /* mtxer */ regs->xer = (regs->gpr[rd] & 0xffffffffUL); - goto mtspr_out; + goto instr_done; case 0x100: /* mtlr */ regs->link = regs->gpr[rd]; - goto mtspr_out; + goto instr_done; case 0x120: /* mtctr */ regs->ctr = regs->gpr[rd]; -mtspr_out: - regs->nip += 4; - return 1; + goto instr_done; + } + break; + +/* + * Compare instructions + */ + case 0: /* cmp */ + val = regs->gpr[ra]; + val2 = regs->gpr[rb]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) { + /* word (32-bit) compare */ + val = (int) val; + val2 = (int) val2; } +#endif + do_cmp_signed(regs, val, val2, rd >> 2); + goto instr_done; + + case 32: /* cmpl */ + val = regs->gpr[ra]; + val2 = regs->gpr[rb]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) { + /* word (32-bit) compare */ + val = (unsigned int) val; + val2 = (unsigned int) val2; + } +#endif + do_cmp_unsigned(regs, val, val2, rd >> 2); + goto instr_done; + +/* + * Arithmetic instructions + */ + case 8: /* subfc */ + add_with_carry(regs, rd, ~regs->gpr[ra], + regs->gpr[rb], 1); + goto arith_done; +#ifdef __powerpc64__ + case 9: /* mulhdu */ + asm("mulhdu %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 10: /* addc */ + add_with_carry(regs, rd, regs->gpr[ra], + regs->gpr[rb], 0); + goto arith_done; + + case 11: /* mulhwu */ + asm("mulhwu %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; + + case 40: /* subf */ + regs->gpr[rd] = regs->gpr[rb] - regs->gpr[ra]; + goto arith_done; +#ifdef __powerpc64__ + case 73: /* mulhd */ + asm("mulhd %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 75: /* mulhw */ + asm("mulhw %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; + + case 104: /* neg */ + regs->gpr[rd] = -regs->gpr[ra]; + goto arith_done; + + case 136: /* subfe */ + add_with_carry(regs, rd, ~regs->gpr[ra], regs->gpr[rb], + regs->xer & XER_CA); + goto arith_done; + + case 138: /* adde */ + add_with_carry(regs, rd, 
regs->gpr[ra], regs->gpr[rb], + regs->xer & XER_CA); + goto arith_done; + + case 200: /* subfze */ + add_with_carry(regs, rd, ~regs->gpr[ra], 0L, + regs->xer & XER_CA); + goto arith_done; + + case 202: /* addze */ + add_with_carry(regs, rd, regs->gpr[ra], 0L, + regs->xer & XER_CA); + goto arith_done; + + case 232: /* subfme */ + add_with_carry(regs, rd, ~regs->gpr[ra], -1L, + regs->xer & XER_CA); + goto arith_done; +#ifdef __powerpc64__ + case 233: /* mulld */ + regs->gpr[rd] = regs->gpr[ra] * regs->gpr[rb]; + goto arith_done; +#endif + case 234: /* addme */ + add_with_carry(regs, rd, regs->gpr[ra], -1L, + regs->xer & XER_CA); + goto arith_done; + + case 235: /* mullw */ + regs->gpr[rd] = (unsigned int) regs->gpr[ra] * + (unsigned int) regs->gpr[rb]; + goto arith_done; + + case 266: /* add */ + regs->gpr[rd] = regs->gpr[ra] + regs->gpr[rb]; + goto arith_done; +#ifdef __powerpc64__ + case 457: /* divdu */ + regs->gpr[rd] = regs->gpr[ra] / regs->gpr[rb]; + goto arith_done; +#endif + case 459: /* divwu */ + regs->gpr[rd] = (unsigned int) regs->gpr[ra] / + (unsigned int) regs->gpr[rb]; + goto arith_done; +#ifdef __powerpc64__ + case 489: /* divd */ + regs->gpr[rd] = (long int) regs->gpr[ra] / + (long int) regs->gpr[rb]; + goto arith_done; +#endif + case 491: /* divw */ + regs->gpr[rd] = (int) regs->gpr[ra] / + (int) regs->gpr[rb]; + goto arith_done; + + +/* + * Logical instructions + */ + case 26: /* cntlzw */ + asm("cntlzw %0,%1" : "=r" (regs->gpr[ra]) : + "r" (regs->gpr[rd])); + goto logical_done; +#ifdef __powerpc64__ + case 58: /* cntlzd */ + asm("cntlzd %0,%1" : "=r" (regs->gpr[ra]) : + "r" (regs->gpr[rd])); + goto logical_done; +#endif + case 28: /* and */ + regs->gpr[ra] = regs->gpr[rd] & regs->gpr[rb]; + goto logical_done; + + case 60: /* andc */ + regs->gpr[ra] = regs->gpr[rd] & ~regs->gpr[rb]; + goto logical_done; + + case 124: /* nor */ + regs->gpr[ra] = ~(regs->gpr[rd] | regs->gpr[rb]); + goto logical_done; + + case 284: /* xor */ + regs->gpr[ra] = ~(regs->gpr[rd] ^ regs->gpr[rb]); + goto logical_done; + + case 316: /* xor */ + regs->gpr[ra] = regs->gpr[rd] ^ regs->gpr[rb]; + goto logical_done; + + case 412: /* orc */ + regs->gpr[ra] = regs->gpr[rd] | ~regs->gpr[rb]; + goto logical_done; + + case 444: /* or */ + regs->gpr[ra] = regs->gpr[rd] | regs->gpr[rb]; + goto logical_done; + + case 476: /* nand */ + regs->gpr[ra] = ~(regs->gpr[rd] & regs->gpr[rb]); + goto logical_done; + + case 922: /* extsh */ + regs->gpr[ra] = (signed short) regs->gpr[rd]; + goto logical_done; + + case 954: /* extsb */ + regs->gpr[ra] = (signed char) regs->gpr[rd]; + goto logical_done; +#ifdef __powerpc64__ + case 986: /* extsw */ + regs->gpr[ra] = (signed int) regs->gpr[rd]; + goto logical_done; +#endif + +/* + * Shift instructions + */ + case 24: /* slw */ + sh = regs->gpr[rb] & 0x3f; + if (sh < 32) + regs->gpr[ra] = (regs->gpr[rd] << sh) & 0xffffffffUL; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 536: /* srw */ + sh = regs->gpr[rb] & 0x3f; + if (sh < 32) + regs->gpr[ra] = (regs->gpr[rd] & 0xffffffffUL) >> sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 792: /* sraw */ + sh = regs->gpr[rb] & 0x3f; + ival = (signed int) regs->gpr[rd]; + regs->gpr[ra] = ival >> (sh < 32 ? 
sh : 31); + if (ival < 0 && (sh >= 32 || (ival & ((1ul << sh) - 1)) != 0)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + + case 824: /* srawi */ + sh = rb; + ival = (signed int) regs->gpr[rd]; + regs->gpr[ra] = ival >> sh; + if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + +#ifdef __powerpc64__ + case 27: /* sld */ + sh = regs->gpr[rb] & 0x7f; + if (sh < 64) + regs->gpr[ra] = regs->gpr[rd] << sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 539: /* srd */ + sh = regs->gpr[rb] & 0x7f; + if (sh < 64) + regs->gpr[ra] = regs->gpr[rd] >> sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 794: /* srad */ + sh = regs->gpr[rb] & 0x7f; + ival = (signed long int) regs->gpr[rd]; + regs->gpr[ra] = ival >> (sh < 64 ? sh : 63); + if (ival < 0 && (sh >= 64 || (ival & ((1ul << sh) - 1)) != 0)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + + case 826: /* sradi with sh_5 = 0 */ + case 827: /* sradi with sh_5 = 1 */ + sh = rb | ((instr & 2) << 4); + ival = (signed long int) regs->gpr[rd]; + regs->gpr[ra] = ival >> sh; + if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; +#endif /* __powerpc64__ */ + +/* + * Cache instructions + */ + case 54: /* dcbst */ + ea = xform_ea(instr, regs, 0); + if (!address_ok(regs, ea, 8)) + return 0; + err = 0; + __cacheop_user_asmx(ea, err, "dcbst"); + if (err) + return 0; + goto instr_done; + + case 86: /* dcbf */ + ea = xform_ea(instr, regs, 0); + if (!address_ok(regs, ea, 8)) + return 0; + err = 0; + __cacheop_user_asmx(ea, err, "dcbf"); + if (err) + return 0; + goto instr_done; + + case 246: /* dcbtst */ + if (rd == 0) { + ea = xform_ea(instr, regs, 0); + prefetchw((void *) ea); + } + goto instr_done; + + case 278: /* dcbt */ + if (rd == 0) { + ea = xform_ea(instr, regs, 0); + prefetch((void *) ea); + } + goto instr_done; + } + break; } - return 0; + + /* + * Following cases are for loads and stores, so bail out + * if we're in little-endian mode. + */ + if (regs->msr & MSR_LE) + return 0; + + /* + * Save register RA in case it's an update form load or store + * and the access faults. + */ + old_ra = regs->gpr[ra]; + + switch (opcode) { + case 31: + u = instr & 0x40; + switch ((instr >> 1) & 0x3ff) { + case 20: /* lwarx */ + ea = xform_ea(instr, regs, 0); + if (ea & 3) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, ea, 4)) + goto ldst_done; + err = 0; + __get_user_asmx(val, ea, err, "lwarx"); + if (!err) + regs->gpr[rd] = val; + goto ldst_done; + + case 150: /* stwcx. */ + ea = xform_ea(instr, regs, 0); + if (ea & 3) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, ea, 4)) + goto ldst_done; + err = 0; + __put_user_asmx(regs->gpr[rd], ea, err, "stwcx.", cr); + if (!err) + regs->ccr = (regs->ccr & 0x0fffffff) | + (cr & 0xe0000000) | + ((regs->xer >> 3) & 0x10000000); + goto ldst_done; + +#ifdef __powerpc64__ + case 84: /* ldarx */ + ea = xform_ea(instr, regs, 0); + if (ea & 7) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, ea, 8)) + goto ldst_done; + err = 0; + __get_user_asmx(val, ea, err, "ldarx"); + if (!err) + regs->gpr[rd] = val; + goto ldst_done; + + case 214: /* stdcx. 
*/ + ea = xform_ea(instr, regs, 0); + if (ea & 7) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, ea, 8)) + goto ldst_done; + err = 0; + __put_user_asmx(regs->gpr[rd], ea, err, "stdcx.", cr); + if (!err) + regs->ccr = (regs->ccr & 0x0fffffff) | + (cr & 0xe0000000) | + ((regs->xer >> 3) & 0x10000000); + goto ldst_done; + + case 21: /* ldx */ + case 53: /* ldux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 8, regs); + goto ldst_done; +#endif + + case 23: /* lwzx */ + case 55: /* lwzux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 4, regs); + goto ldst_done; + + case 87: /* lbzx */ + case 119: /* lbzux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 1, regs); + goto ldst_done; + +#ifdef CONFIG_ALTIVEC + case 103: /* lvx */ + case 359: /* lvxl */ + if (!(regs->msr & MSR_VEC)) + break; + ea = xform_ea(instr, regs, 0); + err = do_vec_load(rd, do_lvx, ea, regs); + goto ldst_done; + + case 231: /* stvx */ + case 487: /* stvxl */ + if (!(regs->msr & MSR_VEC)) + break; + ea = xform_ea(instr, regs, 0); + err = do_vec_store(rd, do_stvx, ea, regs); + goto ldst_done; +#endif /* CONFIG_ALTIVEC */ + +#ifdef __powerpc64__ + case 149: /* stdx */ + case 181: /* stdux */ + val = regs->gpr[rd]; + err = write_mem(val, xform_ea(instr, regs, u), 8, regs); + goto ldst_done; +#endif + + case 151: /* stwx */ + case 183: /* stwux */ + val = regs->gpr[rd]; + err = write_mem(val, xform_ea(instr, regs, u), 4, regs); + goto ldst_done; + + case 215: /* stbx */ + case 247: /* stbux */ + val = regs->gpr[rd]; + err = write_mem(val, xform_ea(instr, regs, u), 1, regs); + goto ldst_done; + + case 279: /* lhzx */ + case 311: /* lhzux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 2, regs); + goto ldst_done; + +#ifdef __powerpc64__ + case 341: /* lwax */ + case 373: /* lwaux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 4, regs); + if (!err) + regs->gpr[rd] = (signed int) regs->gpr[rd]; + goto ldst_done; +#endif + + case 343: /* lhax */ + case 375: /* lhaux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 2, regs); + if (!err) + regs->gpr[rd] = (signed short) regs->gpr[rd]; + goto ldst_done; + + case 407: /* sthx */ + case 439: /* sthux */ + val = regs->gpr[rd]; + err = write_mem(val, xform_ea(instr, regs, u), 2, regs); + goto ldst_done; + +#ifdef __powerpc64__ + case 532: /* ldbrx */ + err = read_mem(&val, xform_ea(instr, regs, 0), 8, regs); + if (!err) + regs->gpr[rd] = byterev_8(val); + goto ldst_done; + +#endif + + case 534: /* lwbrx */ + err = read_mem(&val, xform_ea(instr, regs, 0), 4, regs); + if (!err) + regs->gpr[rd] = byterev_4(val); + goto ldst_done; + +#ifdef CONFIG_PPC_FPU + case 535: /* lfsx */ + case 567: /* lfsux */ + if (!(regs->msr & MSR_FP)) + break; + ea = xform_ea(instr, regs, u); + err = do_fp_load(rd, do_lfs, ea, 4, regs); + goto ldst_done; + + case 599: /* lfdx */ + case 631: /* lfdux */ + if (!(regs->msr & MSR_FP)) + break; + ea = xform_ea(instr, regs, u); + err = do_fp_load(rd, do_lfd, ea, 8, regs); + goto ldst_done; + + case 663: /* stfsx */ + case 695: /* stfsux */ + if (!(regs->msr & MSR_FP)) + break; + ea = xform_ea(instr, regs, u); + err = do_fp_store(rd, do_stfs, ea, 4, regs); + goto ldst_done; + + case 727: /* stfdx */ + case 759: /* stfdux */ + if (!(regs->msr & MSR_FP)) + break; + ea = xform_ea(instr, regs, u); + err = do_fp_store(rd, do_stfd, ea, 8, regs); + goto ldst_done; +#endif + +#ifdef __powerpc64__ + case 660: /* stdbrx */ + val = byterev_8(regs->gpr[rd]); + err = 
write_mem(val, xform_ea(instr, regs, 0), 8, regs); + goto ldst_done; + +#endif + case 662: /* stwbrx */ + val = byterev_4(regs->gpr[rd]); + err = write_mem(val, xform_ea(instr, regs, 0), 4, regs); + goto ldst_done; + + case 790: /* lhbrx */ + err = read_mem(&val, xform_ea(instr, regs, 0), 2, regs); + if (!err) + regs->gpr[rd] = byterev_2(val); + goto ldst_done; + + case 918: /* sthbrx */ + val = byterev_2(regs->gpr[rd]); + err = write_mem(val, xform_ea(instr, regs, 0), 2, regs); + goto ldst_done; + +#ifdef CONFIG_VSX + case 844: /* lxvd2x */ + case 876: /* lxvd2ux */ + if (!(regs->msr & MSR_VSX)) + break; + rd |= (instr & 1) << 5; + ea = xform_ea(instr, regs, u); + err = do_vsx_load(rd, do_lxvd2x, ea, regs); + goto ldst_done; + + case 972: /* stxvd2x */ + case 1004: /* stxvd2ux */ + if (!(regs->msr & MSR_VSX)) + break; + rd |= (instr & 1) << 5; + ea = xform_ea(instr, regs, u); + err = do_vsx_store(rd, do_stxvd2x, ea, regs); + goto ldst_done; + +#endif /* CONFIG_VSX */ + } + break; + + case 32: /* lwz */ + case 33: /* lwzu */ + err = read_mem(®s->gpr[rd], dform_ea(instr, regs), 4, regs); + goto ldst_done; + + case 34: /* lbz */ + case 35: /* lbzu */ + err = read_mem(®s->gpr[rd], dform_ea(instr, regs), 1, regs); + goto ldst_done; + + case 36: /* stw */ + val = regs->gpr[rd]; + err = write_mem(val, dform_ea(instr, regs), 4, regs); + goto ldst_done; + + case 37: /* stwu */ + val = regs->gpr[rd]; + val3 = dform_ea(instr, regs); + /* + * For PPC32 we always use stwu to change stack point with r1. So + * this emulated store may corrupt the exception frame, now we + * have to provide the exception frame trampoline, which is pushed + * below the kprobed function stack. So we only update gpr[1] but + * don't emulate the real store operation. We will do real store + * operation safely in exception return code by checking this flag. + */ + if ((ra == 1) && !(regs->msr & MSR_PR) \ + && (val3 >= (regs->gpr[1] - STACK_INT_FRAME_SIZE))) { +#ifdef CONFIG_PPC32 + /* + * Check if we will touch kernel sack overflow + */ + if (val3 - STACK_INT_FRAME_SIZE <= current->thread.ksp_limit) { + printk(KERN_CRIT "Can't kprobe this since Kernel stack overflow.\n"); + err = -EINVAL; + break; + } +#endif /* CONFIG_PPC32 */ + /* + * Check if we already set since that means we'll + * lose the previous value. 
+ */ + WARN_ON(test_thread_flag(TIF_EMULATE_STACK_STORE)); + set_thread_flag(TIF_EMULATE_STACK_STORE); + err = 0; + } else + err = write_mem(val, val3, 4, regs); + goto ldst_done; + + case 38: /* stb */ + case 39: /* stbu */ + val = regs->gpr[rd]; + err = write_mem(val, dform_ea(instr, regs), 1, regs); + goto ldst_done; + + case 40: /* lhz */ + case 41: /* lhzu */ + err = read_mem(®s->gpr[rd], dform_ea(instr, regs), 2, regs); + goto ldst_done; + + case 42: /* lha */ + case 43: /* lhau */ + err = read_mem(®s->gpr[rd], dform_ea(instr, regs), 2, regs); + if (!err) + regs->gpr[rd] = (signed short) regs->gpr[rd]; + goto ldst_done; + + case 44: /* sth */ + case 45: /* sthu */ + val = regs->gpr[rd]; + err = write_mem(val, dform_ea(instr, regs), 2, regs); + goto ldst_done; + + case 46: /* lmw */ + ra = (instr >> 16) & 0x1f; + if (ra >= rd) + break; /* invalid form, ra in range to load */ + ea = dform_ea(instr, regs); + do { + err = read_mem(®s->gpr[rd], ea, 4, regs); + if (err) + return 0; + ea += 4; + } while (++rd < 32); + goto instr_done; + + case 47: /* stmw */ + ea = dform_ea(instr, regs); + do { + err = write_mem(regs->gpr[rd], ea, 4, regs); + if (err) + return 0; + ea += 4; + } while (++rd < 32); + goto instr_done; + +#ifdef CONFIG_PPC_FPU + case 48: /* lfs */ + case 49: /* lfsu */ + if (!(regs->msr & MSR_FP)) + break; + ea = dform_ea(instr, regs); + err = do_fp_load(rd, do_lfs, ea, 4, regs); + goto ldst_done; + + case 50: /* lfd */ + case 51: /* lfdu */ + if (!(regs->msr & MSR_FP)) + break; + ea = dform_ea(instr, regs); + err = do_fp_load(rd, do_lfd, ea, 8, regs); + goto ldst_done; + + case 52: /* stfs */ + case 53: /* stfsu */ + if (!(regs->msr & MSR_FP)) + break; + ea = dform_ea(instr, regs); + err = do_fp_store(rd, do_stfs, ea, 4, regs); + goto ldst_done; + + case 54: /* stfd */ + case 55: /* stfdu */ + if (!(regs->msr & MSR_FP)) + break; + ea = dform_ea(instr, regs); + err = do_fp_store(rd, do_stfd, ea, 8, regs); + goto ldst_done; +#endif + +#ifdef __powerpc64__ + case 58: /* ld[u], lwa */ + switch (instr & 3) { + case 0: /* ld */ + err = read_mem(®s->gpr[rd], dsform_ea(instr, regs), + 8, regs); + goto ldst_done; + case 1: /* ldu */ + err = read_mem(®s->gpr[rd], dsform_ea(instr, regs), + 8, regs); + goto ldst_done; + case 2: /* lwa */ + err = read_mem(®s->gpr[rd], dsform_ea(instr, regs), + 4, regs); + if (!err) + regs->gpr[rd] = (signed int) regs->gpr[rd]; + goto ldst_done; + } + break; + + case 62: /* std[u] */ + val = regs->gpr[rd]; + switch (instr & 3) { + case 0: /* std */ + err = write_mem(val, dsform_ea(instr, regs), 8, regs); + goto ldst_done; + case 1: /* stdu */ + err = write_mem(val, dsform_ea(instr, regs), 8, regs); + goto ldst_done; + } + break; +#endif /* __powerpc64__ */ + + } + err = -EINVAL; + + ldst_done: + if (err) { + regs->gpr[ra] = old_ra; + return 0; /* invoke DSI if -EFAULT? */ + } + instr_done: + regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); + return 1; + + logical_done: + if (instr & 1) + set_cr0(regs, ra); + goto instr_done; + + arith_done: + if (instr & 1) + set_cr0(regs, rd); + goto instr_done; } diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S index 64e2e499e32..1b5a0a09d60 100644 --- a/arch/powerpc/lib/string.S +++ b/arch/powerpc/lib/string.S @@ -28,7 +28,7 @@ _GLOBAL(strcpy) /* This clears out any unused part of the destination buffer, just as the libc version does. 
-- paulus */ _GLOBAL(strncpy) - cmpwi 0,r5,0 + PPC_LCMPI 0,r5,0 beqlr mtctr r5 addi r6,r3,-1 @@ -39,7 +39,7 @@ _GLOBAL(strncpy) bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */ bnelr /* if we didn't hit a null char, we're done */ mfctr r5 - cmpwi 0,r5,0 /* any space left in destination buffer? */ + PPC_LCMPI 0,r5,0 /* any space left in destination buffer? */ beqlr /* we know r0 == 0 here */ 2: stbu r0,1(r6) /* clear it out if so */ bdnz 2b @@ -70,8 +70,8 @@ _GLOBAL(strcmp) blr _GLOBAL(strncmp) - PPC_LCMPI r5,0 - beqlr + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r5,r3,-1 addi r4,r4,-1 @@ -82,6 +82,8 @@ _GLOBAL(strncmp) beqlr 1 bdnzt eq,1b blr +2: li r3,0 + blr _GLOBAL(strlen) addi r4,r3,-1 @@ -92,8 +94,8 @@ _GLOBAL(strlen) blr _GLOBAL(memcmp) - cmpwi 0,r5,0 - ble- 2f + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r6,r3,-1 addi r4,r4,-1 @@ -106,8 +108,8 @@ _GLOBAL(memcmp) blr _GLOBAL(memchr) - cmpwi 0,r5,0 - ble- 2f + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r3,r3,-1 1: lbzu r0,1(r3) @@ -117,6 +119,7 @@ _GLOBAL(memchr) 2: li r3,0 blr +#ifdef CONFIG_PPC32 _GLOBAL(__clear_user) addi r6,r3,-4 li r3,0 @@ -158,48 +161,4 @@ _GLOBAL(__clear_user) PPC_LONG 1b,91b PPC_LONG 8b,92b .text - -_GLOBAL(__strncpy_from_user) - addi r6,r3,-1 - addi r4,r4,-1 - cmpwi 0,r5,0 - beq 2f - mtctr r5 -1: lbzu r0,1(r4) - cmpwi 0,r0,0 - stbu r0,1(r6) - bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */ - beq 3f -2: addi r6,r6,1 -3: subf r3,r3,r6 - blr -99: li r3,-EFAULT - blr - - .section __ex_table,"a" - PPC_LONG 1b,99b - .text - -/* r3 = str, r4 = len (> 0), r5 = top (highest addr) */ -_GLOBAL(__strnlen_user) - addi r7,r3,-1 - subf r6,r7,r5 /* top+1 - str */ - cmplw 0,r4,r6 - bge 0f - mr r6,r4 -0: mtctr r6 /* ctr = min(len, top - str) */ -1: lbzu r0,1(r7) /* get next byte */ - cmpwi 0,r0,0 - bdnzf 2,1b /* loop if --ctr != 0 && byte != 0 */ - addi r7,r7,1 - subf r3,r3,r7 /* number of bytes we have looked at */ - beqlr /* return if we found a 0 byte */ - cmpw 0,r3,r4 /* did we look at all len bytes? */ - blt 99f /* if not, must have hit top */ - addi r3,r4,1 /* return len + 1 to indicate no null found */ - blr -99: li r3,0 /* bad address, return 0 */ - blr - - .section __ex_table,"a" - PPC_LONG 1b,99b +#endif diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S new file mode 100644 index 00000000000..7bd9549a90a --- /dev/null +++ b/arch/powerpc/lib/string_64.S @@ -0,0 +1,202 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + + .section ".toc","aw" +PPC64_CACHES: + .tc ppc64_caches[TC],ppc64_caches + .section ".text" + +/** + * __clear_user: - Zero a block of memory in user space, with less checking. + * @to: Destination address, in user space. 
+ * @n: Number of bytes to zero. + * + * Zero a block of memory in user space. Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. + */ + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + +.Ldo_err1: + mr r3,r8 + +.Ldo_err2: + mtctr r4 +1: +err3; stb r0,0(r3) + addi r3,r3,1 + addi r4,r4,-1 + bdnz 1b + +.Ldo_err3: + mr r3,r4 + blr + +_GLOBAL_TOC(__clear_user) + cmpdi r4,32 + neg r6,r3 + li r0,0 + blt .Lshort_clear + mr r8,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + /* Get the destination 8 byte aligned */ + bf cr7*4+3,1f +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r4,r4,r6 + + cmpdi r4,32 + cmpdi cr1,r4,512 + blt .Lshort_clear + bgt cr1,.Llong_clear + +.Lmedium_clear: + srdi r6,r4,5 + mtctr r6 + + /* Do 32 byte chunks */ +4: +err2; std r0,0(r3) +err2; std r0,8(r3) +err2; std r0,16(r3) +err2; std r0,24(r3) + addi r3,r3,32 + addi r4,r4,-32 + bdnz 4b + +.Lshort_clear: + /* up to 31 bytes to go */ + cmpdi r4,16 + blt 6f +err2; std r0,0(r3) +err2; std r0,8(r3) + addi r3,r3,16 + addi r4,r4,-16 + + /* Up to 15 bytes to go */ +6: mr r8,r3 + clrldi r4,r4,(64-4) + mtocrf 0x01,r4 + bf cr7*4+0,7f +err1; std r0,0(r3) + addi r3,r3,8 + +7: bf cr7*4+1,8f +err1; stw r0,0(r3) + addi r3,r3,4 + +8: bf cr7*4+2,9f +err1; sth r0,0(r3) + addi r3,r3,2 + +9: bf cr7*4+3,10f +err1; stb r0,0(r3) + +10: li r3,0 + blr + +.Llong_clear: + ld r5,PPC64_CACHES@toc(r2) + + bf cr7*4+0,11f +err2; std r0,0(r3) + addi r3,r3,8 + addi r4,r4,-8 + + /* Destination is 16 byte aligned, need to get it cacheline aligned */ +11: lwz r7,DCACHEL1LOGLINESIZE(r5) + lwz r9,DCACHEL1LINESIZE(r5) + + /* + * With worst case alignment the long clear loop takes a minimum + * of 1 byte less than 2 cachelines. + */ + sldi r10,r9,2 + cmpd r4,r10 + blt .Lmedium_clear + + neg r6,r3 + addi r10,r9,-1 + and. r5,r6,r10 + beq 13f + + srdi r6,r5,4 + mtctr r6 + mr r8,r3 +12: +err1; std r0,0(r3) +err1; std r0,8(r3) + addi r3,r3,16 + bdnz 12b + + sub r4,r4,r5 + +13: srd r6,r4,r7 + mtctr r6 + mr r8,r3 +14: +err1; dcbz r0,r3 + add r3,r3,r9 + bdnz 14b + + and r4,r4,r10 + + cmpdi r4,32 + blt .Lshort_clear + b .Lmedium_clear diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c new file mode 100644 index 00000000000..3cf529ceec5 --- /dev/null +++ b/arch/powerpc/lib/vmx-helper.c @@ -0,0 +1,74 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2011 + * + * Authors: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> + * Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <asm/switch_to.h> + +int enter_vmx_usercopy(void) +{ + if (in_interrupt()) + return 0; + + /* This acts as preempt_disable() as well and will make + * enable_kernel_altivec(). We need to disable page faults + * as they can call schedule and thus make us lose the VMX + * context. So on page faults, we just fail which will cause + * a fallback to the normal non-vmx copy. + */ + pagefault_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * This function must return 0 because we tail call optimise when calling + * from __copy_tofrom_user_power7 which returns 0 on success. + */ +int exit_vmx_usercopy(void) +{ + pagefault_enable(); + return 0; +} + +int enter_vmx_copy(void) +{ + if (in_interrupt()) + return 0; + + preempt_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * All calls to this function will be optimised into tail calls. We are + * passed a pointer to the destination which we return as required by a + * memcpy implementation. + */ +void *exit_vmx_copy(void *dest) +{ + preempt_enable(); + return dest; +} diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c new file mode 100644 index 00000000000..e905f7c2ea7 --- /dev/null +++ b/arch/powerpc/lib/xor_vmx.c @@ -0,0 +1,177 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <altivec.h> + +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <asm/switch_to.h> + +typedef vector signed char unative_t; + +#define DEFINE(V) \ + unative_t *V = (unative_t *)V##_in; \ + unative_t V##_0, V##_1, V##_2, V##_3 + +#define LOAD(V) \ + do { \ + V##_0 = V[0]; \ + V##_1 = V[1]; \ + V##_2 = V[2]; \ + V##_3 = V[3]; \ + } while (0) + +#define STORE(V) \ + do { \ + V[0] = V##_0; \ + V[1] = V##_1; \ + V[2] = V##_2; \ + V[3] = V##_3; \ + } while (0) + +#define XOR(V1, V2) \ + do { \ + V1##_0 = vec_xor(V1##_0, V2##_0); \ + V1##_1 = vec_xor(V1##_1, V2##_1); \ + V1##_2 = vec_xor(V1##_2, V2##_2); \ + V1##_3 = vec_xor(V1##_3, V2##_3); \ + } while (0) + +void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in) +{ + DEFINE(v1); + DEFINE(v2); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + XOR(v1, v2); + STORE(v1); + + v1 += 4; + v2 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_2); + +void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + XOR(v1, v2); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_3); + +void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + DEFINE(v4); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + LOAD(v4); + XOR(v1, v2); + XOR(v3, v4); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + v4 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_4); + +void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long *v5_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + DEFINE(v4); + DEFINE(v5); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + LOAD(v4); + LOAD(v5); + XOR(v1, v2); + XOR(v3, v4); + XOR(v1, v5); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + v4 += 4; + v5 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_5); |
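The xor_altivec_N() routines above each consume 64 bytes per source per iteration (four 16-byte vectors), so the byte count is expected to be a multiple of 64; the exported symbols are presumably picked up by the generic xor_blocks() machinery through an arch xor template that is not part of this diff. A minimal standalone sketch of the calling contract, using a locally assumed prototype and page-sized buffers:

/*
 * Usage sketch only (process context): xor_altivec_2() does its own
 * preempt_disable()/enable_kernel_altivec() and XORs v2 into v1 in
 * 64-byte chunks, so 'bytes' is assumed to be a multiple of 64.
 */
#include <linux/gfp.h>
#include <linux/string.h>

void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
		   unsigned long *v2_in);		/* defined above */

static void xor_vmx_sketch(void)
{
	unsigned long *p1 = (unsigned long *)__get_free_page(GFP_KERNEL);
	unsigned long *p2 = (unsigned long *)__get_free_page(GFP_KERNEL);

	if (p1 && p2) {
		memset(p1, 0xaa, PAGE_SIZE);
		memset(p2, 0x55, PAGE_SIZE);
		xor_altivec_2(PAGE_SIZE, p1, p2);	/* p1 ^= p2 */
	}

	free_page((unsigned long)p1);
	free_page((unsigned long)p2);
}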
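The vmx-helper entry/exit pairs define the protocol the optimised copy loops rely on: enter_vmx_usercopy() returns 0 when AltiVec cannot be used (interrupt context), in which case the caller takes the ordinary copy path; otherwise it disables page faults and enables kernel AltiVec, and exit_vmx_usercopy() undoes both and returns 0 so the assembly caller (__copy_tofrom_user_power7, added elsewhere in this series) can tail-call it. A C-level sketch of that convention follows; the *_copy_loop() helpers are hypothetical stand-ins, not kernel functions:

/*
 * Protocol sketch only. scalar_copy_loop() and vmx_copy_loop() are
 * hypothetical stand-ins for the real assembly paths; both are assumed
 * to return the number of bytes left uncopied.
 */
unsigned long scalar_copy_loop(void *to, const void *from, unsigned long n);
unsigned long vmx_copy_loop(void *to, const void *from, unsigned long n);

static unsigned long copy_user_sketch(void *to, const void *from,
				      unsigned long n)
{
	unsigned long left;

	if (!enter_vmx_usercopy())
		/* interrupt context: no VMX, fall back to the scalar copy */
		return scalar_copy_loop(to, from, n);

	/*
	 * Page faults are disabled here, so a faulting user access fails
	 * fast (see the comment in enter_vmx_usercopy() above) instead of
	 * scheduling away the VMX context; the shortfall ends up in 'left'.
	 */
	left = vmx_copy_loop(to, from, n);

	return left + exit_vmx_usercopy();	/* returns 0, keeping tail calls safe */
}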