diff options
Diffstat (limited to 'arch/powerpc/lib')
27 files changed, 7147 insertions, 914 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 4bb023f4c86..59fa2de9546 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -2,24 +2,41 @@ # Makefile for ppc-specific library files.. # -ifeq ($(CONFIG_PPC64),y) -EXTRA_CFLAGS += -mno-minimal-toc -endif +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) + +CFLAGS_REMOVE_code-patching.o = -pg +CFLAGS_REMOVE_feature-fixups.o = -pg -ifeq ($(CONFIG_PPC_MERGE),y) obj-y := string.o alloc.o \ - checksum_$(CONFIG_WORD_SIZE).o + crtsavres.o obj-$(CONFIG_PPC32) += div64.o copy_32.o -endif +obj-$(CONFIG_HAS_IOMEM) += devres.o obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ - memcpy_64.o usercopy_64.o mem_64.o string.o -obj-$(CONFIG_XMON) += sstep.o -obj-$(CONFIG_KPROBES) += sstep.o -obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o + usercopy_64.o mem_64.o string.o \ + hweight_64.o \ + copyuser_power7.o string_64.o copypage_power7.o +ifeq ($(CONFIG_GENERIC_CSUM),) +obj-y += checksum_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC64) += checksum_wrappers_64.o +endif + +obj-$(CONFIG_PPC64) += memcpy_power7.o memcpy_64.o + +obj-$(CONFIG_PPC_EMULATE_SSTEP) += sstep.o ldstfp.o ifeq ($(CONFIG_PPC64),y) obj-$(CONFIG_SMP) += locks.o +obj-$(CONFIG_ALTIVEC) += vmx-helper.o endif obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o + +obj-y += code-patching.o +obj-y += feature-fixups.o +obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o + +obj-$(CONFIG_ALTIVEC) += xor_vmx.o +CFLAGS_xor_vmx.o += -maltivec -mabi=altivec diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c index f53e09c7dac..da22c84a8fe 100644 --- a/arch/powerpc/lib/alloc.c +++ b/arch/powerpc/lib/alloc.c @@ -3,16 +3,8 @@ #include <linux/slab.h> #include <linux/bootmem.h> #include <linux/string.h> +#include <asm/setup.h> -#include <asm/system.h> - -void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask) -{ - if (mem_init_done) - return kmalloc(size, mask); - else - return alloc_bootmem(size); -} void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask) { diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S index ef96c6c58ef..57a07206505 100644 --- a/arch/powerpc/lib/checksum_64.S +++ b/arch/powerpc/lib/checksum_64.S @@ -69,161 +69,412 @@ _GLOBAL(csum_tcpudp_magic) * Computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit). * - * This code assumes at least halfword alignment, though the length - * can be any number of bytes. The sum is accumulated in r5. - * * csum_partial(r3=buff, r4=len, r5=sum) */ _GLOBAL(csum_partial) - subi r3,r3,8 /* we'll offset by 8 for the loads */ - srdi. r6,r4,3 /* divide by 8 for doubleword count */ - addic r5,r5,0 /* clear carry */ - beq 3f /* if we're doing < 8 bytes */ - andi. r0,r3,2 /* aligned on a word boundary already? */ - beq+ 1f - lhz r6,8(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r4,r4,2 - addc r5,r5,r6 - srdi. r6,r4,3 /* recompute number of doublewords */ - beq 3f /* any left? */ -1: mtctr r6 -2: ldu r6,8(r3) /* main sum loop */ - adde r5,r5,r6 - bdnz 2b - andi. r4,r4,7 /* compute bytes left to sum after doublewords */ -3: cmpwi 0,r4,4 /* is at least a full word left? */ - blt 4f - lwz r6,8(r3) /* sum this word */ + addic r0,r5,0 /* clear carry */ + + srdi. r6,r4,3 /* less than 8 bytes? */ + beq .Lcsum_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcsum_aligned + + li r7,4 + sub r6,r7,r6 + mtctr r6 + +1: + lhz r6,0(r3) /* align to doubleword */ + subi r4,r4,2 + addi r3,r3,2 + adde r0,r0,r6 + bdnz 1b + +.Lcsum_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r4,7 + beq .Lcsum_tail_doublewords /* len < 128 */ + + srdi r6,r4,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + ld r6,0(r3) + ld r9,8(r3) + + ld r10,16(r3) + ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + + adde r0,r0,r11 + + adde r0,r0,r12 + + adde r0,r0,r14 + + adde r0,r0,r15 + ld r6,0(r3) + ld r9,8(r3) + + adde r0,r0,r16 + ld r10,16(r3) + ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + adde r0,r0,r11 + adde r0,r0,r12 + adde r0,r0,r14 + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r4,r4,63 + +.Lcsum_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r4,3 + beq .Lcsum_tail_word + + mtctr r6 +3: + ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 + bdnz 3b + + andi. r4,r4,7 + +.Lcsum_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r4,2 + beq .Lcsum_tail_halfword + + lwz r6,0(r3) addi r3,r3,4 + adde r0,r0,r6 subi r4,r4,4 - adde r5,r5,r6 -4: cmpwi 0,r4,2 /* is at least a halfword left? */ - blt+ 5f - lhz r6,8(r3) /* sum this halfword */ - addi r3,r3,2 - subi r4,r4,2 - adde r5,r5,r6 -5: cmpwi 0,r4,1 /* is at least a byte left? */ - bne+ 6f - lbz r6,8(r3) /* sum this byte */ - slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */ - adde r5,r5,r6 -6: addze r5,r5 /* add in final carry */ - rldicl r4,r5,32,0 /* fold two 32-bit halves together */ - add r3,r4,r5 - srdi r3,r3,32 - blr + +.Lcsum_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r4,1 + beq .Lcsum_tail_byte + + lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 + subi r4,r4,2 + +.Lcsum_tail_byte: /* Up to 1 byte to go */ + andi. r6,r4,1 + beq .Lcsum_finish + + lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 + +.Lcsum_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + + + .macro srcnr +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Lsrc_error_nr + .previous + .endm + + .macro source +150: + .section __ex_table,"a" + .align 3 + .llong 150b,.Lsrc_error + .previous + .endm + + .macro dstnr +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldest_error_nr + .previous + .endm + + .macro dest +250: + .section __ex_table,"a" + .align 3 + .llong 250b,.Ldest_error + .previous + .endm /* * Computes the checksum of a memory block at src, length len, * and adds in "sum" (32-bit), while copying the block to dst. * If an access exception occurs on src or dst, it stores -EFAULT - * to *src_err or *dst_err respectively, and (for an error on - * src) zeroes the rest of dst. - * - * This code needs to be reworked to take advantage of 64 bit sum+copy. - * However, due to tokenring halfword alignment problems this will be very - * tricky. For now we'll leave it until we instrument it somehow. + * to *src_err or *dst_err respectively. The caller must take any action + * required in this case (zeroing memory, recalculating partial checksum etc). * * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) */ _GLOBAL(csum_partial_copy_generic) - addic r0,r6,0 - subi r3,r3,4 - subi r4,r4,4 - srwi. r6,r5,2 - beq 3f /* if we're doing < 4 bytes */ - andi. r9,r4,2 /* Align dst to longword boundary */ - beq+ 1f -81: lhz r6,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 + addic r0,r6,0 /* clear carry */ + + srdi. r6,r5,3 /* less than 8 bytes? */ + beq .Lcopy_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + * + * If the source and destination are relatively unaligned we only + * align the source. This keeps things simple. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcopy_aligned + + li r9,4 + sub r6,r9,r6 + mtctr r6 + +1: +srcnr; lhz r6,0(r3) /* align to doubleword */ subi r5,r5,2 -91: sth r6,4(r4) - addi r4,r4,2 - addc r0,r0,r6 - srwi. r6,r5,2 /* # words to do */ - beq 3f -1: mtctr r6 -82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ -92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ - adde r0,r0,r6 - bdnz 82b - andi. r5,r5,3 -3: cmpwi 0,r5,2 - blt+ 4f -83: lhz r6,4(r3) addi r3,r3,2 - subi r5,r5,2 -93: sth r6,4(r4) + adde r0,r0,r6 +dstnr; sth r6,0(r4) addi r4,r4,2 + bdnz 1b + +.Lcopy_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r5,7 + beq .Lcopy_tail_doublewords /* len < 128 */ + + srdi r6,r5,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + +source; ld r6,0(r3) +source; ld r9,8(r3) + +source; ld r10,16(r3) +source; ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: adde r0,r0,r6 -4: cmpwi 0,r5,1 - bne+ 5f -84: lbz r6,4(r3) -94: stb r6,4(r4) - slwi r6,r6,8 /* Upper byte of word */ +source; ld r12,32(r3) +source; ld r14,40(r3) + + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 +source; ld r6,0(r3) +source; ld r9,8(r3) + + adde r0,r0,r16 +source; ld r10,16(r3) +source; ld r11,24(r3) + bdnz 2b + + adde r0,r0,r6 -5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */ - rldicl r4,r3,32,0 /* fold 64 bit value */ - add r3,r4,r3 - srdi r3,r3,32 - blr +source; ld r12,32(r3) +source; ld r14,40(r3) -/* These shouldn't go in the fixup section, since that would - cause the ex_table addresses to get out of order. */ + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r5,r5,63 + +.Lcopy_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r5,3 + beq .Lcopy_tail_word - .globl src_error_1 -src_error_1: - li r6,0 - subi r5,r5,2 -95: sth r6,4(r4) - addi r4,r4,2 - srwi. r6,r5,2 - beq 3f mtctr r6 - .globl src_error_2 -src_error_2: - li r6,0 -96: stwu r6,4(r4) - bdnz 96b -3: andi. r5,r5,3 - beq src_error - .globl src_error_3 -src_error_3: - li r6,0 - mtctr r5 - addi r4,r4,3 -97: stbu r6,1(r4) - bdnz 97b - .globl src_error -src_error: +3: +srcnr; ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 +dstnr; std r6,0(r4) + addi r4,r4,8 + bdnz 3b + + andi. r5,r5,7 + +.Lcopy_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r5,2 + beq .Lcopy_tail_halfword + +srcnr; lwz r6,0(r3) + addi r3,r3,4 + adde r0,r0,r6 +dstnr; stw r6,0(r4) + addi r4,r4,4 + subi r5,r5,4 + +.Lcopy_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r5,1 + beq .Lcopy_tail_byte + +srcnr; lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 +dstnr; sth r6,0(r4) + addi r4,r4,2 + subi r5,r5,2 + +.Lcopy_tail_byte: /* Up to 1 byte to go */ + andi. r6,r5,1 + beq .Lcopy_finish + +srcnr; lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 +dstnr; stb r6,0(r4) + +.Lcopy_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + +.Lsrc_error: + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE +.Lsrc_error_nr: cmpdi 0,r7,0 - beq 1f + beqlr li r6,-EFAULT stw r6,0(r7) -1: addze r3,r0 blr - .globl dst_error -dst_error: +.Ldest_error: + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE +.Ldest_error_nr: cmpdi 0,r8,0 - beq 1f + beqlr li r6,-EFAULT stw r6,0(r8) -1: addze r3,r0 blr - -.section __ex_table,"a" - .align 3 - .llong 81b,src_error_1 - .llong 91b,dst_error - .llong 82b,src_error_2 - .llong 92b,dst_error - .llong 83b,src_error_3 - .llong 93b,dst_error - .llong 84b,src_error_3 - .llong 94b,dst_error - .llong 95b,dst_error - .llong 96b,dst_error - .llong 97b,dst_error diff --git a/arch/powerpc/lib/checksum_wrappers_64.c b/arch/powerpc/lib/checksum_wrappers_64.c new file mode 100644 index 00000000000..08e3a3356c4 --- /dev/null +++ b/arch/powerpc/lib/checksum_wrappers_64.c @@ -0,0 +1,102 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/export.h> +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/checksum.h> +#include <asm/uaccess.h> + +__wsum csum_and_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) { + *err_ptr = -EFAULT; + csum = (__force unsigned int)sum; + goto out; + } + + csum = csum_partial_copy_generic((void __force *)src, dst, + len, sum, err_ptr, NULL); + + if (unlikely(*err_ptr)) { + int missing = __copy_from_user(dst, src, len); + + if (missing) { + memset(dst + len - missing, 0, missing); + *err_ptr = -EFAULT; + } else { + *err_ptr = 0; + } + + csum = csum_partial(dst, len, sum); + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_from_user); + +__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, + __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + goto out; + } + + csum = csum_partial_copy_generic(src, (void __force *)dst, + len, sum, NULL, err_ptr); + + if (unlikely(*err_ptr)) { + csum = csum_partial(src, len, sum); + + if (copy_to_user(dst, src, len)) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + } + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c new file mode 100644 index 00000000000..d5edbeb8eb8 --- /dev/null +++ b/arch/powerpc/lib/code-patching.c @@ -0,0 +1,470 @@ +/* + * Copyright 2008 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <asm/page.h> +#include <asm/code-patching.h> +#include <asm/uaccess.h> + + +int patch_instruction(unsigned int *addr, unsigned int instr) +{ + int err; + + __put_user_size(instr, addr, 4, err); + if (err) + return err; + asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr)); + return 0; +} + +int patch_branch(unsigned int *addr, unsigned long target, int flags) +{ + return patch_instruction(addr, create_branch(addr, target, flags)); +} + +unsigned int create_branch(const unsigned int *addr, + unsigned long target, int flags) +{ + unsigned int instruction; + long offset; + + offset = target; + if (! (flags & BRANCH_ABSOLUTE)) + offset = offset - (unsigned long)addr; + + /* Check we can represent the target in the instruction format */ + if (offset < -0x2000000 || offset > 0x1fffffc || offset & 0x3) + return 0; + + /* Mask out the flags and target, so they don't step on each other. */ + instruction = 0x48000000 | (flags & 0x3) | (offset & 0x03FFFFFC); + + return instruction; +} + +unsigned int create_cond_branch(const unsigned int *addr, + unsigned long target, int flags) +{ + unsigned int instruction; + long offset; + + offset = target; + if (! (flags & BRANCH_ABSOLUTE)) + offset = offset - (unsigned long)addr; + + /* Check we can represent the target in the instruction format */ + if (offset < -0x8000 || offset > 0x7FFF || offset & 0x3) + return 0; + + /* Mask out the flags and target, so they don't step on each other. */ + instruction = 0x40000000 | (flags & 0x3FF0003) | (offset & 0xFFFC); + + return instruction; +} + +static unsigned int branch_opcode(unsigned int instr) +{ + return (instr >> 26) & 0x3F; +} + +static int instr_is_branch_iform(unsigned int instr) +{ + return branch_opcode(instr) == 18; +} + +static int instr_is_branch_bform(unsigned int instr) +{ + return branch_opcode(instr) == 16; +} + +int instr_is_relative_branch(unsigned int instr) +{ + if (instr & BRANCH_ABSOLUTE) + return 0; + + return instr_is_branch_iform(instr) || instr_is_branch_bform(instr); +} + +static unsigned long branch_iform_target(const unsigned int *instr) +{ + signed long imm; + + imm = *instr & 0x3FFFFFC; + + /* If the top bit of the immediate value is set this is negative */ + if (imm & 0x2000000) + imm -= 0x4000000; + + if ((*instr & BRANCH_ABSOLUTE) == 0) + imm += (unsigned long)instr; + + return (unsigned long)imm; +} + +static unsigned long branch_bform_target(const unsigned int *instr) +{ + signed long imm; + + imm = *instr & 0xFFFC; + + /* If the top bit of the immediate value is set this is negative */ + if (imm & 0x8000) + imm -= 0x10000; + + if ((*instr & BRANCH_ABSOLUTE) == 0) + imm += (unsigned long)instr; + + return (unsigned long)imm; +} + +unsigned long branch_target(const unsigned int *instr) +{ + if (instr_is_branch_iform(*instr)) + return branch_iform_target(instr); + else if (instr_is_branch_bform(*instr)) + return branch_bform_target(instr); + + return 0; +} + +int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr) +{ + if (instr_is_branch_iform(*instr) || instr_is_branch_bform(*instr)) + return branch_target(instr) == addr; + + return 0; +} + +unsigned int translate_branch(const unsigned int *dest, const unsigned int *src) +{ + unsigned long target; + + target = branch_target(src); + + if (instr_is_branch_iform(*src)) + return create_branch(dest, target, *src); + else if (instr_is_branch_bform(*src)) + return create_cond_branch(dest, target, *src); + + return 0; +} + +#ifdef CONFIG_PPC_BOOK3E_64 +void __patch_exception(int exc, unsigned long addr) +{ + extern unsigned int interrupt_base_book3e; + unsigned int *ibase = &interrupt_base_book3e; + + /* Our exceptions vectors start with a NOP and -then- a branch + * to deal with single stepping from userspace which stops on + * the second instruction. Thus we need to patch the second + * instruction of the exception, not the first one + */ + + patch_branch(ibase + (exc / 4) + 1, addr, 0); +} +#endif + +#ifdef CONFIG_CODE_PATCHING_SELFTEST + +static void __init test_trampoline(void) +{ + asm ("nop;\n"); +} + +#define check(x) \ + if (!(x)) printk("code-patching: test failed at line %d\n", __LINE__); + +static void __init test_branch_iform(void) +{ + unsigned int instr; + unsigned long addr; + + addr = (unsigned long)&instr; + + /* The simplest case, branch to self, no flags */ + check(instr_is_branch_iform(0x48000000)); + /* All bits of target set, and flags */ + check(instr_is_branch_iform(0x4bffffff)); + /* High bit of opcode set, which is wrong */ + check(!instr_is_branch_iform(0xcbffffff)); + /* Middle bits of opcode set, which is wrong */ + check(!instr_is_branch_iform(0x7bffffff)); + + /* Simplest case, branch to self with link */ + check(instr_is_branch_iform(0x48000001)); + /* All bits of targets set */ + check(instr_is_branch_iform(0x4bfffffd)); + /* Some bits of targets set */ + check(instr_is_branch_iform(0x4bff00fd)); + /* Must be a valid branch to start with */ + check(!instr_is_branch_iform(0x7bfffffd)); + + /* Absolute branch to 0x100 */ + instr = 0x48000103; + check(instr_is_branch_to_addr(&instr, 0x100)); + /* Absolute branch to 0x420fc */ + instr = 0x480420ff; + check(instr_is_branch_to_addr(&instr, 0x420fc)); + /* Maximum positive relative branch, + 20MB - 4B */ + instr = 0x49fffffc; + check(instr_is_branch_to_addr(&instr, addr + 0x1FFFFFC)); + /* Smallest negative relative branch, - 4B */ + instr = 0x4bfffffc; + check(instr_is_branch_to_addr(&instr, addr - 4)); + /* Largest negative relative branch, - 32 MB */ + instr = 0x4a000000; + check(instr_is_branch_to_addr(&instr, addr - 0x2000000)); + + /* Branch to self, with link */ + instr = create_branch(&instr, addr, BRANCH_SET_LINK); + check(instr_is_branch_to_addr(&instr, addr)); + + /* Branch to self - 0x100, with link */ + instr = create_branch(&instr, addr - 0x100, BRANCH_SET_LINK); + check(instr_is_branch_to_addr(&instr, addr - 0x100)); + + /* Branch to self + 0x100, no link */ + instr = create_branch(&instr, addr + 0x100, 0); + check(instr_is_branch_to_addr(&instr, addr + 0x100)); + + /* Maximum relative negative offset, - 32 MB */ + instr = create_branch(&instr, addr - 0x2000000, BRANCH_SET_LINK); + check(instr_is_branch_to_addr(&instr, addr - 0x2000000)); + + /* Out of range relative negative offset, - 32 MB + 4*/ + instr = create_branch(&instr, addr - 0x2000004, BRANCH_SET_LINK); + check(instr == 0); + + /* Out of range relative positive offset, + 32 MB */ + instr = create_branch(&instr, addr + 0x2000000, BRANCH_SET_LINK); + check(instr == 0); + + /* Unaligned target */ + instr = create_branch(&instr, addr + 3, BRANCH_SET_LINK); + check(instr == 0); + + /* Check flags are masked correctly */ + instr = create_branch(&instr, addr, 0xFFFFFFFC); + check(instr_is_branch_to_addr(&instr, addr)); + check(instr == 0x48000000); +} + +static void __init test_create_function_call(void) +{ + unsigned int *iptr; + unsigned long dest; + + /* Check we can create a function call */ + iptr = (unsigned int *)ppc_function_entry(test_trampoline); + dest = ppc_function_entry(test_create_function_call); + patch_instruction(iptr, create_branch(iptr, dest, BRANCH_SET_LINK)); + check(instr_is_branch_to_addr(iptr, dest)); +} + +static void __init test_branch_bform(void) +{ + unsigned long addr; + unsigned int *iptr, instr, flags; + + iptr = &instr; + addr = (unsigned long)iptr; + + /* The simplest case, branch to self, no flags */ + check(instr_is_branch_bform(0x40000000)); + /* All bits of target set, and flags */ + check(instr_is_branch_bform(0x43ffffff)); + /* High bit of opcode set, which is wrong */ + check(!instr_is_branch_bform(0xc3ffffff)); + /* Middle bits of opcode set, which is wrong */ + check(!instr_is_branch_bform(0x7bffffff)); + + /* Absolute conditional branch to 0x100 */ + instr = 0x43ff0103; + check(instr_is_branch_to_addr(&instr, 0x100)); + /* Absolute conditional branch to 0x20fc */ + instr = 0x43ff20ff; + check(instr_is_branch_to_addr(&instr, 0x20fc)); + /* Maximum positive relative conditional branch, + 32 KB - 4B */ + instr = 0x43ff7ffc; + check(instr_is_branch_to_addr(&instr, addr + 0x7FFC)); + /* Smallest negative relative conditional branch, - 4B */ + instr = 0x43fffffc; + check(instr_is_branch_to_addr(&instr, addr - 4)); + /* Largest negative relative conditional branch, - 32 KB */ + instr = 0x43ff8000; + check(instr_is_branch_to_addr(&instr, addr - 0x8000)); + + /* All condition code bits set & link */ + flags = 0x3ff000 | BRANCH_SET_LINK; + + /* Branch to self */ + instr = create_cond_branch(iptr, addr, flags); + check(instr_is_branch_to_addr(&instr, addr)); + + /* Branch to self - 0x100 */ + instr = create_cond_branch(iptr, addr - 0x100, flags); + check(instr_is_branch_to_addr(&instr, addr - 0x100)); + + /* Branch to self + 0x100 */ + instr = create_cond_branch(iptr, addr + 0x100, flags); + check(instr_is_branch_to_addr(&instr, addr + 0x100)); + + /* Maximum relative negative offset, - 32 KB */ + instr = create_cond_branch(iptr, addr - 0x8000, flags); + check(instr_is_branch_to_addr(&instr, addr - 0x8000)); + + /* Out of range relative negative offset, - 32 KB + 4*/ + instr = create_cond_branch(iptr, addr - 0x8004, flags); + check(instr == 0); + + /* Out of range relative positive offset, + 32 KB */ + instr = create_cond_branch(iptr, addr + 0x8000, flags); + check(instr == 0); + + /* Unaligned target */ + instr = create_cond_branch(iptr, addr + 3, flags); + check(instr == 0); + + /* Check flags are masked correctly */ + instr = create_cond_branch(iptr, addr, 0xFFFFFFFC); + check(instr_is_branch_to_addr(&instr, addr)); + check(instr == 0x43FF0000); +} + +static void __init test_translate_branch(void) +{ + unsigned long addr; + unsigned int *p, *q; + void *buf; + + buf = vmalloc(PAGE_ALIGN(0x2000000 + 1)); + check(buf); + if (!buf) + return; + + /* Simple case, branch to self moved a little */ + p = buf; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + check(instr_is_branch_to_addr(p, addr)); + q = p + 1; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(q, addr)); + + /* Maximum negative case, move b . to addr + 32 MB */ + p = buf; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + q = buf + 0x2000000; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(*q == 0x4a000000); + + /* Maximum positive case, move x to x - 32 MB + 4 */ + p = buf + 0x2000000; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + q = buf + 4; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(*q == 0x49fffffc); + + /* Jump to x + 16 MB moved to x + 20 MB */ + p = buf; + addr = 0x1000000 + (unsigned long)buf; + patch_branch(p, addr, BRANCH_SET_LINK); + q = buf + 0x1400000; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Jump to x + 16 MB moved to x - 16 MB + 4 */ + p = buf + 0x1000000; + addr = 0x2000000 + (unsigned long)buf; + patch_branch(p, addr, 0); + q = buf + 4; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + + /* Conditional branch tests */ + + /* Simple case, branch to self moved a little */ + p = buf; + addr = (unsigned long)p; + patch_instruction(p, create_cond_branch(p, addr, 0)); + check(instr_is_branch_to_addr(p, addr)); + q = p + 1; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(q, addr)); + + /* Maximum negative case, move b . to addr + 32 KB */ + p = buf; + addr = (unsigned long)p; + patch_instruction(p, create_cond_branch(p, addr, 0xFFFFFFFC)); + q = buf + 0x8000; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(*q == 0x43ff8000); + + /* Maximum positive case, move x to x - 32 KB + 4 */ + p = buf + 0x8000; + addr = (unsigned long)p; + patch_instruction(p, create_cond_branch(p, addr, 0xFFFFFFFC)); + q = buf + 4; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(*q == 0x43ff7ffc); + + /* Jump to x + 12 KB moved to x + 20 KB */ + p = buf; + addr = 0x3000 + (unsigned long)buf; + patch_instruction(p, create_cond_branch(p, addr, BRANCH_SET_LINK)); + q = buf + 0x5000; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Jump to x + 8 KB moved to x - 8 KB + 4 */ + p = buf + 0x2000; + addr = 0x4000 + (unsigned long)buf; + patch_instruction(p, create_cond_branch(p, addr, 0)); + q = buf + 4; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Free the buffer we were using */ + vfree(buf); +} + +static int __init test_code_patching(void) +{ + printk(KERN_DEBUG "Running code patching self-tests ...\n"); + + test_branch_iform(); + test_branch_bform(); + test_create_function_call(); + test_translate_branch(); + + return 0; +} +late_initcall(test_code_patching); + +#endif /* CONFIG_CODE_PATCHING_SELFTEST */ diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index c657de59abc..55f19f9fd70 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -62,7 +62,7 @@ .text .stabs "arch/powerpc/lib/",N_SO,0,0,0f - .stabs "copy32.S",N_SO,0,0,0f + .stabs "copy_32.S",N_SO,0,0,0f 0: CACHELINE_BYTES = L1_CACHE_BYTES @@ -98,20 +98,7 @@ _GLOBAL(cacheable_memzero) bdnz 4b 3: mtctr r9 li r7,4 -#if !defined(CONFIG_8xx) 10: dcbz r7,r6 -#else -10: stw r4, 4(r6) - stw r4, 8(r6) - stw r4, 12(r6) - stw r4, 16(r6) -#if CACHE_LINE_SIZE >= 32 - stw r4, 20(r6) - stw r4, 24(r6) - stw r4, 28(r6) - stw r4, 32(r6) -#endif /* CACHE_LINE_SIZE */ -#endif addi r6,r6,CACHELINE_BYTES bdnz 10b clrlwi r5,r8,32-LG_CACHELINE_BYTES @@ -200,9 +187,7 @@ _GLOBAL(cacheable_memcpy) mtctr r0 beq 63f 53: -#if !defined(CONFIG_8xx) dcbz r11,r6 -#endif COPY_16_BYTES #if L1_CACHE_BYTES >= 32 COPY_16_BYTES @@ -356,14 +341,6 @@ _GLOBAL(__copy_tofrom_user) li r11,4 beq 63f -#ifdef CONFIG_8xx - /* Don't use prefetch on 8xx */ - mtctr r0 - li r0,0 -53: COPY_16_BYTES_WITHEX(0) - bdnz 53b - -#else /* not CONFIG_8xx */ /* Here we decide how far ahead to prefetch the source */ li r3,4 cmpwi r0,1 @@ -416,7 +393,6 @@ _GLOBAL(__copy_tofrom_user) li r3,4 li r7,0 bne 114b -#endif /* CONFIG_8xx */ 63: srwi. r0,r5,2 mtctr r0 diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S index f9837f44ac0..a3c4dc4defd 100644 --- a/arch/powerpc/lib/copypage_64.S +++ b/arch/powerpc/lib/copypage_64.S @@ -1,119 +1,112 @@ /* - * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * Copyright (C) 2008 Mark Nelson, IBM Corp. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include <asm/page.h> #include <asm/processor.h> #include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> -_GLOBAL(copy_4K_page) - std r31,-8(1) - std r30,-16(1) - std r29,-24(1) - std r28,-32(1) - std r27,-40(1) - std r26,-48(1) - std r25,-56(1) - std r24,-64(1) - std r23,-72(1) - std r22,-80(1) - std r21,-88(1) - std r20,-96(1) - li r5,4096/32 - 1 + .section ".toc","aw" +PPC64_CACHES: + .tc ppc64_caches[TC],ppc64_caches + .section ".text" + +_GLOBAL_TOC(copy_page) +BEGIN_FTR_SECTION + lis r5,PAGE_SIZE@h +FTR_SECTION_ELSE + b copypage_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) + ori r5,r5,PAGE_SIZE@l +BEGIN_FTR_SECTION + ld r10,PPC64_CACHES@toc(r2) + lwz r11,DCACHEL1LOGLINESIZE(r10) /* log2 of cache line size */ + lwz r12,DCACHEL1LINESIZE(r10) /* get cache line size */ + li r9,0 + srd r8,r5,r11 + + mtctr r8 +.Lsetup: + dcbt r9,r4 + dcbz r9,r3 + add r9,r9,r12 + bdnz .Lsetup +END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ) addi r3,r3,-8 - li r12,5 -0: addi r5,r5,-24 - mtctr r12 - ld r22,640(4) - ld r21,512(4) - ld r20,384(4) - ld r11,256(4) - ld r9,128(4) - ld r7,0(4) - ld r25,648(4) - ld r24,520(4) - ld r23,392(4) - ld r10,264(4) - ld r8,136(4) - ldu r6,8(4) - cmpwi r5,24 -1: std r22,648(3) - std r21,520(3) - std r20,392(3) - std r11,264(3) - std r9,136(3) - std r7,8(3) - ld r28,648(4) - ld r27,520(4) - ld r26,392(4) - ld r31,264(4) - ld r30,136(4) - ld r29,8(4) - std r25,656(3) - std r24,528(3) - std r23,400(3) - std r10,272(3) - std r8,144(3) - std r6,16(3) - ld r22,656(4) - ld r21,528(4) - ld r20,400(4) - ld r11,272(4) - ld r9,144(4) - ld r7,16(4) - std r28,664(3) - std r27,536(3) - std r26,408(3) - std r31,280(3) - std r30,152(3) - stdu r29,24(3) - ld r25,664(4) - ld r24,536(4) - ld r23,408(4) - ld r10,280(4) - ld r8,152(4) - ldu r6,24(4) + srdi r8,r5,7 /* page is copied in 128 byte strides */ + addi r8,r8,-1 /* one stride copied outside loop */ + + mtctr r8 + + ld r5,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ldu r8,24(r4) +1: std r5,8(r3) + std r6,16(r3) + ld r9,8(r4) + ld r10,16(r4) + std r7,24(r3) + std r8,32(r3) + ld r11,24(r4) + ld r12,32(r4) + std r9,40(r3) + std r10,48(r3) + ld r5,40(r4) + ld r6,48(r4) + std r11,56(r3) + std r12,64(r3) + ld r7,56(r4) + ld r8,64(r4) + std r5,72(r3) + std r6,80(r3) + ld r9,72(r4) + ld r10,80(r4) + std r7,88(r3) + std r8,96(r3) + ld r11,88(r4) + ld r12,96(r4) + std r9,104(r3) + std r10,112(r3) + ld r5,104(r4) + ld r6,112(r4) + std r11,120(r3) + stdu r12,128(r3) + ld r7,120(r4) + ldu r8,128(r4) bdnz 1b - std r22,648(3) - std r21,520(3) - std r20,392(3) - std r11,264(3) - std r9,136(3) - std r7,8(3) - addi r4,r4,640 - addi r3,r3,648 - bge 0b - mtctr r5 - ld r7,0(4) - ld r8,8(4) - ldu r9,16(4) -3: ld r10,8(4) - std r7,8(3) - ld r7,16(4) - std r8,16(3) - ld r8,24(4) - std r9,24(3) - ldu r9,32(4) - stdu r10,32(3) - bdnz 3b -4: ld r10,8(4) - std r7,8(3) - std r8,16(3) - std r9,24(3) - std r10,32(3) -9: ld r20,-96(1) - ld r21,-88(1) - ld r22,-80(1) - ld r23,-72(1) - ld r24,-64(1) - ld r25,-56(1) - ld r26,-48(1) - ld r27,-40(1) - ld r28,-32(1) - ld r29,-24(1) - ld r30,-16(1) - ld r31,-8(1) + + std r5,8(r3) + std r6,16(r3) + ld r9,8(r4) + ld r10,16(r4) + std r7,24(r3) + std r8,32(r3) + ld r11,24(r4) + ld r12,32(r4) + std r9,40(r3) + std r10,48(r3) + ld r5,40(r4) + ld r6,48(r4) + std r11,56(r3) + std r12,64(r3) + ld r7,56(r4) + ld r8,64(r4) + std r5,72(r3) + std r6,80(r3) + ld r9,72(r4) + ld r10,80(r4) + std r7,88(r3) + std r8,96(r3) + ld r11,88(r4) + ld r12,96(r4) + std r9,104(r3) + std r10,112(r3) + std r11,120(r3) + std r12,128(r3) blr diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S new file mode 100644 index 00000000000..d7dafb3777a --- /dev/null +++ b/arch/powerpc/lib/copypage_power7.S @@ -0,0 +1,168 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/page.h> +#include <asm/ppc_asm.h> + +_GLOBAL(copypage_power7) + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. Since source and destination are page + * aligned we don't need to clear the bottom 7 bits of either + * address. + */ + ori r9,r3,1 /* stream=1 => to */ + +#ifdef CONFIG_PPC_64K_PAGES + lis r7,0x0E01 /* depth=7 + * units/cachelines=512 */ +#else + lis r7,0x0E00 /* depth=7 */ + ori r7,r7,0x1000 /* units/cachelines=32 */ +#endif + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + /* setup read stream 0 */ + dcbt r0,r4,0b01000 /* addr from */ + dcbt r0,r7,0b01010 /* length and depth from */ + /* setup write stream 1 */ + dcbtst r0,r9,0b01000 /* addr to */ + dcbtst r0,r10,0b01010 /* length and depth to */ + eieio + dcbt r0,r8,0b01010 /* all streams GO */ +.machine pop + +#ifdef CONFIG_ALTIVEC + mflr r0 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_copy + cmpwi r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + mtlr r0 + + li r0,(PAGE_SIZE/128) + mtctr r0 + + beq .Lnonvmx_copy + + addi r1,r1,STACKFRAMESIZE + + li r6,16 + li r7,32 + li r8,48 + li r9,64 + li r10,80 + li r11,96 + li r12,112 + + .align 5 +1: lvx vr7,r0,r4 + lvx vr6,r4,r6 + lvx vr5,r4,r7 + lvx vr4,r4,r8 + lvx vr3,r4,r9 + lvx vr2,r4,r10 + lvx vr1,r4,r11 + lvx vr0,r4,r12 + addi r4,r4,128 + stvx vr7,r0,r3 + stvx vr6,r3,r6 + stvx vr5,r3,r7 + stvx vr4,r3,r8 + stvx vr3,r3,r9 + stvx vr2,r3,r10 + stvx vr1,r3,r11 + stvx vr0,r3,r12 + addi r3,r3,128 + bdnz 1b + + b exit_vmx_copy /* tail call optimise */ + +#else + li r0,(PAGE_SIZE/128) + mtctr r0 + + stdu r1,-STACKFRAMESIZE(r1) +#endif + +.Lnonvmx_copy: + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + +1: ld r0,0(r4) + ld r5,8(r4) + ld r6,16(r4) + ld r7,24(r4) + ld r8,32(r4) + ld r9,40(r4) + ld r10,48(r4) + ld r11,56(r4) + ld r12,64(r4) + ld r14,72(r4) + ld r15,80(r4) + ld r16,88(r4) + ld r17,96(r4) + ld r18,104(r4) + ld r19,112(r4) + ld r20,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r5,8(r3) + std r6,16(r3) + std r7,24(r3) + std r8,32(r3) + std r9,40(r3) + std r10,48(r3) + std r11,56(r3) + std r12,64(r3) + std r14,72(r3) + std r15,80(r3) + std r16,88(r3) + std r17,96(r3) + std r18,104(r3) + std r19,112(r3) + std r20,120(r3) + addi r3,r3,128 + bdnz 1b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + addi r1,r1,STACKFRAMESIZE + blr diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index 25ec5378afa..0860ee46013 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -9,8 +9,22 @@ #include <asm/processor.h> #include <asm/ppc_asm.h> +#ifdef __BIG_ENDIAN__ +#define sLd sld /* Shift towards low-numbered address. */ +#define sHd srd /* Shift towards high-numbered address. */ +#else +#define sLd srd /* Shift towards low-numbered address. */ +#define sHd sld /* Shift towards high-numbered address. */ +#endif + .align 7 -_GLOBAL(__copy_tofrom_user) +_GLOBAL_TOC(__copy_tofrom_user) +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + b __copy_tofrom_user_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +_GLOBAL(__copy_tofrom_user_base) /* first check for a whole page copy on a page boundary */ cmpldi cr1,r5,16 cmpdi cr6,r5,4096 @@ -24,43 +38,75 @@ _GLOBAL(__copy_tofrom_user) dcbt 0,r4 beq .Lcopy_page_4K andi. r6,r6,7 - PPC_MTOCRF 0x01,r5 + PPC_MTOCRF(0x01,r5) blt cr1,.Lshort_copy +/* Below we want to nop out the bne if we're on a CPU that has the + * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit + * cleared. + * At the time of writing the only CPU that has this combination of bits + * set is Power6. + */ +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE bne .Ldst_unaligned +ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ + CPU_FTR_UNALIGNED_LD_STD) .Ldst_aligned: - andi. r0,r4,7 addi r3,r3,-16 +BEGIN_FTR_SECTION + andi. r0,r4,7 bne .Lsrc_unaligned - srdi r7,r5,4 -20: ld r9,0(r4) - addi r4,r4,-8 - mtctr r7 - andi. r5,r5,7 - bf cr7*4+0,22f - addi r3,r3,8 - addi r4,r4,8 - mr r8,r9 - blt cr1,72f -21: ld r9,8(r4) -70: std r8,8(r3) -22: ldu r8,16(r4) -71: stdu r9,16(r3) +END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) + blt cr1,.Ldo_tail /* if < 16 bytes to copy */ + srdi r0,r5,5 + cmpdi cr1,r0,0 +20: ld r7,0(r4) +220: ld r6,8(r4) + addi r4,r4,16 + mtctr r0 + andi. r0,r5,0x10 + beq 22f + addi r3,r3,16 + addi r4,r4,-16 + mr r9,r7 + mr r8,r6 + beq cr1,72f +21: ld r7,16(r4) +221: ld r6,24(r4) + addi r4,r4,32 +70: std r9,0(r3) +270: std r8,8(r3) +22: ld r9,0(r4) +222: ld r8,8(r4) +71: std r7,16(r3) +271: std r6,24(r3) + addi r3,r3,32 bdnz 21b -72: std r8,8(r3) +72: std r9,0(r3) +272: std r8,8(r3) + andi. r5,r5,0xf beq+ 3f - addi r3,r3,16 -23: ld r9,8(r4) + addi r4,r4,16 .Ldo_tail: - bf cr7*4+1,1f - rotldi r9,r9,32 + addi r3,r3,16 + bf cr7*4+0,246f +244: ld r9,0(r4) + addi r4,r4,8 +245: std r9,0(r3) + addi r3,r3,8 +246: bf cr7*4+1,1f +23: lwz r9,0(r4) + addi r4,r4,4 73: stw r9,0(r3) addi r3,r3,4 1: bf cr7*4+2,2f - rotldi r9,r9,16 +44: lhz r9,0(r4) + addi r4,r4,2 74: sth r9,0(r3) addi r3,r3,2 2: bf cr7*4+3,3f - rotldi r9,r9,8 +45: lbz r9,0(r4) 75: stb r9,0(r3) 3: li r3,0 blr @@ -80,10 +126,10 @@ _GLOBAL(__copy_tofrom_user) 24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */ 25: ld r0,8(r4) - sld r6,r9,r10 + sLd r6,r9,r10 26: ldu r9,16(r4) - srd r7,r0,r11 - sld r8,r0,r10 + sHd r7,r0,r11 + sLd r8,r0,r10 or r7,r7,r6 blt cr6,79f 27: ld r0,8(r4) @@ -91,35 +137,35 @@ _GLOBAL(__copy_tofrom_user) 28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */ 29: ldu r9,8(r4) - sld r8,r0,r10 + sLd r8,r0,r10 addi r3,r3,-8 blt cr6,5f 30: ld r0,8(r4) - srd r12,r9,r11 - sld r6,r9,r10 + sHd r12,r9,r11 + sLd r6,r9,r10 31: ldu r9,16(r4) or r12,r8,r12 - srd r7,r0,r11 - sld r8,r0,r10 + sHd r7,r0,r11 + sLd r8,r0,r10 addi r3,r3,16 beq cr6,78f 1: or r7,r7,r6 32: ld r0,8(r4) 76: std r12,8(r3) -2: srd r12,r9,r11 - sld r6,r9,r10 +2: sHd r12,r9,r11 + sLd r6,r9,r10 33: ldu r9,16(r4) or r12,r8,r12 77: stdu r7,16(r3) - srd r7,r0,r11 - sld r8,r0,r10 + sHd r7,r0,r11 + sLd r8,r0,r10 bdnz 1b 78: std r12,8(r3) or r7,r7,r6 79: std r7,16(r3) -5: srd r12,r9,r11 +5: sHd r12,r9,r11 or r12,r8,r12 80: std r12,24(r3) bne 6f @@ -127,18 +173,46 @@ _GLOBAL(__copy_tofrom_user) blr 6: cmpwi cr1,r5,8 addi r3,r3,32 - sld r9,r9,r10 - ble cr1,.Ldo_tail + sLd r9,r9,r10 + ble cr1,7f 34: ld r0,8(r4) - srd r7,r0,r11 + sHd r7,r0,r11 or r9,r7,r9 - b .Ldo_tail +7: + bf cr7*4+1,1f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,32 +#endif +94: stw r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,32 +#endif + addi r3,r3,4 +1: bf cr7*4+2,2f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,16 +#endif +95: sth r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,16 +#endif + addi r3,r3,2 +2: bf cr7*4+3,3f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,8 +#endif +96: stb r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,8 +#endif +3: li r3,0 + blr .Ldst_unaligned: - PPC_MTOCRF 0x01,r6 /* put #bytes to 8B bdry into cr7 */ + PPC_MTOCRF(0x01,r6) /* put #bytes to 8B bdry into cr7 */ subf r5,r6,r5 li r7,0 - cmpldi r1,r5,16 + cmpldi cr1,r5,16 bf cr7*4+3,1f 35: lbz r0,0(r4) 81: stb r0,0(r3) @@ -150,7 +224,7 @@ _GLOBAL(__copy_tofrom_user) 2: bf cr7*4+1,3f 37: lwzx r0,r7,r4 83: stwx r0,r7,r3 -3: PPC_MTOCRF 0x01,r5 +3: PPC_MTOCRF(0x01,r5) add r4,r6,r4 add r3,r6,r3 b .Ldst_aligned @@ -193,7 +267,9 @@ _GLOBAL(__copy_tofrom_user) 131: addi r3,r3,8 120: +320: 122: +322: 124: 125: 126: @@ -202,10 +278,11 @@ _GLOBAL(__copy_tofrom_user) 129: 133: addi r3,r3,8 -121: 132: addi r3,r3,8 -123: +121: +321: +344: 134: 135: 138: @@ -213,6 +290,9 @@ _GLOBAL(__copy_tofrom_user) 140: 141: 142: +123: +144: +145: /* * here we have had a fault on a load and r3 points to the first @@ -274,18 +354,22 @@ _GLOBAL(__copy_tofrom_user) 183: add r3,r3,r7 b 1f +371: 180: addi r3,r3,8 171: 177: addi r3,r3,8 -170: -172: +370: +372: 176: 178: addi r3,r3,4 185: addi r3,r3,4 +170: +172: +345: 173: 174: 175: @@ -296,6 +380,9 @@ _GLOBAL(__copy_tofrom_user) 187: 188: 189: +194: +195: +196: 1: ld r6,-24(r1) ld r5,-8(r1) @@ -309,14 +396,24 @@ _GLOBAL(__copy_tofrom_user) .section __ex_table,"a" .align 3 .llong 20b,120b + .llong 220b,320b .llong 21b,121b + .llong 221b,321b .llong 70b,170b + .llong 270b,370b .llong 22b,122b + .llong 222b,322b .llong 71b,171b + .llong 271b,371b .llong 72b,172b + .llong 272b,372b + .llong 244b,344b + .llong 245b,345b .llong 23b,123b .llong 73b,173b + .llong 44b,144b .llong 74b,174b + .llong 45b,145b .llong 75b,175b .llong 24b,124b .llong 25b,125b @@ -334,6 +431,9 @@ _GLOBAL(__copy_tofrom_user) .llong 79b,179b .llong 80b,180b .llong 34b,134b + .llong 94b,194b + .llong 95b,195b + .llong 96b,196b .llong 35b,135b .llong 81b,181b .llong 36b,136b diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S new file mode 100644 index 00000000000..c46c876ac96 --- /dev/null +++ b/arch/powerpc/lib/copyuser_power7.S @@ -0,0 +1,721 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB) lvsl VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB) lvsr VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC +#endif + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + +#ifdef CONFIG_ALTIVEC + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + + .macro err4 +400: + .section __ex_table,"a" + .align 3 + .llong 400b,.Ldo_err4 + .previous + .endm + + +.Ldo_err4: + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) +.Ldo_err3: + bl exit_vmx_usercopy + ld r0,STACKFRAMESIZE+16(r1) + mtlr r0 + b .Lexit +#endif /* CONFIG_ALTIVEC */ + +.Ldo_err2: + ld r22,STK_REG(R22)(r1) + ld r21,STK_REG(R21)(r1) + ld r20,STK_REG(R20)(r1) + ld r19,STK_REG(R19)(r1) + ld r18,STK_REG(R18)(r1) + ld r17,STK_REG(R17)(r1) + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) +.Lexit: + addi r1,r1,STACKFRAMESIZE +.Ldo_err1: + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + ld r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + ld r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + b __copy_tofrom_user_base + + +_GLOBAL(__copy_tofrom_user_power7) +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addi r4,r4,1 +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r7,16(r4) +err2; ld r8,24(r4) +err2; ld r9,32(r4) +err2; ld r10,40(r4) +err2; ld r11,48(r4) +err2; ld r12,56(r4) +err2; ld r14,64(r4) +err2; ld r15,72(r4) +err2; ld r16,80(r4) +err2; ld r17,88(r4) +err2; ld r18,96(r4) +err2; ld r19,104(r4) +err2; ld r20,112(r4) +err2; ld r21,120(r4) + addi r4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r7,16(r3) +err2; std r8,24(r3) +err2; std r9,32(r3) +err2; std r10,40(r3) +err2; std r11,48(r3) +err2; std r12,56(r3) +err2; std r14,64(r3) +err2; std r15,72(r3) +err2; std r16,80(r3) +err2; std r17,88(r3) +err2; std r18,96(r3) +err2; std r19,104(r3) +err2; std r20,112(r3) +err2; std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) +err1; ld r9,32(r4) +err1; ld r10,40(r4) +err1; ld r11,48(r4) +err1; ld r12,56(r4) + addi r4,r4,64 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) +err1; std r9,32(r3) +err1; std r10,40(r3) +err1; std r11,48(r3) +err1; std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) + addi r4,r4,32 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f +err1; ld r0,0(r4) +err1; ld r6,8(r4) + addi r4,r4,16 +err1; std r0,0(r3) +err1; std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err1; lwz r6,4(r4) + addi r4,r4,8 +err1; stw r0,0(r3) +err1; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err1; lbz r0,0(r4) +err1; stb r0,0(r3) + +15: li r3,0 + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_usercopy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + /* setup read stream 0 */ + dcbt r0,r6,0b01000 /* addr from */ + dcbt r0,r7,0b01010 /* length and depth from */ + /* setup write stream 1 */ + dcbtst r0,r9,0b01000 /* addr to */ + dcbtst r0,r10,0b01010 /* length and depth to */ + eieio + dcbt r0,r8,0b01010 /* all streams GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f +err3; lvx vr1,r0,r4 + addi r4,r4,16 +err3; stvx vr1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f +err3; lvx vr1,r0,r4 +err3; lvx vr0,r4,r9 + addi r4,r4,32 +err3; stvx vr1,r0,r3 +err3; stvx vr0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx vr3,r0,r4 +err3; lvx vr2,r4,r9 +err3; lvx vr1,r4,r10 +err3; lvx vr0,r4,r11 + addi r4,r4,64 +err3; stvx vr3,r0,r3 +err3; stvx vr2,r3,r9 +err3; stvx vr1,r3,r10 +err3; stvx vr0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx vr7,r0,r4 +err4; lvx vr6,r4,r9 +err4; lvx vr5,r4,r10 +err4; lvx vr4,r4,r11 +err4; lvx vr3,r4,r12 +err4; lvx vr2,r4,r14 +err4; lvx vr1,r4,r15 +err4; lvx vr0,r4,r16 + addi r4,r4,128 +err4; stvx vr7,r0,r3 +err4; stvx vr6,r3,r9 +err4; stvx vr5,r3,r10 +err4; stvx vr4,r3,r11 +err4; stvx vr3,r3,r12 +err4; stvx vr2,r3,r14 +err4; stvx vr1,r3,r15 +err4; stvx vr0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx vr3,r0,r4 +err3; lvx vr2,r4,r9 +err3; lvx vr1,r4,r10 +err3; lvx vr0,r4,r11 + addi r4,r4,64 +err3; stvx vr3,r0,r3 +err3; stvx vr2,r3,r9 +err3; stvx vr1,r3,r10 +err3; stvx vr0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx vr1,r0,r4 +err3; lvx vr0,r4,r9 + addi r4,r4,32 +err3; stvx vr1,r0,r3 +err3; stvx vr0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx vr1,r0,r4 + addi r4,r4,16 +err3; stvx vr1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b exit_vmx_usercopy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r7,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + LVS(vr16,0,r4) /* Setup permute control vector */ +err3; lvx vr0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f +err3; lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + addi r4,r4,16 +err3; stvx vr8,r0,r3 + addi r3,r3,16 + vor vr0,vr1,vr1 + +5: bf cr7*4+2,6f +err3; lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) +err3; lvx vr0,r4,r9 + VPERM(vr9,vr1,vr0,vr16) + addi r4,r4,32 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx vr3,r0,r4 + VPERM(vr8,vr0,vr3,vr16) +err3; lvx vr2,r4,r9 + VPERM(vr9,vr3,vr2,vr16) +err3; lvx vr1,r4,r10 + VPERM(vr10,vr2,vr1,vr16) +err3; lvx vr0,r4,r11 + VPERM(vr11,vr1,vr0,vr16) + addi r4,r4,64 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 +err3; stvx vr10,r3,r10 +err3; stvx vr11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx vr7,r0,r4 + VPERM(vr8,vr0,vr7,vr16) +err4; lvx vr6,r4,r9 + VPERM(vr9,vr7,vr6,vr16) +err4; lvx vr5,r4,r10 + VPERM(vr10,vr6,vr5,vr16) +err4; lvx vr4,r4,r11 + VPERM(vr11,vr5,vr4,vr16) +err4; lvx vr3,r4,r12 + VPERM(vr12,vr4,vr3,vr16) +err4; lvx vr2,r4,r14 + VPERM(vr13,vr3,vr2,vr16) +err4; lvx vr1,r4,r15 + VPERM(vr14,vr2,vr1,vr16) +err4; lvx vr0,r4,r16 + VPERM(vr15,vr1,vr0,vr16) + addi r4,r4,128 +err4; stvx vr8,r0,r3 +err4; stvx vr9,r3,r9 +err4; stvx vr10,r3,r10 +err4; stvx vr11,r3,r11 +err4; stvx vr12,r3,r12 +err4; stvx vr13,r3,r14 +err4; stvx vr14,r3,r15 +err4; stvx vr15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx vr3,r0,r4 + VPERM(vr8,vr0,vr3,vr16) +err3; lvx vr2,r4,r9 + VPERM(vr9,vr3,vr2,vr16) +err3; lvx vr1,r4,r10 + VPERM(vr10,vr2,vr1,vr16) +err3; lvx vr0,r4,r11 + VPERM(vr11,vr1,vr0,vr16) + addi r4,r4,64 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 +err3; stvx vr10,r3,r10 +err3; stvx vr11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) +err3; lvx vr0,r4,r9 + VPERM(vr9,vr1,vr0,vr16) + addi r4,r4,32 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + addi r4,r4,16 +err3; stvx vr8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r6,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b exit_vmx_usercopy /* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S new file mode 100644 index 00000000000..a5b30c71a8d --- /dev/null +++ b/arch/powerpc/lib/crtsavres.S @@ -0,0 +1,547 @@ +/* + * Special support for eabi and SVR4 + * + * Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc. + * Copyright 2008 Freescale Semiconductor, Inc. + * Written By Michael Meissner + * + * Based on gcc/config/rs6000/crtsavres.asm from gcc + * 64 bit additions from reading the PPC elf64abi document. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * In addition to the permissions in the GNU General Public License, the + * Free Software Foundation gives you unlimited permission to link the + * compiled version of this file with other programs, and to distribute + * those programs without any restriction coming from the use of this + * file. (The General Public License restrictions do apply in other + * respects; for example, they cover modification of the file, and + * distribution when not linked into another program.) + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * As a special exception, if you link this library with files + * compiled with GCC to produce an executable, this does not cause + * the resulting executable to be covered by the GNU General Public License. + * This exception does not however invalidate any other reasons why + * the executable file might be covered by the GNU General Public License. + */ + +#include <asm/ppc_asm.h> + + .file "crtsavres.S" + +#ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + +#ifndef CONFIG_PPC64 + + .section ".text" + +/* Routines for saving integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer save area. */ + +_GLOBAL(_savegpr_14) +_GLOBAL(_save32gpr_14) + stw 14,-72(11) /* save gp registers */ +_GLOBAL(_savegpr_15) +_GLOBAL(_save32gpr_15) + stw 15,-68(11) +_GLOBAL(_savegpr_16) +_GLOBAL(_save32gpr_16) + stw 16,-64(11) +_GLOBAL(_savegpr_17) +_GLOBAL(_save32gpr_17) + stw 17,-60(11) +_GLOBAL(_savegpr_18) +_GLOBAL(_save32gpr_18) + stw 18,-56(11) +_GLOBAL(_savegpr_19) +_GLOBAL(_save32gpr_19) + stw 19,-52(11) +_GLOBAL(_savegpr_20) +_GLOBAL(_save32gpr_20) + stw 20,-48(11) +_GLOBAL(_savegpr_21) +_GLOBAL(_save32gpr_21) + stw 21,-44(11) +_GLOBAL(_savegpr_22) +_GLOBAL(_save32gpr_22) + stw 22,-40(11) +_GLOBAL(_savegpr_23) +_GLOBAL(_save32gpr_23) + stw 23,-36(11) +_GLOBAL(_savegpr_24) +_GLOBAL(_save32gpr_24) + stw 24,-32(11) +_GLOBAL(_savegpr_25) +_GLOBAL(_save32gpr_25) + stw 25,-28(11) +_GLOBAL(_savegpr_26) +_GLOBAL(_save32gpr_26) + stw 26,-24(11) +_GLOBAL(_savegpr_27) +_GLOBAL(_save32gpr_27) + stw 27,-20(11) +_GLOBAL(_savegpr_28) +_GLOBAL(_save32gpr_28) + stw 28,-16(11) +_GLOBAL(_savegpr_29) +_GLOBAL(_save32gpr_29) + stw 29,-12(11) +_GLOBAL(_savegpr_30) +_GLOBAL(_save32gpr_30) + stw 30,-8(11) +_GLOBAL(_savegpr_31) +_GLOBAL(_save32gpr_31) + stw 31,-4(11) + blr + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ + +_GLOBAL(_restgpr_14) +_GLOBAL(_rest32gpr_14) + lwz 14,-72(11) /* restore gp registers */ +_GLOBAL(_restgpr_15) +_GLOBAL(_rest32gpr_15) + lwz 15,-68(11) +_GLOBAL(_restgpr_16) +_GLOBAL(_rest32gpr_16) + lwz 16,-64(11) +_GLOBAL(_restgpr_17) +_GLOBAL(_rest32gpr_17) + lwz 17,-60(11) +_GLOBAL(_restgpr_18) +_GLOBAL(_rest32gpr_18) + lwz 18,-56(11) +_GLOBAL(_restgpr_19) +_GLOBAL(_rest32gpr_19) + lwz 19,-52(11) +_GLOBAL(_restgpr_20) +_GLOBAL(_rest32gpr_20) + lwz 20,-48(11) +_GLOBAL(_restgpr_21) +_GLOBAL(_rest32gpr_21) + lwz 21,-44(11) +_GLOBAL(_restgpr_22) +_GLOBAL(_rest32gpr_22) + lwz 22,-40(11) +_GLOBAL(_restgpr_23) +_GLOBAL(_rest32gpr_23) + lwz 23,-36(11) +_GLOBAL(_restgpr_24) +_GLOBAL(_rest32gpr_24) + lwz 24,-32(11) +_GLOBAL(_restgpr_25) +_GLOBAL(_rest32gpr_25) + lwz 25,-28(11) +_GLOBAL(_restgpr_26) +_GLOBAL(_rest32gpr_26) + lwz 26,-24(11) +_GLOBAL(_restgpr_27) +_GLOBAL(_rest32gpr_27) + lwz 27,-20(11) +_GLOBAL(_restgpr_28) +_GLOBAL(_rest32gpr_28) + lwz 28,-16(11) +_GLOBAL(_restgpr_29) +_GLOBAL(_rest32gpr_29) + lwz 29,-12(11) +_GLOBAL(_restgpr_30) +_GLOBAL(_rest32gpr_30) + lwz 30,-8(11) +_GLOBAL(_restgpr_31) +_GLOBAL(_rest32gpr_31) + lwz 31,-4(11) + blr + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ + +_GLOBAL(_restgpr_14_x) +_GLOBAL(_rest32gpr_14_x) + lwz 14,-72(11) /* restore gp registers */ +_GLOBAL(_restgpr_15_x) +_GLOBAL(_rest32gpr_15_x) + lwz 15,-68(11) +_GLOBAL(_restgpr_16_x) +_GLOBAL(_rest32gpr_16_x) + lwz 16,-64(11) +_GLOBAL(_restgpr_17_x) +_GLOBAL(_rest32gpr_17_x) + lwz 17,-60(11) +_GLOBAL(_restgpr_18_x) +_GLOBAL(_rest32gpr_18_x) + lwz 18,-56(11) +_GLOBAL(_restgpr_19_x) +_GLOBAL(_rest32gpr_19_x) + lwz 19,-52(11) +_GLOBAL(_restgpr_20_x) +_GLOBAL(_rest32gpr_20_x) + lwz 20,-48(11) +_GLOBAL(_restgpr_21_x) +_GLOBAL(_rest32gpr_21_x) + lwz 21,-44(11) +_GLOBAL(_restgpr_22_x) +_GLOBAL(_rest32gpr_22_x) + lwz 22,-40(11) +_GLOBAL(_restgpr_23_x) +_GLOBAL(_rest32gpr_23_x) + lwz 23,-36(11) +_GLOBAL(_restgpr_24_x) +_GLOBAL(_rest32gpr_24_x) + lwz 24,-32(11) +_GLOBAL(_restgpr_25_x) +_GLOBAL(_rest32gpr_25_x) + lwz 25,-28(11) +_GLOBAL(_restgpr_26_x) +_GLOBAL(_rest32gpr_26_x) + lwz 26,-24(11) +_GLOBAL(_restgpr_27_x) +_GLOBAL(_rest32gpr_27_x) + lwz 27,-20(11) +_GLOBAL(_restgpr_28_x) +_GLOBAL(_rest32gpr_28_x) + lwz 28,-16(11) +_GLOBAL(_restgpr_29_x) +_GLOBAL(_rest32gpr_29_x) + lwz 29,-12(11) +_GLOBAL(_restgpr_30_x) +_GLOBAL(_rest32gpr_30_x) + lwz 30,-8(11) +_GLOBAL(_restgpr_31_x) +_GLOBAL(_rest32gpr_31_x) + lwz 0,4(11) + lwz 31,-4(11) + mtlr 0 + mr 1,11 + blr + +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area. */ + +_GLOBAL(_savevr_20) + li r11,-192 + stvx vr20,r11,r0 +_GLOBAL(_savevr_21) + li r11,-176 + stvx vr21,r11,r0 +_GLOBAL(_savevr_22) + li r11,-160 + stvx vr22,r11,r0 +_GLOBAL(_savevr_23) + li r11,-144 + stvx vr23,r11,r0 +_GLOBAL(_savevr_24) + li r11,-128 + stvx vr24,r11,r0 +_GLOBAL(_savevr_25) + li r11,-112 + stvx vr25,r11,r0 +_GLOBAL(_savevr_26) + li r11,-96 + stvx vr26,r11,r0 +_GLOBAL(_savevr_27) + li r11,-80 + stvx vr27,r11,r0 +_GLOBAL(_savevr_28) + li r11,-64 + stvx vr28,r11,r0 +_GLOBAL(_savevr_29) + li r11,-48 + stvx vr29,r11,r0 +_GLOBAL(_savevr_30) + li r11,-32 + stvx vr30,r11,r0 +_GLOBAL(_savevr_31) + li r11,-16 + stvx vr31,r11,r0 + blr + +_GLOBAL(_restvr_20) + li r11,-192 + lvx vr20,r11,r0 +_GLOBAL(_restvr_21) + li r11,-176 + lvx vr21,r11,r0 +_GLOBAL(_restvr_22) + li r11,-160 + lvx vr22,r11,r0 +_GLOBAL(_restvr_23) + li r11,-144 + lvx vr23,r11,r0 +_GLOBAL(_restvr_24) + li r11,-128 + lvx vr24,r11,r0 +_GLOBAL(_restvr_25) + li r11,-112 + lvx vr25,r11,r0 +_GLOBAL(_restvr_26) + li r11,-96 + lvx vr26,r11,r0 +_GLOBAL(_restvr_27) + li r11,-80 + lvx vr27,r11,r0 +_GLOBAL(_restvr_28) + li r11,-64 + lvx vr28,r11,r0 +_GLOBAL(_restvr_29) + li r11,-48 + lvx vr29,r11,r0 +_GLOBAL(_restvr_30) + li r11,-32 + lvx vr30,r11,r0 +_GLOBAL(_restvr_31) + li r11,-16 + lvx vr31,r11,r0 + blr + +#endif /* CONFIG_ALTIVEC */ + +#else /* CONFIG_PPC64 */ + + .section ".text.save.restore","ax",@progbits + +.globl _savegpr0_14 +_savegpr0_14: + std r14,-144(r1) +.globl _savegpr0_15 +_savegpr0_15: + std r15,-136(r1) +.globl _savegpr0_16 +_savegpr0_16: + std r16,-128(r1) +.globl _savegpr0_17 +_savegpr0_17: + std r17,-120(r1) +.globl _savegpr0_18 +_savegpr0_18: + std r18,-112(r1) +.globl _savegpr0_19 +_savegpr0_19: + std r19,-104(r1) +.globl _savegpr0_20 +_savegpr0_20: + std r20,-96(r1) +.globl _savegpr0_21 +_savegpr0_21: + std r21,-88(r1) +.globl _savegpr0_22 +_savegpr0_22: + std r22,-80(r1) +.globl _savegpr0_23 +_savegpr0_23: + std r23,-72(r1) +.globl _savegpr0_24 +_savegpr0_24: + std r24,-64(r1) +.globl _savegpr0_25 +_savegpr0_25: + std r25,-56(r1) +.globl _savegpr0_26 +_savegpr0_26: + std r26,-48(r1) +.globl _savegpr0_27 +_savegpr0_27: + std r27,-40(r1) +.globl _savegpr0_28 +_savegpr0_28: + std r28,-32(r1) +.globl _savegpr0_29 +_savegpr0_29: + std r29,-24(r1) +.globl _savegpr0_30 +_savegpr0_30: + std r30,-16(r1) +.globl _savegpr0_31 +_savegpr0_31: + std r31,-8(r1) + std r0,16(r1) + blr + +.globl _restgpr0_14 +_restgpr0_14: + ld r14,-144(r1) +.globl _restgpr0_15 +_restgpr0_15: + ld r15,-136(r1) +.globl _restgpr0_16 +_restgpr0_16: + ld r16,-128(r1) +.globl _restgpr0_17 +_restgpr0_17: + ld r17,-120(r1) +.globl _restgpr0_18 +_restgpr0_18: + ld r18,-112(r1) +.globl _restgpr0_19 +_restgpr0_19: + ld r19,-104(r1) +.globl _restgpr0_20 +_restgpr0_20: + ld r20,-96(r1) +.globl _restgpr0_21 +_restgpr0_21: + ld r21,-88(r1) +.globl _restgpr0_22 +_restgpr0_22: + ld r22,-80(r1) +.globl _restgpr0_23 +_restgpr0_23: + ld r23,-72(r1) +.globl _restgpr0_24 +_restgpr0_24: + ld r24,-64(r1) +.globl _restgpr0_25 +_restgpr0_25: + ld r25,-56(r1) +.globl _restgpr0_26 +_restgpr0_26: + ld r26,-48(r1) +.globl _restgpr0_27 +_restgpr0_27: + ld r27,-40(r1) +.globl _restgpr0_28 +_restgpr0_28: + ld r28,-32(r1) +.globl _restgpr0_29 +_restgpr0_29: + ld r0,16(r1) + ld r29,-24(r1) + mtlr r0 + ld r30,-16(r1) + ld r31,-8(r1) + blr + +.globl _restgpr0_30 +_restgpr0_30: + ld r30,-16(r1) +.globl _restgpr0_31 +_restgpr0_31: + ld r0,16(r1) + ld r31,-8(r1) + mtlr r0 + blr + +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area. */ + +.globl _savevr_20 +_savevr_20: + li r12,-192 + stvx vr20,r12,r0 +.globl _savevr_21 +_savevr_21: + li r12,-176 + stvx vr21,r12,r0 +.globl _savevr_22 +_savevr_22: + li r12,-160 + stvx vr22,r12,r0 +.globl _savevr_23 +_savevr_23: + li r12,-144 + stvx vr23,r12,r0 +.globl _savevr_24 +_savevr_24: + li r12,-128 + stvx vr24,r12,r0 +.globl _savevr_25 +_savevr_25: + li r12,-112 + stvx vr25,r12,r0 +.globl _savevr_26 +_savevr_26: + li r12,-96 + stvx vr26,r12,r0 +.globl _savevr_27 +_savevr_27: + li r12,-80 + stvx vr27,r12,r0 +.globl _savevr_28 +_savevr_28: + li r12,-64 + stvx vr28,r12,r0 +.globl _savevr_29 +_savevr_29: + li r12,-48 + stvx vr29,r12,r0 +.globl _savevr_30 +_savevr_30: + li r12,-32 + stvx vr30,r12,r0 +.globl _savevr_31 +_savevr_31: + li r12,-16 + stvx vr31,r12,r0 + blr + +.globl _restvr_20 +_restvr_20: + li r12,-192 + lvx vr20,r12,r0 +.globl _restvr_21 +_restvr_21: + li r12,-176 + lvx vr21,r12,r0 +.globl _restvr_22 +_restvr_22: + li r12,-160 + lvx vr22,r12,r0 +.globl _restvr_23 +_restvr_23: + li r12,-144 + lvx vr23,r12,r0 +.globl _restvr_24 +_restvr_24: + li r12,-128 + lvx vr24,r12,r0 +.globl _restvr_25 +_restvr_25: + li r12,-112 + lvx vr25,r12,r0 +.globl _restvr_26 +_restvr_26: + li r12,-96 + lvx vr26,r12,r0 +.globl _restvr_27 +_restvr_27: + li r12,-80 + lvx vr27,r12,r0 +.globl _restvr_28 +_restvr_28: + li r12,-64 + lvx vr28,r12,r0 +.globl _restvr_29 +_restvr_29: + li r12,-48 + lvx vr29,r12,r0 +.globl _restvr_30 +_restvr_30: + li r12,-32 + lvx vr30,r12,r0 +.globl _restvr_31 +_restvr_31: + li r12,-16 + lvx vr31,r12,r0 + blr + +#endif /* CONFIG_ALTIVEC */ + +#endif /* CONFIG_PPC64 */ + +#endif diff --git a/arch/powerpc/lib/devres.c b/arch/powerpc/lib/devres.c new file mode 100644 index 00000000000..8df55fc3aad --- /dev/null +++ b/arch/powerpc/lib/devres.c @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/device.h> /* devres_*(), devm_ioremap_release() */ +#include <linux/gfp.h> +#include <linux/io.h> /* ioremap_prot() */ +#include <linux/export.h> /* EXPORT_SYMBOL() */ + +/** + * devm_ioremap_prot - Managed ioremap_prot() + * @dev: Generic device to remap IO address for + * @offset: BUS offset to map + * @size: Size of map + * @flags: Page flags + * + * Managed ioremap_prot(). Map is automatically unmapped on driver + * detach. + */ +void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = ioremap_prot(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_ioremap_prot); diff --git a/arch/powerpc/lib/dma-noncoherent.c b/arch/powerpc/lib/dma-noncoherent.c deleted file mode 100644 index 6656d47841d..00000000000 --- a/arch/powerpc/lib/dma-noncoherent.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * PowerPC version derived from arch/arm/mm/consistent.c - * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) - * - * Copyright (C) 2000 Russell King - * - * Consistent memory allocators. Used for DMA devices that want to - * share uncached memory with the processor core. The function return - * is the virtual address and 'dma_handle' is the physical address. - * Mostly stolen from the ARM port, with some changes for PowerPC. - * -- Dan - * - * Reorganized to get rid of the arch-specific consistent_* functions - * and provide non-coherent implementations for the DMA API. -Matt - * - * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent() - * implementation. This is pulled straight from ARM and barely - * modified. -Matt - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/highmem.h> -#include <linux/dma-mapping.h> - -#include <asm/tlbflush.h> - -/* - * This address range defaults to a value that is safe for all - * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It - * can be further configured for specific applications under - * the "Advanced Setup" menu. -Matt - */ -#define CONSISTENT_BASE (CONFIG_CONSISTENT_START) -#define CONSISTENT_END (CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE) -#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) - -/* - * This is the page table (2MB) covering uncached, DMA consistent allocations - */ -static pte_t *consistent_pte; -static DEFINE_SPINLOCK(consistent_lock); - -/* - * VM region handling support. - * - * This should become something generic, handling VM region allocations for - * vmalloc and similar (ioremap, module space, etc). - * - * I envisage vmalloc()'s supporting vm_struct becoming: - * - * struct vm_struct { - * struct vm_region region; - * unsigned long flags; - * struct page **pages; - * unsigned int nr_pages; - * unsigned long phys_addr; - * }; - * - * get_vm_area() would then call vm_region_alloc with an appropriate - * struct vm_region head (eg): - * - * struct vm_region vmalloc_head = { - * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), - * .vm_start = VMALLOC_START, - * .vm_end = VMALLOC_END, - * }; - * - * However, vmalloc_head.vm_start is variable (typically, it is dependent on - * the amount of RAM found at boot time.) I would imagine that get_vm_area() - * would have to initialise this each time prior to calling vm_region_alloc(). - */ -struct vm_region { - struct list_head vm_list; - unsigned long vm_start; - unsigned long vm_end; -}; - -static struct vm_region consistent_head = { - .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), - .vm_start = CONSISTENT_BASE, - .vm_end = CONSISTENT_END, -}; - -static struct vm_region * -vm_region_alloc(struct vm_region *head, size_t size, gfp_t gfp) -{ - unsigned long addr = head->vm_start, end = head->vm_end - size; - unsigned long flags; - struct vm_region *c, *new; - - new = kmalloc(sizeof(struct vm_region), gfp); - if (!new) - goto out; - - spin_lock_irqsave(&consistent_lock, flags); - - list_for_each_entry(c, &head->vm_list, vm_list) { - if ((addr + size) < addr) - goto nospc; - if ((addr + size) <= c->vm_start) - goto found; - addr = c->vm_end; - if (addr > end) - goto nospc; - } - - found: - /* - * Insert this entry _before_ the one we found. - */ - list_add_tail(&new->vm_list, &c->vm_list); - new->vm_start = addr; - new->vm_end = addr + size; - - spin_unlock_irqrestore(&consistent_lock, flags); - return new; - - nospc: - spin_unlock_irqrestore(&consistent_lock, flags); - kfree(new); - out: - return NULL; -} - -static struct vm_region *vm_region_find(struct vm_region *head, unsigned long addr) -{ - struct vm_region *c; - - list_for_each_entry(c, &head->vm_list, vm_list) { - if (c->vm_start == addr) - goto out; - } - c = NULL; - out: - return c; -} - -/* - * Allocate DMA-coherent memory space and return both the kernel remapped - * virtual and bus address for that space. - */ -void * -__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) -{ - struct page *page; - struct vm_region *c; - unsigned long order; - u64 mask = 0x00ffffff, limit; /* ISA default */ - - if (!consistent_pte) { - printk(KERN_ERR "%s: not initialised\n", __func__); - dump_stack(); - return NULL; - } - - size = PAGE_ALIGN(size); - limit = (mask + 1) & ~mask; - if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) { - printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", - size, mask); - return NULL; - } - - order = get_order(size); - - if (mask != 0xffffffff) - gfp |= GFP_DMA; - - page = alloc_pages(gfp, order); - if (!page) - goto no_page; - - /* - * Invalidate any data that might be lurking in the - * kernel direct-mapped region for device DMA. - */ - { - unsigned long kaddr = (unsigned long)page_address(page); - memset(page_address(page), 0, size); - flush_dcache_range(kaddr, kaddr + size); - } - - /* - * Allocate a virtual address in the consistent mapping region. - */ - c = vm_region_alloc(&consistent_head, size, - gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); - if (c) { - unsigned long vaddr = c->vm_start; - pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); - struct page *end = page + (1 << order); - - split_page(page, order); - - /* - * Set the "dma handle" - */ - *handle = page_to_bus(page); - - do { - BUG_ON(!pte_none(*pte)); - - SetPageReserved(page); - set_pte_at(&init_mm, vaddr, - pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL))); - page++; - pte++; - vaddr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - /* - * Free the otherwise unused pages. - */ - while (page < end) { - __free_page(page); - page++; - } - - return (void *)c->vm_start; - } - - if (page) - __free_pages(page, order); - no_page: - return NULL; -} -EXPORT_SYMBOL(__dma_alloc_coherent); - -/* - * free a page as defined by the above mapping. - */ -void __dma_free_coherent(size_t size, void *vaddr) -{ - struct vm_region *c; - unsigned long flags, addr; - pte_t *ptep; - - size = PAGE_ALIGN(size); - - spin_lock_irqsave(&consistent_lock, flags); - - c = vm_region_find(&consistent_head, (unsigned long)vaddr); - if (!c) - goto no_area; - - if ((c->vm_end - c->vm_start) != size) { - printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", - __func__, c->vm_end - c->vm_start, size); - dump_stack(); - size = c->vm_end - c->vm_start; - } - - ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start); - addr = c->vm_start; - do { - pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep); - unsigned long pfn; - - ptep++; - addr += PAGE_SIZE; - - if (!pte_none(pte) && pte_present(pte)) { - pfn = pte_pfn(pte); - - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - ClearPageReserved(page); - - __free_page(page); - continue; - } - } - - printk(KERN_CRIT "%s: bad page in kernel page table\n", - __func__); - } while (size -= PAGE_SIZE); - - flush_tlb_kernel_range(c->vm_start, c->vm_end); - - list_del(&c->vm_list); - - spin_unlock_irqrestore(&consistent_lock, flags); - - kfree(c); - return; - - no_area: - spin_unlock_irqrestore(&consistent_lock, flags); - printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", - __func__, vaddr); - dump_stack(); -} -EXPORT_SYMBOL(__dma_free_coherent); - -/* - * Initialise the consistent memory allocation. - */ -static int __init dma_alloc_init(void) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int ret = 0; - - do { - pgd = pgd_offset(&init_mm, CONSISTENT_BASE); - pud = pud_alloc(&init_mm, pgd, CONSISTENT_BASE); - pmd = pmd_alloc(&init_mm, pud, CONSISTENT_BASE); - if (!pmd) { - printk(KERN_ERR "%s: no pmd tables\n", __func__); - ret = -ENOMEM; - break; - } - WARN_ON(!pmd_none(*pmd)); - - pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); - if (!pte) { - printk(KERN_ERR "%s: no pte tables\n", __func__); - ret = -ENOMEM; - break; - } - - consistent_pte = pte; - } while (0); - - return ret; -} - -core_initcall(dma_alloc_init); - -/* - * make an area consistent. - */ -void __dma_sync(void *vaddr, size_t size, int direction) -{ - unsigned long start = (unsigned long)vaddr; - unsigned long end = start + size; - - switch (direction) { - case DMA_NONE: - BUG(); - case DMA_FROM_DEVICE: /* invalidate only */ - invalidate_dcache_range(start, end); - break; - case DMA_TO_DEVICE: /* writeback only */ - clean_dcache_range(start, end); - break; - case DMA_BIDIRECTIONAL: /* writeback and invalidate */ - flush_dcache_range(start, end); - break; - } -} -EXPORT_SYMBOL(__dma_sync); - -#ifdef CONFIG_HIGHMEM -/* - * __dma_sync_page() implementation for systems using highmem. - * In this case, each page of a buffer must be kmapped/kunmapped - * in order to have a virtual address for __dma_sync(). This must - * not sleep so kmap_atomic()/kunmap_atomic() are used. - * - * Note: yes, it is possible and correct to have a buffer extend - * beyond the first page. - */ -static inline void __dma_sync_page_highmem(struct page *page, - unsigned long offset, size_t size, int direction) -{ - size_t seg_size = min((size_t)(PAGE_SIZE - offset), size); - size_t cur_size = seg_size; - unsigned long flags, start, seg_offset = offset; - int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE; - int seg_nr = 0; - - local_irq_save(flags); - - do { - start = (unsigned long)kmap_atomic(page + seg_nr, - KM_PPC_SYNC_PAGE) + seg_offset; - - /* Sync this buffer segment */ - __dma_sync((void *)start, seg_size, direction); - kunmap_atomic((void *)start, KM_PPC_SYNC_PAGE); - seg_nr++; - - /* Calculate next buffer segment size */ - seg_size = min((size_t)PAGE_SIZE, size - cur_size); - - /* Add the segment size to our running total */ - cur_size += seg_size; - seg_offset = 0; - } while (seg_nr < nr_segs); - - local_irq_restore(flags); -} -#endif /* CONFIG_HIGHMEM */ - -/* - * __dma_sync_page makes memory consistent. identical to __dma_sync, but - * takes a struct page instead of a virtual address - */ -void __dma_sync_page(struct page *page, unsigned long offset, - size_t size, int direction) -{ -#ifdef CONFIG_HIGHMEM - __dma_sync_page_highmem(page, offset, size, direction); -#else - unsigned long start = (unsigned long)page_address(page) + offset; - __dma_sync((void *)start, size, direction); -#endif -} -EXPORT_SYMBOL(__dma_sync_page); diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S new file mode 100644 index 00000000000..f4613118132 --- /dev/null +++ b/arch/powerpc/lib/feature-fixups-test.S @@ -0,0 +1,761 @@ +/* + * Copyright 2008 Michael Ellerman, IBM Corporation. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/feature-fixups.h> +#include <asm/ppc_asm.h> +#include <asm/synch.h> + + .text + +#define globl(x) \ + .globl x; \ +x: + +globl(ftr_fixup_test1) + or 1,1,1 + or 2,2,2 /* fixup will nop out this instruction */ + or 3,3,3 + +globl(end_ftr_fixup_test1) + +globl(ftr_fixup_test1_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test1_expected) + or 1,1,1 + nop + or 3,3,3 + +globl(ftr_fixup_test2) + or 1,1,1 + or 2,2,2 /* fixup will replace this with ftr_fixup_test2_alt */ + or 3,3,3 + +globl(end_ftr_fixup_test2) + +globl(ftr_fixup_test2_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test2_alt) + or 31,31,31 + +globl(ftr_fixup_test2_expected) + or 1,1,1 + or 31,31,31 + or 3,3,3 + +globl(ftr_fixup_test3) + or 1,1,1 + or 2,2,2 /* fixup will fail to replace this */ + or 3,3,3 + +globl(end_ftr_fixup_test3) + +globl(ftr_fixup_test3_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test3_alt) + or 31,31,31 + or 31,31,31 + +globl(ftr_fixup_test4) + or 1,1,1 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 3,3,3 + +globl(end_ftr_fixup_test4) + +globl(ftr_fixup_test4_expected) + or 1,1,1 + or 31,31,31 + or 31,31,31 + nop + nop + or 3,3,3 + +globl(ftr_fixup_test4_orig) + or 1,1,1 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test4_alt) + or 31,31,31 + or 31,31,31 + + +globl(ftr_fixup_test5) + or 1,1,1 +BEGIN_FTR_SECTION + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 +FTR_SECTION_ELSE +2: b 3f +3: or 5,5,5 + beq 3b + b 1f + or 6,6,6 + b 2b +1: bdnz 3b +ALT_FTR_SECTION_END(0, 1) + or 1,1,1 + +globl(end_ftr_fixup_test5) + +globl(ftr_fixup_test5_expected) + or 1,1,1 +2: b 3f +3: or 5,5,5 + beq 3b + b 1f + or 6,6,6 + b 2b +1: bdnz 3b + or 1,1,1 + +globl(ftr_fixup_test6) +1: or 1,1,1 +BEGIN_FTR_SECTION + or 5,5,5 +2: PPC_LCMPI r3,0 + beq 4f + blt 2b + b 1b + b 4f +FTR_SECTION_ELSE +2: or 2,2,2 + PPC_LCMPI r3,1 + beq 3f + blt 2b + b 3f + b 1b +ALT_FTR_SECTION_END(0, 1) +3: or 1,1,1 + or 2,2,2 +4: or 3,3,3 + +globl(end_ftr_fixup_test6) + +globl(ftr_fixup_test6_expected) +1: or 1,1,1 +2: or 2,2,2 + PPC_LCMPI r3,1 + beq 3f + blt 2b + b 3f + b 1b +2: or 1,1,1 + or 2,2,2 +3: or 3,3,3 + + +#if 0 +/* Test that if we have a larger else case the assembler spots it and + * reports an error. #if 0'ed so as not to break the build normally. + */ +ftr_fixup_test7: + or 1,1,1 +BEGIN_FTR_SECTION + or 2,2,2 + or 2,2,2 + or 2,2,2 +FTR_SECTION_ELSE + or 3,3,3 + or 3,3,3 + or 3,3,3 + or 3,3,3 +ALT_FTR_SECTION_END(0, 1) + or 1,1,1 +#endif + +#define MAKE_MACRO_TEST(TYPE) \ +globl(ftr_fixup_test_ ##TYPE##_macros) \ + or 1,1,1; \ + /* Basic test, this section should all be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic test, this section should NOT be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, inner section should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, whole section should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 0, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, none should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 0, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, default case should be taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, else case should be taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ + or 31,31,31; \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt with smaller else case, should be padded with nops */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in default case */ \ + /* Default case should be taken, with nop'ed inner section */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 2,2,2; \ + or 2,2,2; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, else taken & nop */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, all nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else can have large else case */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ + or 5,5,5; \ + or 5,5,5; \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; + +#define MAKE_MACRO_TEST_EXPECTED(TYPE) \ +globl(ftr_fixup_test_ ##TYPE##_macros_expected) \ + or 1,1,1; \ + /* Basic test, this section should all be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic test, this section should NOT be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, inner section should be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 80) */ \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, whole section should be nop'ed */ \ + /* NB. inner section is not nop'ed, but then entire outer is */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ + nop; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 0, 80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, none should be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + or 3,3,3; \ + or 3,3,3; \ +/* END_##TYPE##_SECTION_NESTED(0, 0, 80) */ \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, default case should be taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, else case should be taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ + or 31,31,31; \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt with smaller else case, should be padded with nops */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ + nop; \ + nop; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in default case */ \ + /* Default case should be taken, with nop'ed inner section */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 5,5,5; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 3,3,3; */ \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, else taken & nop */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 5,5,5; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + or 1,1,1; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, all nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + or 1,1,1; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + or 2,2,2; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 31,31,31; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + /* or 31,31,31; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + or 2,2,2; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 31,31,31; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + /* or 31,31,31; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + or 5,5,5; \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + or 1,1,1; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) */ \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else can have large else case */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) */ \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; + +MAKE_MACRO_TEST(FTR); +MAKE_MACRO_TEST_EXPECTED(FTR); + +#ifdef CONFIG_PPC64 +MAKE_MACRO_TEST(FW_FTR); +MAKE_MACRO_TEST_EXPECTED(FW_FTR); +#endif + +globl(lwsync_fixup_test) +1: or 1,1,1 + LWSYNC +globl(end_lwsync_fixup_test) + +globl(lwsync_fixup_test_expected_LWSYNC) +1: or 1,1,1 + lwsync + +globl(lwsync_fixup_test_expected_SYNC) +1: or 1,1,1 + sync + diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c new file mode 100644 index 00000000000..7a8a7487cee --- /dev/null +++ b/arch/powerpc/lib/feature-fixups.c @@ -0,0 +1,376 @@ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * + * Copyright 2008 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/init.h> +#include <asm/cputable.h> +#include <asm/code-patching.h> +#include <asm/page.h> +#include <asm/sections.h> + + +struct fixup_entry { + unsigned long mask; + unsigned long value; + long start_off; + long end_off; + long alt_start_off; + long alt_end_off; +}; + +static unsigned int *calc_addr(struct fixup_entry *fcur, long offset) +{ + /* + * We store the offset to the code as a negative offset from + * the start of the alt_entry, to support the VDSO. This + * routine converts that back into an actual address. + */ + return (unsigned int *)((unsigned long)fcur + offset); +} + +static int patch_alt_instruction(unsigned int *src, unsigned int *dest, + unsigned int *alt_start, unsigned int *alt_end) +{ + unsigned int instr; + + instr = *src; + + if (instr_is_relative_branch(*src)) { + unsigned int *target = (unsigned int *)branch_target(src); + + /* Branch within the section doesn't need translating */ + if (target < alt_start || target >= alt_end) { + instr = translate_branch(dest, src); + if (!instr) + return 1; + } + } + + patch_instruction(dest, instr); + + return 0; +} + +static int patch_feature_section(unsigned long value, struct fixup_entry *fcur) +{ + unsigned int *start, *end, *alt_start, *alt_end, *src, *dest; + + start = calc_addr(fcur, fcur->start_off); + end = calc_addr(fcur, fcur->end_off); + alt_start = calc_addr(fcur, fcur->alt_start_off); + alt_end = calc_addr(fcur, fcur->alt_end_off); + + if ((alt_end - alt_start) > (end - start)) + return 1; + + if ((value & fcur->mask) == fcur->value) + return 0; + + src = alt_start; + dest = start; + + for (; src < alt_end; src++, dest++) { + if (patch_alt_instruction(src, dest, alt_start, alt_end)) + return 1; + } + + for (; dest < end; dest++) + patch_instruction(dest, PPC_INST_NOP); + + return 0; +} + +void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) +{ + struct fixup_entry *fcur, *fend; + + fcur = fixup_start; + fend = fixup_end; + + for (; fcur < fend; fcur++) { + if (patch_feature_section(value, fcur)) { + WARN_ON(1); + printk("Unable to patch feature section at %p - %p" \ + " with %p - %p\n", + calc_addr(fcur, fcur->start_off), + calc_addr(fcur, fcur->end_off), + calc_addr(fcur, fcur->alt_start_off), + calc_addr(fcur, fcur->alt_end_off)); + } + } +} + +void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) +{ + long *start, *end; + unsigned int *dest; + + if (!(value & CPU_FTR_LWSYNC)) + return ; + + start = fixup_start; + end = fixup_end; + + for (; start < end; start++) { + dest = (void *)start + *start; + patch_instruction(dest, PPC_INST_LWSYNC); + } +} + +void do_final_fixups(void) +{ +#if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE) + int *src, *dest; + unsigned long length; + + if (PHYSICAL_START == 0) + return; + + src = (int *)(KERNELBASE + PHYSICAL_START); + dest = (int *)KERNELBASE; + length = (__end_interrupts - _stext) / sizeof(int); + + while (length--) { + patch_instruction(dest, *src); + src++; + dest++; + } +#endif +} + +#ifdef CONFIG_FTR_FIXUP_SELFTEST + +#define check(x) \ + if (!(x)) printk("feature-fixups: test failed at line %d\n", __LINE__); + +/* This must be after the text it fixes up, vmlinux.lds.S enforces that atm */ +static struct fixup_entry fixup; + +static long calc_offset(struct fixup_entry *entry, unsigned int *p) +{ + return (unsigned long)p - (unsigned long)entry; +} + +void test_basic_patching(void) +{ + extern unsigned int ftr_fixup_test1; + extern unsigned int end_ftr_fixup_test1; + extern unsigned int ftr_fixup_test1_orig; + extern unsigned int ftr_fixup_test1_expected; + int size = &end_ftr_fixup_test1 - &ftr_fixup_test1; + + fixup.value = fixup.mask = 8; + fixup.start_off = calc_offset(&fixup, &ftr_fixup_test1 + 1); + fixup.end_off = calc_offset(&fixup, &ftr_fixup_test1 + 2); + fixup.alt_start_off = fixup.alt_end_off = 0; + + /* Sanity check */ + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(8, &fixup); + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(&ftr_fixup_test1, &ftr_fixup_test1_orig, size); + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + patch_feature_section(~8, &fixup); + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_expected, size) == 0); +} + +static void test_alternative_patching(void) +{ + extern unsigned int ftr_fixup_test2; + extern unsigned int end_ftr_fixup_test2; + extern unsigned int ftr_fixup_test2_orig; + extern unsigned int ftr_fixup_test2_alt; + extern unsigned int ftr_fixup_test2_expected; + int size = &end_ftr_fixup_test2 - &ftr_fixup_test2; + + fixup.value = fixup.mask = 0xF; + fixup.start_off = calc_offset(&fixup, &ftr_fixup_test2 + 1); + fixup.end_off = calc_offset(&fixup, &ftr_fixup_test2 + 2); + fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test2_alt); + fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test2_alt + 1); + + /* Sanity check */ + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(0xF, &fixup); + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(&ftr_fixup_test2, &ftr_fixup_test2_orig, size); + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + patch_feature_section(~0xF, &fixup); + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_expected, size) == 0); +} + +static void test_alternative_case_too_big(void) +{ + extern unsigned int ftr_fixup_test3; + extern unsigned int end_ftr_fixup_test3; + extern unsigned int ftr_fixup_test3_orig; + extern unsigned int ftr_fixup_test3_alt; + int size = &end_ftr_fixup_test3 - &ftr_fixup_test3; + + fixup.value = fixup.mask = 0xC; + fixup.start_off = calc_offset(&fixup, &ftr_fixup_test3 + 1); + fixup.end_off = calc_offset(&fixup, &ftr_fixup_test3 + 2); + fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test3_alt); + fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test3_alt + 2); + + /* Sanity check */ + check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + + /* Expect nothing to be patched, and the error returned to us */ + check(patch_feature_section(0xF, &fixup) == 1); + check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + check(patch_feature_section(0, &fixup) == 1); + check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + check(patch_feature_section(~0xF, &fixup) == 1); + check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); +} + +static void test_alternative_case_too_small(void) +{ + extern unsigned int ftr_fixup_test4; + extern unsigned int end_ftr_fixup_test4; + extern unsigned int ftr_fixup_test4_orig; + extern unsigned int ftr_fixup_test4_alt; + extern unsigned int ftr_fixup_test4_expected; + int size = &end_ftr_fixup_test4 - &ftr_fixup_test4; + unsigned long flag; + + /* Check a high-bit flag */ + flag = 1UL << ((sizeof(unsigned long) - 1) * 8); + fixup.value = fixup.mask = flag; + fixup.start_off = calc_offset(&fixup, &ftr_fixup_test4 + 1); + fixup.end_off = calc_offset(&fixup, &ftr_fixup_test4 + 5); + fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test4_alt); + fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test4_alt + 2); + + /* Sanity check */ + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(flag, &fixup); + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(&ftr_fixup_test4, &ftr_fixup_test4_orig, size); + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + patch_feature_section(~flag, &fixup); + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_expected, size) == 0); +} + +static void test_alternative_case_with_branch(void) +{ + extern unsigned int ftr_fixup_test5; + extern unsigned int end_ftr_fixup_test5; + extern unsigned int ftr_fixup_test5_expected; + int size = &end_ftr_fixup_test5 - &ftr_fixup_test5; + + check(memcmp(&ftr_fixup_test5, &ftr_fixup_test5_expected, size) == 0); +} + +static void test_alternative_case_with_external_branch(void) +{ + extern unsigned int ftr_fixup_test6; + extern unsigned int end_ftr_fixup_test6; + extern unsigned int ftr_fixup_test6_expected; + int size = &end_ftr_fixup_test6 - &ftr_fixup_test6; + + check(memcmp(&ftr_fixup_test6, &ftr_fixup_test6_expected, size) == 0); +} + +static void test_cpu_macros(void) +{ + extern u8 ftr_fixup_test_FTR_macros; + extern u8 ftr_fixup_test_FTR_macros_expected; + unsigned long size = &ftr_fixup_test_FTR_macros_expected - + &ftr_fixup_test_FTR_macros; + + /* The fixups have already been done for us during boot */ + check(memcmp(&ftr_fixup_test_FTR_macros, + &ftr_fixup_test_FTR_macros_expected, size) == 0); +} + +static void test_fw_macros(void) +{ +#ifdef CONFIG_PPC64 + extern u8 ftr_fixup_test_FW_FTR_macros; + extern u8 ftr_fixup_test_FW_FTR_macros_expected; + unsigned long size = &ftr_fixup_test_FW_FTR_macros_expected - + &ftr_fixup_test_FW_FTR_macros; + + /* The fixups have already been done for us during boot */ + check(memcmp(&ftr_fixup_test_FW_FTR_macros, + &ftr_fixup_test_FW_FTR_macros_expected, size) == 0); +#endif +} + +static void test_lwsync_macros(void) +{ + extern u8 lwsync_fixup_test; + extern u8 end_lwsync_fixup_test; + extern u8 lwsync_fixup_test_expected_LWSYNC; + extern u8 lwsync_fixup_test_expected_SYNC; + unsigned long size = &end_lwsync_fixup_test - + &lwsync_fixup_test; + + /* The fixups have already been done for us during boot */ + if (cur_cpu_spec->cpu_features & CPU_FTR_LWSYNC) { + check(memcmp(&lwsync_fixup_test, + &lwsync_fixup_test_expected_LWSYNC, size) == 0); + } else { + check(memcmp(&lwsync_fixup_test, + &lwsync_fixup_test_expected_SYNC, size) == 0); + } +} + +static int __init test_feature_fixups(void) +{ + printk(KERN_DEBUG "Running feature fixup self-tests ...\n"); + + test_basic_patching(); + test_alternative_patching(); + test_alternative_case_too_big(); + test_alternative_case_too_small(); + test_alternative_case_with_branch(); + test_alternative_case_with_external_branch(); + test_cpu_macros(); + test_fw_macros(); + test_lwsync_macros(); + + return 0; +} +late_initcall(test_feature_fixups); + +#endif /* CONFIG_FTR_FIXUP_SELFTEST */ diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S new file mode 100644 index 00000000000..19e66001a4f --- /dev/null +++ b/arch/powerpc/lib/hweight_64.S @@ -0,0 +1,110 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> + +/* Note: This code relies on -mminimal-toc */ + +_GLOBAL(__arch_hweight8) +BEGIN_FTR_SECTION + b __sw_hweight8 + nop + nop +FTR_SECTION_ELSE + PPC_POPCNTB(R3,R3) + clrldi r3,r3,64-8 + blr +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight16) +BEGIN_FTR_SECTION + b __sw_hweight16 + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(50) + PPC_POPCNTB(R3,R3) + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(50) + clrlwi r3,r3,16 + PPC_POPCNTW(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight32) +BEGIN_FTR_SECTION + b __sw_hweight32 + nop + nop + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(51) + PPC_POPCNTB(R3,R3) + srdi r4,r3,16 + add r3,r4,r3 + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(51) + PPC_POPCNTW(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight64) +BEGIN_FTR_SECTION + b __sw_hweight64 + nop + nop + nop + nop + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(52) + PPC_POPCNTB(R3,R3) + srdi r4,r3,32 + add r3,r4,r3 + srdi r4,r3,16 + add r3,r4,r3 + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(52) + PPC_POPCNTD(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S new file mode 100644 index 00000000000..85aec08ab23 --- /dev/null +++ b/arch/powerpc/lib/ldstfp.S @@ -0,0 +1,379 @@ +/* + * Floating-point, VMX/Altivec and VSX loads and stores + * for use in instruction emulation. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> +#include <asm/reg.h> +#include <asm/asm-offsets.h> +#include <linux/errno.h> + +#ifdef CONFIG_PPC_FPU + +#define STKFRM (PPC_MIN_STKFRM + 16) + + .macro extab instr,handler + .section __ex_table,"a" + PPC_LONG \instr,\handler + .previous + .endm + + .macro inst32 op +reg = 0 + .rept 32 +20: \op reg,0,r4 + b 3f + extab 20b,99f +reg = reg + 1 + .endr + .endm + +/* Get the contents of frN into fr0; N is in r3. */ +_GLOBAL(get_fpr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* fr0 is already in fr0 */ + nop +reg = 1 + .rept 31 + fmr fr0,reg + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of fr0 into frN; N is in r3. */ +_GLOBAL(put_fpr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* fr0 is already in fr0 */ + nop +reg = 1 + .rept 31 + fmr reg,fr0 + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load FP reg N from float at *p. N is in r3, p in r4. */ +_GLOBAL(do_lfs) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) +1: li r9,-EFAULT +2: lfs fr0,0(r4) + li r9,0 +3: bl put_fpr + beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Load FP reg N from double at *p. N is in r3, p in r4. */ +_GLOBAL(do_lfd) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) +1: li r9,-EFAULT +2: lfd fr0,0(r4) + li r9,0 +3: beq cr7,4f + bl put_fpr + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store FP reg N to float at *p. N is in r3, p in r4. */ +_GLOBAL(do_stfs) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) + bl get_fpr +1: li r9,-EFAULT +2: stfs fr0,0(r4) + li r9,0 +3: beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store FP reg N to double at *p. N is in r3, p in r4. */ +_GLOBAL(do_stfd) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) + bl get_fpr +1: li r9,-EFAULT +2: stfd fr0,0(r4) + li r9,0 +3: beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +#ifdef CONFIG_ALTIVEC +/* Get the contents of vrN into vr0; N is in r3. */ +_GLOBAL(get_vr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* vr0 is already in vr0 */ + nop +reg = 1 + .rept 31 + vor vr0,reg,reg /* assembler doesn't know vmr? */ + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of vr0 into vrN; N is in r3. */ +_GLOBAL(put_vr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* vr0 is already in vr0 */ + nop +reg = 1 + .rept 31 + vor reg,vr0,vr0 + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load vector reg N from *p. N is in r3, p in r4. */ +_GLOBAL(do_lvx) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VEC@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + stvx vr0,r1,r8 +1: li r9,-EFAULT +2: lvx vr0,0,r4 + li r9,0 +3: beq cr7,4f + bl put_vr + lvx vr0,r1,r8 +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store vector reg N to *p. N is in r3, p in r4. */ +_GLOBAL(do_stvx) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VEC@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + stvx vr0,r1,r8 + bl get_vr +1: li r9,-EFAULT +2: stvx vr0,0,r4 + li r9,0 +3: beq cr7,4f + lvx vr0,r1,r8 +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +/* Get the contents of vsrN into vsr0; N is in r3. */ +_GLOBAL(get_vsr) + mflr r0 + rlwinm r3,r3,3,0x1f8 + bcl 20,31,1f + blr /* vsr0 is already in vsr0 */ + nop +reg = 1 + .rept 63 + XXLOR(0,reg,reg) + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of vsr0 into vsrN; N is in r3. */ +_GLOBAL(put_vsr) + mflr r0 + rlwinm r3,r3,3,0x1f8 + bcl 20,31,1f + blr /* vr0 is already in vr0 */ + nop +reg = 1 + .rept 63 + XXLOR(reg,0,0) + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load VSX reg N from vector doubleword *p. N is in r3, p in r4. */ +_GLOBAL(do_lxvd2x) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VSX@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + STXVD2X(0,R1,R8) +1: li r9,-EFAULT +2: LXVD2X(0,R0,R4) + li r9,0 +3: beq cr7,4f + bl put_vsr + LXVD2X(0,R1,R8) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store VSX reg N to vector doubleword *p. N is in r3, p in r4. */ +_GLOBAL(do_stxvd2x) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VSX@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + STXVD2X(0,R1,R8) + bl get_vsr +1: li r9,-EFAULT +2: STXVD2X(0,R0,R4) + li r9,0 +3: beq cr7,4f + LXVD2X(0,R1,R8) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +#endif /* CONFIG_VSX */ + +#endif /* CONFIG_PPC_FPU */ diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 79d0fa3a470..0c9c8d7d073 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -14,18 +14,16 @@ #include <linux/kernel.h> #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/stringify.h> #include <linux/smp.h> /* waiting for a spinlock... */ -#if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES) +#if defined(CONFIG_PPC_SPLPAR) #include <asm/hvcall.h> -#include <asm/iseries/hv_call.h> #include <asm/smp.h> -#include <asm/firmware.h> -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(arch_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -34,20 +32,14 @@ void __spin_yield(raw_spinlock_t *lock) return; holder_cpu = lock_value & 0xffff; BUG_ON(holder_cpu >= NR_CPUS); - yield_count = lppaca[holder_cpu].yield_count; + yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count); if ((yield_count & 1) == 0) return; /* virtual cpu is currently running */ rmb(); if (lock->slock != lock_value) return; /* something has changed */ - if (firmware_has_feature(FW_FEATURE_ISERIES)) - HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc, - ((u64)holder_cpu << 32) | yield_count); -#ifdef CONFIG_PPC_SPLPAR - else - plpar_hcall_norets(H_CONFER, - get_hard_smp_processor_id(holder_cpu), yield_count); -#endif + plpar_hcall_norets(H_CONFER, + get_hard_smp_processor_id(holder_cpu), yield_count); } /* @@ -55,7 +47,7 @@ void __spin_yield(raw_spinlock_t *lock) * This turns out to be the same for read and write locks, since * we only know the holder if it is write-locked. */ -void __rw_yield(raw_rwlock_t *rw) +void __rw_yield(arch_rwlock_t *rw) { int lock_value; unsigned int holder_cpu, yield_count; @@ -65,24 +57,18 @@ void __rw_yield(raw_rwlock_t *rw) return; /* no write lock at present */ holder_cpu = lock_value & 0xffff; BUG_ON(holder_cpu >= NR_CPUS); - yield_count = lppaca[holder_cpu].yield_count; + yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count); if ((yield_count & 1) == 0) return; /* virtual cpu is currently running */ rmb(); if (rw->lock != lock_value) return; /* something has changed */ - if (firmware_has_feature(FW_FEATURE_ISERIES)) - HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc, - ((u64)holder_cpu << 32) | yield_count); -#ifdef CONFIG_PPC_SPLPAR - else - plpar_hcall_norets(H_CONFER, - get_hard_smp_processor_id(holder_cpu), yield_count); -#endif + plpar_hcall_norets(H_CONFER, + get_hard_smp_processor_id(holder_cpu), yield_count); } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void arch_spin_unlock_wait(arch_spinlock_t *lock) { while (lock->slock) { HMT_low(); @@ -92,4 +78,4 @@ void __raw_spin_unlock_wait(raw_spinlock_t *lock) HMT_medium(); } -EXPORT_SYMBOL(__raw_spin_unlock_wait); +EXPORT_SYMBOL(arch_spin_unlock_wait); diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S index 11ce045e21f..43435c6892f 100644 --- a/arch/powerpc/lib/mem_64.S +++ b/arch/powerpc/lib/mem_64.S @@ -19,7 +19,7 @@ _GLOBAL(memset) rlwimi r4,r4,16,0,15 cmplw cr1,r5,r0 /* do we get that far? */ rldimi r4,r4,32,0 - PPC_MTOCRF 1,r0 + PPC_MTOCRF(1,r0) mr r6,r3 blt cr1,8f beq+ 3f /* if already 8-byte aligned */ @@ -49,7 +49,7 @@ _GLOBAL(memset) bdnz 4b 5: srwi. r0,r5,3 clrlwi r5,r5,29 - PPC_MTOCRF 1,r0 + PPC_MTOCRF(1,r0) beq 8f bf 29,6f std r4,0(r6) @@ -65,7 +65,7 @@ _GLOBAL(memset) std r4,0(r6) addi r6,r6,8 8: cmpwi r5,0 - PPC_MTOCRF 1,r5 + PPC_MTOCRF(1,r5) beqlr+ bf 29,9f stw r4,0(r6) @@ -77,10 +77,10 @@ _GLOBAL(memset) stb r4,0(r6) blr -_GLOBAL(memmove) +_GLOBAL_TOC(memmove) cmplw 0,r3,r4 - bgt .backwards_memcpy - b .memcpy + bgt backwards_memcpy + b memcpy _GLOBAL(backwards_memcpy) rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S index 3f131129d1c..32a06ec395d 100644 --- a/arch/powerpc/lib/memcpy_64.S +++ b/arch/powerpc/lib/memcpy_64.S @@ -10,19 +10,52 @@ #include <asm/ppc_asm.h> .align 7 -_GLOBAL(memcpy) - std r3,48(r1) /* save destination pointer for return value */ - PPC_MTOCRF 0x01,r5 +_GLOBAL_TOC(memcpy) +BEGIN_FTR_SECTION +#ifdef __LITTLE_ENDIAN__ + cmpdi cr7,r5,0 +#else + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* save destination pointer for return value */ +#endif +FTR_SECTION_ELSE +#ifndef SELFTEST + b memcpy_power7 +#endif +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +#ifdef __LITTLE_ENDIAN__ + /* dumb little-endian memcpy that will get replaced at runtime */ + addi r9,r3,-1 + addi r4,r4,-1 + beqlr cr7 + mtctr r5 +1: lbzu r10,1(r4) + stbu r10,1(r9) + bdnz 1b + blr +#else + PPC_MTOCRF(0x01,r5) cmpldi cr1,r5,16 neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry andi. r6,r6,7 dcbt 0,r4 blt cr1,.Lshort_copy +/* Below we want to nop out the bne if we're on a CPU that has the + CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit + cleared. + At the time of writing the only CPU that has this combination of bits + set is Power6. */ +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE bne .Ldst_unaligned +ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ + CPU_FTR_UNALIGNED_LD_STD) .Ldst_aligned: - andi. r0,r4,7 addi r3,r3,-16 +BEGIN_FTR_SECTION + andi. r0,r4,7 bne .Lsrc_unaligned +END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) srdi r7,r5,4 ld r9,0(r4) addi r4,r4,-8 @@ -41,20 +74,21 @@ _GLOBAL(memcpy) 3: std r8,8(r3) beq 3f addi r3,r3,16 - ld r9,8(r4) .Ldo_tail: bf cr7*4+1,1f - rotldi r9,r9,32 + lwz r9,8(r4) + addi r4,r4,4 stw r9,0(r3) addi r3,r3,4 1: bf cr7*4+2,2f - rotldi r9,r9,16 + lhz r9,8(r4) + addi r4,r4,2 sth r9,0(r3) addi r3,r3,2 2: bf cr7*4+3,3f - rotldi r9,r9,8 + lbz r9,8(r4) stb r9,0(r3) -3: ld r3,48(r1) /* return dest pointer */ +3: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ blr .Lsrc_unaligned: @@ -121,17 +155,30 @@ _GLOBAL(memcpy) cmpwi cr1,r5,8 addi r3,r3,32 sld r9,r9,r10 - ble cr1,.Ldo_tail + ble cr1,6f ld r0,8(r4) srd r7,r0,r11 or r9,r7,r9 - b .Ldo_tail +6: + bf cr7*4+1,1f + rotldi r9,r9,32 + stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f + rotldi r9,r9,16 + sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f + rotldi r9,r9,8 + stb r9,0(r3) +3: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ + blr .Ldst_unaligned: - PPC_MTOCRF 0x01,r6 # put #bytes to 8B bdry into cr7 + PPC_MTOCRF(0x01,r6) # put #bytes to 8B bdry into cr7 subf r5,r6,r5 li r7,0 - cmpldi r1,r5,16 + cmpldi cr1,r5,16 bf cr7*4+3,1f lbz r0,0(r4) stb r0,0(r3) @@ -143,7 +190,7 @@ _GLOBAL(memcpy) 2: bf cr7*4+1,3f lwzx r0,r7,r4 stwx r0,r7,r3 -3: PPC_MTOCRF 0x01,r5 +3: PPC_MTOCRF(0x01,r5) add r4,r6,r4 add r3,r6,r3 b .Ldst_aligned @@ -169,5 +216,6 @@ _GLOBAL(memcpy) 3: bf cr7*4+3,4f lbz r0,0(r4) stb r0,0(r3) -4: ld r3,48(r1) /* return dest pointer */ +4: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ blr +#endif diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S new file mode 100644 index 00000000000..2ff5c142f87 --- /dev/null +++ b/arch/powerpc/lib/memcpy_power7.S @@ -0,0 +1,656 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +_GLOBAL(memcpy_power7) + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB) lvsl VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB) lvsr VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC +#endif + +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + ld r14,64(r4) + ld r15,72(r4) + ld r16,80(r4) + ld r17,88(r4) + ld r18,96(r4) + ld r19,104(r4) + ld r20,112(r4) + ld r21,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + std r14,64(r3) + std r15,72(r3) + std r16,80(r3) + std r17,88(r3) + std r18,96(r3) + std r19,104(r3) + std r20,112(r3) + std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + addi r4,r4,64 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + addi r4,r4,32 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f + ld r0,0(r4) + ld r6,8(r4) + addi r4,r4,16 + std r0,0(r3) + std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_copy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r6,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f + lvx vr1,r0,r4 + addi r4,r4,16 + stvx vr1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f + lvx vr1,r0,r4 + lvx vr0,r4,r9 + addi r4,r4,32 + stvx vr1,r0,r3 + stvx vr0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx vr3,r0,r4 + lvx vr2,r4,r9 + lvx vr1,r4,r10 + lvx vr0,r4,r11 + addi r4,r4,64 + stvx vr3,r0,r3 + stvx vr2,r3,r9 + stvx vr1,r3,r10 + stvx vr0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: + lvx vr7,r0,r4 + lvx vr6,r4,r9 + lvx vr5,r4,r10 + lvx vr4,r4,r11 + lvx vr3,r4,r12 + lvx vr2,r4,r14 + lvx vr1,r4,r15 + lvx vr0,r4,r16 + addi r4,r4,128 + stvx vr7,r0,r3 + stvx vr6,r3,r9 + stvx vr5,r3,r10 + stvx vr4,r3,r11 + stvx vr3,r3,r12 + stvx vr2,r3,r14 + stvx vr1,r3,r15 + stvx vr0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx vr3,r0,r4 + lvx vr2,r4,r9 + lvx vr1,r4,r10 + lvx vr0,r4,r11 + addi r4,r4,64 + stvx vr3,r0,r3 + stvx vr2,r3,r9 + stvx vr1,r3,r10 + stvx vr0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx vr1,r0,r4 + lvx vr0,r4,r9 + addi r4,r4,32 + stvx vr1,r0,r3 + stvx vr0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx vr1,r0,r4 + addi r4,r4,16 + stvx vr1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + b exit_vmx_copy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r7,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + LVS(vr16,0,r4) /* Setup permute control vector */ + lvx vr0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f + lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + addi r4,r4,16 + stvx vr8,r0,r3 + addi r3,r3,16 + vor vr0,vr1,vr1 + +5: bf cr7*4+2,6f + lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + lvx vr0,r4,r9 + VPERM(vr9,vr1,vr0,vr16) + addi r4,r4,32 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx vr3,r0,r4 + VPERM(vr8,vr0,vr3,vr16) + lvx vr2,r4,r9 + VPERM(vr9,vr3,vr2,vr16) + lvx vr1,r4,r10 + VPERM(vr10,vr2,vr1,vr16) + lvx vr0,r4,r11 + VPERM(vr11,vr1,vr0,vr16) + addi r4,r4,64 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: + lvx vr7,r0,r4 + VPERM(vr8,vr0,vr7,vr16) + lvx vr6,r4,r9 + VPERM(vr9,vr7,vr6,vr16) + lvx vr5,r4,r10 + VPERM(vr10,vr6,vr5,vr16) + lvx vr4,r4,r11 + VPERM(vr11,vr5,vr4,vr16) + lvx vr3,r4,r12 + VPERM(vr12,vr4,vr3,vr16) + lvx vr2,r4,r14 + VPERM(vr13,vr3,vr2,vr16) + lvx vr1,r4,r15 + VPERM(vr14,vr2,vr1,vr16) + lvx vr0,r4,r16 + VPERM(vr15,vr1,vr0,vr16) + addi r4,r4,128 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + stvx vr12,r3,r12 + stvx vr13,r3,r14 + stvx vr14,r3,r15 + stvx vr15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx vr3,r0,r4 + VPERM(vr8,vr0,vr3,vr16) + lvx vr2,r4,r9 + VPERM(vr9,vr3,vr2,vr16) + lvx vr1,r4,r10 + VPERM(vr10,vr2,vr1,vr16) + lvx vr0,r4,r11 + VPERM(vr11,vr1,vr0,vr16) + addi r4,r4,64 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + lvx vr0,r4,r9 + VPERM(vr9,vr1,vr0,vr16) + addi r4,r4,32 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx vr1,r0,r4 + VPERM(vr8,vr0,vr1,vr16) + addi r4,r4,16 + stvx vr8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + b exit_vmx_copy /* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff --git a/arch/powerpc/lib/rheap.c b/arch/powerpc/lib/rheap.c index 22c3b4f53de..a1060a868e6 100644 --- a/arch/powerpc/lib/rheap.c +++ b/arch/powerpc/lib/rheap.c @@ -15,7 +15,7 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <linux/err.h> #include <linux/slab.h> @@ -54,7 +54,7 @@ static int grow(rh_info_t * info, int max_blocks) new_blocks = max_blocks - info->max_blocks; - block = kmalloc(sizeof(rh_block_t) * max_blocks, GFP_KERNEL); + block = kmalloc(sizeof(rh_block_t) * max_blocks, GFP_ATOMIC); if (block == NULL) return -ENOMEM; @@ -258,7 +258,7 @@ rh_info_t *rh_create(unsigned int alignment) if ((alignment & (alignment - 1)) != 0) return ERR_PTR(-EINVAL); - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kmalloc(sizeof(*info), GFP_ATOMIC); if (info == NULL) return ERR_PTR(-ENOMEM); @@ -556,6 +556,7 @@ unsigned long rh_alloc_fixed(rh_info_t * info, unsigned long start, int size, co be = blk->start + blk->size; if (s >= bs && e <= be) break; + blk = NULL; } if (blk == NULL) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 4aae0c38764..5c09f365c84 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -11,8 +11,11 @@ #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/ptrace.h> +#include <linux/prefetch.h> #include <asm/sstep.h> #include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/cputable.h> extern char system_call_common[]; @@ -23,6 +26,37 @@ extern char system_call_common[]; #define MSR_MASK 0x87c0ffff #endif +/* Bits in XER */ +#define XER_SO 0x80000000U +#define XER_OV 0x40000000U +#define XER_CA 0x20000000U + +#ifdef CONFIG_PPC_FPU +/* + * Functions in ldstfp.S + */ +extern int do_lfs(int rn, unsigned long ea); +extern int do_lfd(int rn, unsigned long ea); +extern int do_stfs(int rn, unsigned long ea); +extern int do_stfd(int rn, unsigned long ea); +extern int do_lvx(int rn, unsigned long ea); +extern int do_stvx(int rn, unsigned long ea); +extern int do_lxvd2x(int rn, unsigned long ea); +extern int do_stxvd2x(int rn, unsigned long ea); +#endif + +/* + * Emulate the truncation of 64 bit values in 32-bit mode. + */ +static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val) +{ +#ifdef __powerpc64__ + if ((msr & MSR_64BIT) == 0) + val &= 0xffffffffUL; +#endif + return val; +} + /* * Determine whether a conditional branch instruction would branch. */ @@ -46,16 +80,569 @@ static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs) return 1; } + +static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb) +{ + if (!user_mode(regs)) + return 1; + return __access_ok(ea, nb, USER_DS); +} + +/* + * Calculate effective address for a D-form instruction + */ +static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) instr; /* sign-extend */ + if (ra) { + ea += regs->gpr[ra]; + if (instr & 0x04000000) { /* update forms */ + if ((instr>>26) != 47) /* stmw is not an update form */ + regs->gpr[ra] = ea; + } + } + + return truncate_if_32bit(regs->msr, ea); +} + +#ifdef __powerpc64__ +/* + * Calculate effective address for a DS-form instruction + */ +static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) (instr & ~3); /* sign-extend */ + if (ra) { + ea += regs->gpr[ra]; + if ((instr & 3) == 1) /* update forms */ + regs->gpr[ra] = ea; + } + + return truncate_if_32bit(regs->msr, ea); +} +#endif /* __powerpc64 */ + +/* + * Calculate effective address for an X-form instruction + */ +static unsigned long __kprobes xform_ea(unsigned int instr, struct pt_regs *regs, + int do_update) +{ + int ra, rb; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + ea = regs->gpr[rb]; + if (ra) { + ea += regs->gpr[ra]; + if (do_update) /* update forms */ + regs->gpr[ra] = ea; + } + + return truncate_if_32bit(regs->msr, ea); +} + +/* + * Return the largest power of 2, not greater than sizeof(unsigned long), + * such that x is a multiple of it. + */ +static inline unsigned long max_align(unsigned long x) +{ + x |= sizeof(unsigned long); + return x & -x; /* isolates rightmost bit */ +} + + +static inline unsigned long byterev_2(unsigned long x) +{ + return ((x >> 8) & 0xff) | ((x & 0xff) << 8); +} + +static inline unsigned long byterev_4(unsigned long x) +{ + return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) | + ((x & 0xff00) << 8) | ((x & 0xff) << 24); +} + +#ifdef __powerpc64__ +static inline unsigned long byterev_8(unsigned long x) +{ + return (byterev_4(x) << 32) | byterev_4(x >> 32); +} +#endif + +static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea, + int nb) +{ + int err = 0; + unsigned long x = 0; + + switch (nb) { + case 1: + err = __get_user(x, (unsigned char __user *) ea); + break; + case 2: + err = __get_user(x, (unsigned short __user *) ea); + break; + case 4: + err = __get_user(x, (unsigned int __user *) ea); + break; +#ifdef __powerpc64__ + case 8: + err = __get_user(x, (unsigned long __user *) ea); + break; +#endif + } + if (!err) + *dest = x; + return err; +} + +static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea, + int nb, struct pt_regs *regs) +{ + int err; + unsigned long x, b, c; +#ifdef __LITTLE_ENDIAN__ + int len = nb; /* save a copy of the length for byte reversal */ +#endif + + /* unaligned, do this in pieces */ + x = 0; + for (; nb > 0; nb -= c) { +#ifdef __LITTLE_ENDIAN__ + c = 1; +#endif +#ifdef __BIG_ENDIAN__ + c = max_align(ea); +#endif + if (c > nb) + c = max_align(nb); + err = read_mem_aligned(&b, ea, c); + if (err) + return err; + x = (x << (8 * c)) + b; + ea += c; + } +#ifdef __LITTLE_ENDIAN__ + switch (len) { + case 2: + *dest = byterev_2(x); + break; + case 4: + *dest = byterev_4(x); + break; +#ifdef __powerpc64__ + case 8: + *dest = byterev_8(x); + break; +#endif + } +#endif +#ifdef __BIG_ENDIAN__ + *dest = x; +#endif + return 0; +} + /* - * Emulate instructions that cause a transfer of control. + * Read memory at address ea for nb bytes, return 0 for success + * or -EFAULT if an error occurred. + */ +static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb, + struct pt_regs *regs) +{ + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & (nb - 1)) == 0) + return read_mem_aligned(dest, ea, nb); + return read_mem_unaligned(dest, ea, nb, regs); +} + +static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea, + int nb) +{ + int err = 0; + + switch (nb) { + case 1: + err = __put_user(val, (unsigned char __user *) ea); + break; + case 2: + err = __put_user(val, (unsigned short __user *) ea); + break; + case 4: + err = __put_user(val, (unsigned int __user *) ea); + break; +#ifdef __powerpc64__ + case 8: + err = __put_user(val, (unsigned long __user *) ea); + break; +#endif + } + return err; +} + +static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea, + int nb, struct pt_regs *regs) +{ + int err; + unsigned long c; + +#ifdef __LITTLE_ENDIAN__ + switch (nb) { + case 2: + val = byterev_2(val); + break; + case 4: + val = byterev_4(val); + break; +#ifdef __powerpc64__ + case 8: + val = byterev_8(val); + break; +#endif + } +#endif + /* unaligned or little-endian, do this in pieces */ + for (; nb > 0; nb -= c) { +#ifdef __LITTLE_ENDIAN__ + c = 1; +#endif +#ifdef __BIG_ENDIAN__ + c = max_align(ea); +#endif + if (c > nb) + c = max_align(nb); + err = write_mem_aligned(val >> (nb - c) * 8, ea, c); + if (err) + return err; + ea += c; + } + return 0; +} + +/* + * Write memory at address ea for nb bytes, return 0 for success + * or -EFAULT if an error occurred. + */ +static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb, + struct pt_regs *regs) +{ + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & (nb - 1)) == 0) + return write_mem_aligned(val, ea, nb); + return write_mem_unaligned(val, ea, nb, regs); +} + +#ifdef CONFIG_PPC_FPU +/* + * Check the address and alignment, and call func to do the actual + * load or store. + */ +static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, int nb, + struct pt_regs *regs) +{ + int err; + union { + double dbl; + unsigned long ul[2]; + struct { +#ifdef __BIG_ENDIAN__ + unsigned _pad_; + unsigned word; +#endif +#ifdef __LITTLE_ENDIAN__ + unsigned word; + unsigned _pad_; +#endif + } single; + } data; + unsigned long ptr; + + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + ptr = (unsigned long) &data.ul; + if (sizeof(unsigned long) == 8 || nb == 4) { + err = read_mem_unaligned(&data.ul[0], ea, nb, regs); + if (nb == 4) + ptr = (unsigned long)&(data.single.word); + } else { + /* reading a double on 32-bit */ + err = read_mem_unaligned(&data.ul[0], ea, 4, regs); + if (!err) + err = read_mem_unaligned(&data.ul[1], ea + 4, 4, regs); + } + if (err) + return err; + return (*func)(rn, ptr); +} + +static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, int nb, + struct pt_regs *regs) +{ + int err; + union { + double dbl; + unsigned long ul[2]; + struct { +#ifdef __BIG_ENDIAN__ + unsigned _pad_; + unsigned word; +#endif +#ifdef __LITTLE_ENDIAN__ + unsigned word; + unsigned _pad_; +#endif + } single; + } data; + unsigned long ptr; + + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + ptr = (unsigned long) &data.ul[0]; + if (sizeof(unsigned long) == 8 || nb == 4) { + if (nb == 4) + ptr = (unsigned long)&(data.single.word); + err = (*func)(rn, ptr); + if (err) + return err; + err = write_mem_unaligned(data.ul[0], ea, nb, regs); + } else { + /* writing a double on 32-bit */ + err = (*func)(rn, ptr); + if (err) + return err; + err = write_mem_unaligned(data.ul[0], ea, 4, regs); + if (!err) + err = write_mem_unaligned(data.ul[1], ea + 4, 4, regs); + } + return err; +} +#endif + +#ifdef CONFIG_ALTIVEC +/* For Altivec/VMX, no need to worry about alignment */ +static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + if (!address_ok(regs, ea & ~0xfUL, 16)) + return -EFAULT; + return (*func)(rn, ea); +} + +static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + if (!address_ok(regs, ea & ~0xfUL, 16)) + return -EFAULT; + return (*func)(rn, ea); +} +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + int err; + unsigned long val[2]; + + if (!address_ok(regs, ea, 16)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + err = read_mem_unaligned(&val[0], ea, 8, regs); + if (!err) + err = read_mem_unaligned(&val[1], ea + 8, 8, regs); + if (!err) + err = (*func)(rn, (unsigned long) &val[0]); + return err; +} + +static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + int err; + unsigned long val[2]; + + if (!address_ok(regs, ea, 16)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + err = (*func)(rn, (unsigned long) &val[0]); + if (err) + return err; + err = write_mem_unaligned(val[0], ea, 8, regs); + if (!err) + err = write_mem_unaligned(val[1], ea + 8, 8, regs); + return err; +} +#endif /* CONFIG_VSX */ + +#define __put_user_asmx(x, addr, err, op, cr) \ + __asm__ __volatile__( \ + "1: " op " %2,0,%3\n" \ + " mfcr %1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%4\n" \ + " b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err), "=r" (cr) \ + : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __get_user_asmx(x, addr, err, op) \ + __asm__ __volatile__( \ + "1: "op" %1,0,%2\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err), "=r" (x) \ + : "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __cacheop_user_asmx(addr, err, op) \ + __asm__ __volatile__( \ + "1: "op" 0,%1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err) \ + : "r" (addr), "i" (-EFAULT), "0" (err)) + +static void __kprobes set_cr0(struct pt_regs *regs, int rd) +{ + long val = regs->gpr[rd]; + + regs->ccr = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000); +#ifdef __powerpc64__ + if (!(regs->msr & MSR_64BIT)) + val = (int) val; +#endif + if (val < 0) + regs->ccr |= 0x80000000; + else if (val > 0) + regs->ccr |= 0x40000000; + else + regs->ccr |= 0x20000000; +} + +static void __kprobes add_with_carry(struct pt_regs *regs, int rd, + unsigned long val1, unsigned long val2, + unsigned long carry_in) +{ + unsigned long val = val1 + val2; + + if (carry_in) + ++val; + regs->gpr[rd] = val; +#ifdef __powerpc64__ + if (!(regs->msr & MSR_64BIT)) { + val = (unsigned int) val; + val1 = (unsigned int) val1; + } +#endif + if (val < val1 || (carry_in && val == val1)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; +} + +static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2, + int crfld) +{ + unsigned int crval, shift; + + crval = (regs->xer >> 31) & 1; /* get SO bit */ + if (v1 < v2) + crval |= 8; + else if (v1 > v2) + crval |= 4; + else + crval |= 2; + shift = (7 - crfld) * 4; + regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1, + unsigned long v2, int crfld) +{ + unsigned int crval, shift; + + crval = (regs->xer >> 31) & 1; /* get SO bit */ + if (v1 < v2) + crval |= 8; + else if (v1 > v2) + crval |= 4; + else + crval |= 2; + shift = (7 - crfld) * 4; + regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +/* + * Elements of 32-bit rotate and mask instructions. + */ +#define MASK32(mb, me) ((0xffffffffUL >> (mb)) + \ + ((signed long)-0x80000000L >> (me)) + ((me) >= (mb))) +#ifdef __powerpc64__ +#define MASK64_L(mb) (~0UL >> (mb)) +#define MASK64_R(me) ((signed long)-0x8000000000000000L >> (me)) +#define MASK64(mb, me) (MASK64_L(mb) + MASK64_R(me) + ((me) >= (mb))) +#define DATA32(x) (((x) & 0xffffffffUL) | (((x) & 0xffffffffUL) << 32)) +#else +#define DATA32(x) (x) +#endif +#define ROTATE(x, n) ((n) ? (((x) << (n)) | ((x) >> (8 * sizeof(long) - (n)))) : (x)) + +/* + * Emulate instructions that cause a transfer of control, + * loads and stores, and a few other instructions. * Returns 1 if the step was emulated, 0 if not, * or -1 if the instruction is one that should not be stepped, * such as an rfid, or a mtmsrd that would clear MSR_RI. */ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) { - unsigned int opcode, rs, rb, rd, spr; + unsigned int opcode, ra, rb, rd, spr, u; unsigned long int imm; + unsigned long int val, val2; + unsigned long int ea; + unsigned int cr, mb, me, sh; + int err; + unsigned long old_ra, val3; + long ival; opcode = instr >> 26; switch (opcode) { @@ -64,12 +651,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) if ((instr & 2) == 0) imm += regs->nip; regs->nip += 4; - if ((regs->msr & MSR_SF) == 0) - regs->nip &= 0xffffffffUL; + regs->nip = truncate_if_32bit(regs->msr, regs->nip); if (instr & 1) regs->link = regs->nip; if (branch_taken(instr, regs)) - regs->nip = imm; + regs->nip = truncate_if_32bit(regs->msr, imm); return 1; #ifdef CONFIG_PPC64 case 17: /* sc */ @@ -78,7 +664,13 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) * entry code works. If that is changed, this will * need to be changed also. */ + if (regs->gpr[0] == 0x1ebe && + cpu_has_feature(CPU_FTR_REAL_LE)) { + regs->msr ^= MSR_LE; + goto instr_done; + } regs->gpr[9] = regs->gpr[13]; + regs->gpr[10] = MSR_KERNEL; regs->gpr[11] = regs->nip + 4; regs->gpr[12] = regs->msr & MSR_MASK; regs->gpr[13] = (unsigned long) get_paca(); @@ -92,54 +684,250 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) imm -= 0x04000000; if ((instr & 2) == 0) imm += regs->nip; - if (instr & 1) { - regs->link = regs->nip + 4; - if ((regs->msr & MSR_SF) == 0) - regs->link &= 0xffffffffUL; - } - if ((regs->msr & MSR_SF) == 0) - imm &= 0xffffffffUL; + if (instr & 1) + regs->link = truncate_if_32bit(regs->msr, regs->nip + 4); + imm = truncate_if_32bit(regs->msr, imm); regs->nip = imm; return 1; case 19: - switch (instr & 0x7fe) { - case 0x20: /* bclr */ - case 0x420: /* bcctr */ + switch ((instr >> 1) & 0x3ff) { + case 16: /* bclr */ + case 528: /* bcctr */ imm = (instr & 0x400)? regs->ctr: regs->link; - regs->nip += 4; - if ((regs->msr & MSR_SF) == 0) { - regs->nip &= 0xffffffffUL; - imm &= 0xffffffffUL; - } + regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); + imm = truncate_if_32bit(regs->msr, imm); if (instr & 1) regs->link = regs->nip; if (branch_taken(instr, regs)) regs->nip = imm; return 1; - case 0x24: /* rfid, scary */ + + case 18: /* rfid, scary */ return -1; + + case 150: /* isync */ + isync(); + goto instr_done; + + case 33: /* crnor */ + case 129: /* crandc */ + case 193: /* crxor */ + case 225: /* crnand */ + case 257: /* crand */ + case 289: /* creqv */ + case 417: /* crorc */ + case 449: /* cror */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + rd = (instr >> 21) & 0x1f; + ra = (regs->ccr >> (31 - ra)) & 1; + rb = (regs->ccr >> (31 - rb)) & 1; + val = (instr >> (6 + ra * 2 + rb)) & 1; + regs->ccr = (regs->ccr & ~(1UL << (31 - rd))) | + (val << (31 - rd)); + goto instr_done; } + break; case 31: - rd = (instr >> 21) & 0x1f; - switch (instr & 0x7fe) { - case 0xa6: /* mfmsr */ + switch ((instr >> 1) & 0x3ff) { + case 598: /* sync */ +#ifdef __powerpc64__ + switch ((instr >> 21) & 3) { + case 1: /* lwsync */ + asm volatile("lwsync" : : : "memory"); + goto instr_done; + case 2: /* ptesync */ + asm volatile("ptesync" : : : "memory"); + goto instr_done; + } +#endif + mb(); + goto instr_done; + + case 854: /* eieio */ + eieio(); + goto instr_done; + } + break; + } + + /* Following cases refer to regs->gpr[], so we need all regs */ + if (!FULL_REGS(regs)) + return 0; + + rd = (instr >> 21) & 0x1f; + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + + switch (opcode) { + case 7: /* mulli */ + regs->gpr[rd] = regs->gpr[ra] * (short) instr; + goto instr_done; + + case 8: /* subfic */ + imm = (short) instr; + add_with_carry(regs, rd, ~regs->gpr[ra], imm, 1); + goto instr_done; + + case 10: /* cmpli */ + imm = (unsigned short) instr; + val = regs->gpr[ra]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) + val = (unsigned int) val; +#endif + do_cmp_unsigned(regs, val, imm, rd >> 2); + goto instr_done; + + case 11: /* cmpi */ + imm = (short) instr; + val = regs->gpr[ra]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) + val = (int) val; +#endif + do_cmp_signed(regs, val, imm, rd >> 2); + goto instr_done; + + case 12: /* addic */ + imm = (short) instr; + add_with_carry(regs, rd, regs->gpr[ra], imm, 0); + goto instr_done; + + case 13: /* addic. */ + imm = (short) instr; + add_with_carry(regs, rd, regs->gpr[ra], imm, 0); + set_cr0(regs, rd); + goto instr_done; + + case 14: /* addi */ + imm = (short) instr; + if (ra) + imm += regs->gpr[ra]; + regs->gpr[rd] = imm; + goto instr_done; + + case 15: /* addis */ + imm = ((short) instr) << 16; + if (ra) + imm += regs->gpr[ra]; + regs->gpr[rd] = imm; + goto instr_done; + + case 20: /* rlwimi */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + val = DATA32(regs->gpr[rd]); + imm = MASK32(mb, me); + regs->gpr[ra] = (regs->gpr[ra] & ~imm) | (ROTATE(val, rb) & imm); + goto logical_done; + + case 21: /* rlwinm */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + val = DATA32(regs->gpr[rd]); + regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me); + goto logical_done; + + case 23: /* rlwnm */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + rb = regs->gpr[rb] & 0x1f; + val = DATA32(regs->gpr[rd]); + regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me); + goto logical_done; + + case 24: /* ori */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] | imm; + goto instr_done; + + case 25: /* oris */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] | (imm << 16); + goto instr_done; + + case 26: /* xori */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] ^ imm; + goto instr_done; + + case 27: /* xoris */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] ^ (imm << 16); + goto instr_done; + + case 28: /* andi. */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] & imm; + set_cr0(regs, ra); + goto instr_done; + + case 29: /* andis. */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] & (imm << 16); + set_cr0(regs, ra); + goto instr_done; + +#ifdef __powerpc64__ + case 30: /* rld* */ + mb = ((instr >> 6) & 0x1f) | (instr & 0x20); + val = regs->gpr[rd]; + if ((instr & 0x10) == 0) { + sh = rb | ((instr & 2) << 4); + val = ROTATE(val, sh); + switch ((instr >> 2) & 3) { + case 0: /* rldicl */ + regs->gpr[ra] = val & MASK64_L(mb); + goto logical_done; + case 1: /* rldicr */ + regs->gpr[ra] = val & MASK64_R(mb); + goto logical_done; + case 2: /* rldic */ + regs->gpr[ra] = val & MASK64(mb, 63 - sh); + goto logical_done; + case 3: /* rldimi */ + imm = MASK64(mb, 63 - sh); + regs->gpr[ra] = (regs->gpr[ra] & ~imm) | + (val & imm); + goto logical_done; + } + } else { + sh = regs->gpr[rb] & 0x3f; + val = ROTATE(val, sh); + switch ((instr >> 1) & 7) { + case 0: /* rldcl */ + regs->gpr[ra] = val & MASK64_L(mb); + goto logical_done; + case 1: /* rldcr */ + regs->gpr[ra] = val & MASK64_R(mb); + goto logical_done; + } + } +#endif + + case 31: + switch ((instr >> 1) & 0x3ff) { + case 83: /* mfmsr */ + if (regs->msr & MSR_PR) + break; regs->gpr[rd] = regs->msr & MSR_MASK; - regs->nip += 4; - if ((regs->msr & MSR_SF) == 0) - regs->nip &= 0xffffffffUL; - return 1; - case 0x124: /* mtmsr */ + goto instr_done; + case 146: /* mtmsr */ + if (regs->msr & MSR_PR) + break; imm = regs->gpr[rd]; if ((imm & MSR_RI) == 0) /* can't step mtmsr that would clear MSR_RI */ return -1; regs->msr = imm; - regs->nip += 4; - return 1; + goto instr_done; #ifdef CONFIG_PPC64 - case 0x164: /* mtmsrd */ + case 178: /* mtmsrd */ /* only MSR_EE and MSR_RI get changed if bit 15 set */ /* mtmsrd doesn't change MSR_HV and MSR_ME */ + if (regs->msr & MSR_PR) + break; imm = (instr & 0x10000)? 0x8002: 0xefffffffffffefffUL; imm = (regs->msr & MSR_MASK & ~imm) | (regs->gpr[rd] & imm); @@ -147,55 +935,803 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) /* can't step mtmsrd that would clear MSR_RI */ return -1; regs->msr = imm; - regs->nip += 4; - if ((imm & MSR_SF) == 0) - regs->nip &= 0xffffffffUL; - return 1; + goto instr_done; #endif - case 0x26: /* mfcr */ + case 19: /* mfcr */ regs->gpr[rd] = regs->ccr; regs->gpr[rd] &= 0xffffffffUL; - goto mtspr_out; - case 0x2a6: /* mfspr */ + goto instr_done; + + case 144: /* mtcrf */ + imm = 0xf0000000UL; + val = regs->gpr[rd]; + for (sh = 0; sh < 8; ++sh) { + if (instr & (0x80000 >> sh)) + regs->ccr = (regs->ccr & ~imm) | + (val & imm); + imm >>= 4; + } + goto instr_done; + + case 339: /* mfspr */ spr = (instr >> 11) & 0x3ff; switch (spr) { case 0x20: /* mfxer */ regs->gpr[rd] = regs->xer; regs->gpr[rd] &= 0xffffffffUL; - goto mtspr_out; + goto instr_done; case 0x100: /* mflr */ regs->gpr[rd] = regs->link; - goto mtspr_out; + goto instr_done; case 0x120: /* mfctr */ regs->gpr[rd] = regs->ctr; - goto mtspr_out; - } - break; - case 0x378: /* orx */ - rs = (instr >> 21) & 0x1f; - rb = (instr >> 11) & 0x1f; - if (rs == rb) { /* mr */ - rd = (instr >> 16) & 0x1f; - regs->gpr[rd] = regs->gpr[rs]; - goto mtspr_out; + goto instr_done; } break; - case 0x3a6: /* mtspr */ + + case 467: /* mtspr */ spr = (instr >> 11) & 0x3ff; switch (spr) { case 0x20: /* mtxer */ regs->xer = (regs->gpr[rd] & 0xffffffffUL); - goto mtspr_out; + goto instr_done; case 0x100: /* mtlr */ regs->link = regs->gpr[rd]; - goto mtspr_out; + goto instr_done; case 0x120: /* mtctr */ regs->ctr = regs->gpr[rd]; -mtspr_out: - regs->nip += 4; - return 1; + goto instr_done; + } + break; + +/* + * Compare instructions + */ + case 0: /* cmp */ + val = regs->gpr[ra]; + val2 = regs->gpr[rb]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) { + /* word (32-bit) compare */ + val = (int) val; + val2 = (int) val2; + } +#endif + do_cmp_signed(regs, val, val2, rd >> 2); + goto instr_done; + + case 32: /* cmpl */ + val = regs->gpr[ra]; + val2 = regs->gpr[rb]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) { + /* word (32-bit) compare */ + val = (unsigned int) val; + val2 = (unsigned int) val2; + } +#endif + do_cmp_unsigned(regs, val, val2, rd >> 2); + goto instr_done; + +/* + * Arithmetic instructions + */ + case 8: /* subfc */ + add_with_carry(regs, rd, ~regs->gpr[ra], + regs->gpr[rb], 1); + goto arith_done; +#ifdef __powerpc64__ + case 9: /* mulhdu */ + asm("mulhdu %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 10: /* addc */ + add_with_carry(regs, rd, regs->gpr[ra], + regs->gpr[rb], 0); + goto arith_done; + + case 11: /* mulhwu */ + asm("mulhwu %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; + + case 40: /* subf */ + regs->gpr[rd] = regs->gpr[rb] - regs->gpr[ra]; + goto arith_done; +#ifdef __powerpc64__ + case 73: /* mulhd */ + asm("mulhd %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 75: /* mulhw */ + asm("mulhw %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; + + case 104: /* neg */ + regs->gpr[rd] = -regs->gpr[ra]; + goto arith_done; + + case 136: /* subfe */ + add_with_carry(regs, rd, ~regs->gpr[ra], regs->gpr[rb], + regs->xer & XER_CA); + goto arith_done; + + case 138: /* adde */ + add_with_carry(regs, rd, regs->gpr[ra], regs->gpr[rb], + regs->xer & XER_CA); + goto arith_done; + + case 200: /* subfze */ + add_with_carry(regs, rd, ~regs->gpr[ra], 0L, + regs->xer & XER_CA); + goto arith_done; + + case 202: /* addze */ + add_with_carry(regs, rd, regs->gpr[ra], 0L, + regs->xer & XER_CA); + goto arith_done; + + case 232: /* subfme */ + add_with_carry(regs, rd, ~regs->gpr[ra], -1L, + regs->xer & XER_CA); + goto arith_done; +#ifdef __powerpc64__ + case 233: /* mulld */ + regs->gpr[rd] = regs->gpr[ra] * regs->gpr[rb]; + goto arith_done; +#endif + case 234: /* addme */ + add_with_carry(regs, rd, regs->gpr[ra], -1L, + regs->xer & XER_CA); + goto arith_done; + + case 235: /* mullw */ + regs->gpr[rd] = (unsigned int) regs->gpr[ra] * + (unsigned int) regs->gpr[rb]; + goto arith_done; + + case 266: /* add */ + regs->gpr[rd] = regs->gpr[ra] + regs->gpr[rb]; + goto arith_done; +#ifdef __powerpc64__ + case 457: /* divdu */ + regs->gpr[rd] = regs->gpr[ra] / regs->gpr[rb]; + goto arith_done; +#endif + case 459: /* divwu */ + regs->gpr[rd] = (unsigned int) regs->gpr[ra] / + (unsigned int) regs->gpr[rb]; + goto arith_done; +#ifdef __powerpc64__ + case 489: /* divd */ + regs->gpr[rd] = (long int) regs->gpr[ra] / + (long int) regs->gpr[rb]; + goto arith_done; +#endif + case 491: /* divw */ + regs->gpr[rd] = (int) regs->gpr[ra] / + (int) regs->gpr[rb]; + goto arith_done; + + +/* + * Logical instructions + */ + case 26: /* cntlzw */ + asm("cntlzw %0,%1" : "=r" (regs->gpr[ra]) : + "r" (regs->gpr[rd])); + goto logical_done; +#ifdef __powerpc64__ + case 58: /* cntlzd */ + asm("cntlzd %0,%1" : "=r" (regs->gpr[ra]) : + "r" (regs->gpr[rd])); + goto logical_done; +#endif + case 28: /* and */ + regs->gpr[ra] = regs->gpr[rd] & regs->gpr[rb]; + goto logical_done; + + case 60: /* andc */ + regs->gpr[ra] = regs->gpr[rd] & ~regs->gpr[rb]; + goto logical_done; + + case 124: /* nor */ + regs->gpr[ra] = ~(regs->gpr[rd] | regs->gpr[rb]); + goto logical_done; + + case 284: /* xor */ + regs->gpr[ra] = ~(regs->gpr[rd] ^ regs->gpr[rb]); + goto logical_done; + + case 316: /* xor */ + regs->gpr[ra] = regs->gpr[rd] ^ regs->gpr[rb]; + goto logical_done; + + case 412: /* orc */ + regs->gpr[ra] = regs->gpr[rd] | ~regs->gpr[rb]; + goto logical_done; + + case 444: /* or */ + regs->gpr[ra] = regs->gpr[rd] | regs->gpr[rb]; + goto logical_done; + + case 476: /* nand */ + regs->gpr[ra] = ~(regs->gpr[rd] & regs->gpr[rb]); + goto logical_done; + + case 922: /* extsh */ + regs->gpr[ra] = (signed short) regs->gpr[rd]; + goto logical_done; + + case 954: /* extsb */ + regs->gpr[ra] = (signed char) regs->gpr[rd]; + goto logical_done; +#ifdef __powerpc64__ + case 986: /* extsw */ + regs->gpr[ra] = (signed int) regs->gpr[rd]; + goto logical_done; +#endif + +/* + * Shift instructions + */ + case 24: /* slw */ + sh = regs->gpr[rb] & 0x3f; + if (sh < 32) + regs->gpr[ra] = (regs->gpr[rd] << sh) & 0xffffffffUL; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 536: /* srw */ + sh = regs->gpr[rb] & 0x3f; + if (sh < 32) + regs->gpr[ra] = (regs->gpr[rd] & 0xffffffffUL) >> sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 792: /* sraw */ + sh = regs->gpr[rb] & 0x3f; + ival = (signed int) regs->gpr[rd]; + regs->gpr[ra] = ival >> (sh < 32 ? sh : 31); + if (ival < 0 && (sh >= 32 || (ival & ((1ul << sh) - 1)) != 0)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + + case 824: /* srawi */ + sh = rb; + ival = (signed int) regs->gpr[rd]; + regs->gpr[ra] = ival >> sh; + if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + +#ifdef __powerpc64__ + case 27: /* sld */ + sh = regs->gpr[rb] & 0x7f; + if (sh < 64) + regs->gpr[ra] = regs->gpr[rd] << sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 539: /* srd */ + sh = regs->gpr[rb] & 0x7f; + if (sh < 64) + regs->gpr[ra] = regs->gpr[rd] >> sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 794: /* srad */ + sh = regs->gpr[rb] & 0x7f; + ival = (signed long int) regs->gpr[rd]; + regs->gpr[ra] = ival >> (sh < 64 ? sh : 63); + if (ival < 0 && (sh >= 64 || (ival & ((1ul << sh) - 1)) != 0)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + + case 826: /* sradi with sh_5 = 0 */ + case 827: /* sradi with sh_5 = 1 */ + sh = rb | ((instr & 2) << 4); + ival = (signed long int) regs->gpr[rd]; + regs->gpr[ra] = ival >> sh; + if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; +#endif /* __powerpc64__ */ + +/* + * Cache instructions + */ + case 54: /* dcbst */ + ea = xform_ea(instr, regs, 0); + if (!address_ok(regs, ea, 8)) + return 0; + err = 0; + __cacheop_user_asmx(ea, err, "dcbst"); + if (err) + return 0; + goto instr_done; + + case 86: /* dcbf */ + ea = xform_ea(instr, regs, 0); + if (!address_ok(regs, ea, 8)) + return 0; + err = 0; + __cacheop_user_asmx(ea, err, "dcbf"); + if (err) + return 0; + goto instr_done; + + case 246: /* dcbtst */ + if (rd == 0) { + ea = xform_ea(instr, regs, 0); + prefetchw((void *) ea); + } + goto instr_done; + + case 278: /* dcbt */ + if (rd == 0) { + ea = xform_ea(instr, regs, 0); + prefetch((void *) ea); } + goto instr_done; + } + break; } - return 0; + + /* + * Following cases are for loads and stores, so bail out + * if we're in little-endian mode. + */ + if (regs->msr & MSR_LE) + return 0; + + /* + * Save register RA in case it's an update form load or store + * and the access faults. + */ + old_ra = regs->gpr[ra]; + + switch (opcode) { + case 31: + u = instr & 0x40; + switch ((instr >> 1) & 0x3ff) { + case 20: /* lwarx */ + ea = xform_ea(instr, regs, 0); + if (ea & 3) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, ea, 4)) + goto ldst_done; + err = 0; + __get_user_asmx(val, ea, err, "lwarx"); + if (!err) + regs->gpr[rd] = val; + goto ldst_done; + + case 150: /* stwcx. */ + ea = xform_ea(instr, regs, 0); + if (ea & 3) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, ea, 4)) + goto ldst_done; + err = 0; + __put_user_asmx(regs->gpr[rd], ea, err, "stwcx.", cr); + if (!err) + regs->ccr = (regs->ccr & 0x0fffffff) | + (cr & 0xe0000000) | + ((regs->xer >> 3) & 0x10000000); + goto ldst_done; + +#ifdef __powerpc64__ + case 84: /* ldarx */ + ea = xform_ea(instr, regs, 0); + if (ea & 7) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, ea, 8)) + goto ldst_done; + err = 0; + __get_user_asmx(val, ea, err, "ldarx"); + if (!err) + regs->gpr[rd] = val; + goto ldst_done; + + case 214: /* stdcx. */ + ea = xform_ea(instr, regs, 0); + if (ea & 7) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, ea, 8)) + goto ldst_done; + err = 0; + __put_user_asmx(regs->gpr[rd], ea, err, "stdcx.", cr); + if (!err) + regs->ccr = (regs->ccr & 0x0fffffff) | + (cr & 0xe0000000) | + ((regs->xer >> 3) & 0x10000000); + goto ldst_done; + + case 21: /* ldx */ + case 53: /* ldux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 8, regs); + goto ldst_done; +#endif + + case 23: /* lwzx */ + case 55: /* lwzux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 4, regs); + goto ldst_done; + + case 87: /* lbzx */ + case 119: /* lbzux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 1, regs); + goto ldst_done; + +#ifdef CONFIG_ALTIVEC + case 103: /* lvx */ + case 359: /* lvxl */ + if (!(regs->msr & MSR_VEC)) + break; + ea = xform_ea(instr, regs, 0); + err = do_vec_load(rd, do_lvx, ea, regs); + goto ldst_done; + + case 231: /* stvx */ + case 487: /* stvxl */ + if (!(regs->msr & MSR_VEC)) + break; + ea = xform_ea(instr, regs, 0); + err = do_vec_store(rd, do_stvx, ea, regs); + goto ldst_done; +#endif /* CONFIG_ALTIVEC */ + +#ifdef __powerpc64__ + case 149: /* stdx */ + case 181: /* stdux */ + val = regs->gpr[rd]; + err = write_mem(val, xform_ea(instr, regs, u), 8, regs); + goto ldst_done; +#endif + + case 151: /* stwx */ + case 183: /* stwux */ + val = regs->gpr[rd]; + err = write_mem(val, xform_ea(instr, regs, u), 4, regs); + goto ldst_done; + + case 215: /* stbx */ + case 247: /* stbux */ + val = regs->gpr[rd]; + err = write_mem(val, xform_ea(instr, regs, u), 1, regs); + goto ldst_done; + + case 279: /* lhzx */ + case 311: /* lhzux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 2, regs); + goto ldst_done; + +#ifdef __powerpc64__ + case 341: /* lwax */ + case 373: /* lwaux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 4, regs); + if (!err) + regs->gpr[rd] = (signed int) regs->gpr[rd]; + goto ldst_done; +#endif + + case 343: /* lhax */ + case 375: /* lhaux */ + err = read_mem(®s->gpr[rd], xform_ea(instr, regs, u), + 2, regs); + if (!err) + regs->gpr[rd] = (signed short) regs->gpr[rd]; + goto ldst_done; + + case 407: /* sthx */ + case 439: /* sthux */ + val = regs->gpr[rd]; + err = write_mem(val, xform_ea(instr, regs, u), 2, regs); + goto ldst_done; + +#ifdef __powerpc64__ + case 532: /* ldbrx */ + err = read_mem(&val, xform_ea(instr, regs, 0), 8, regs); + if (!err) + regs->gpr[rd] = byterev_8(val); + goto ldst_done; + +#endif + + case 534: /* lwbrx */ + err = read_mem(&val, xform_ea(instr, regs, 0), 4, regs); + if (!err) + regs->gpr[rd] = byterev_4(val); + goto ldst_done; + +#ifdef CONFIG_PPC_FPU + case 535: /* lfsx */ + case 567: /* lfsux */ + if (!(regs->msr & MSR_FP)) + break; + ea = xform_ea(instr, regs, u); + err = do_fp_load(rd, do_lfs, ea, 4, regs); + goto ldst_done; + + case 599: /* lfdx */ + case 631: /* lfdux */ + if (!(regs->msr & MSR_FP)) + break; + ea = xform_ea(instr, regs, u); + err = do_fp_load(rd, do_lfd, ea, 8, regs); + goto ldst_done; + + case 663: /* stfsx */ + case 695: /* stfsux */ + if (!(regs->msr & MSR_FP)) + break; + ea = xform_ea(instr, regs, u); + err = do_fp_store(rd, do_stfs, ea, 4, regs); + goto ldst_done; + + case 727: /* stfdx */ + case 759: /* stfdux */ + if (!(regs->msr & MSR_FP)) + break; + ea = xform_ea(instr, regs, u); + err = do_fp_store(rd, do_stfd, ea, 8, regs); + goto ldst_done; +#endif + +#ifdef __powerpc64__ + case 660: /* stdbrx */ + val = byterev_8(regs->gpr[rd]); + err = write_mem(val, xform_ea(instr, regs, 0), 8, regs); + goto ldst_done; + +#endif + case 662: /* stwbrx */ + val = byterev_4(regs->gpr[rd]); + err = write_mem(val, xform_ea(instr, regs, 0), 4, regs); + goto ldst_done; + + case 790: /* lhbrx */ + err = read_mem(&val, xform_ea(instr, regs, 0), 2, regs); + if (!err) + regs->gpr[rd] = byterev_2(val); + goto ldst_done; + + case 918: /* sthbrx */ + val = byterev_2(regs->gpr[rd]); + err = write_mem(val, xform_ea(instr, regs, 0), 2, regs); + goto ldst_done; + +#ifdef CONFIG_VSX + case 844: /* lxvd2x */ + case 876: /* lxvd2ux */ + if (!(regs->msr & MSR_VSX)) + break; + rd |= (instr & 1) << 5; + ea = xform_ea(instr, regs, u); + err = do_vsx_load(rd, do_lxvd2x, ea, regs); + goto ldst_done; + + case 972: /* stxvd2x */ + case 1004: /* stxvd2ux */ + if (!(regs->msr & MSR_VSX)) + break; + rd |= (instr & 1) << 5; + ea = xform_ea(instr, regs, u); + err = do_vsx_store(rd, do_stxvd2x, ea, regs); + goto ldst_done; + +#endif /* CONFIG_VSX */ + } + break; + + case 32: /* lwz */ + case 33: /* lwzu */ + err = read_mem(®s->gpr[rd], dform_ea(instr, regs), 4, regs); + goto ldst_done; + + case 34: /* lbz */ + case 35: /* lbzu */ + err = read_mem(®s->gpr[rd], dform_ea(instr, regs), 1, regs); + goto ldst_done; + + case 36: /* stw */ + val = regs->gpr[rd]; + err = write_mem(val, dform_ea(instr, regs), 4, regs); + goto ldst_done; + + case 37: /* stwu */ + val = regs->gpr[rd]; + val3 = dform_ea(instr, regs); + /* + * For PPC32 we always use stwu to change stack point with r1. So + * this emulated store may corrupt the exception frame, now we + * have to provide the exception frame trampoline, which is pushed + * below the kprobed function stack. So we only update gpr[1] but + * don't emulate the real store operation. We will do real store + * operation safely in exception return code by checking this flag. + */ + if ((ra == 1) && !(regs->msr & MSR_PR) \ + && (val3 >= (regs->gpr[1] - STACK_INT_FRAME_SIZE))) { +#ifdef CONFIG_PPC32 + /* + * Check if we will touch kernel sack overflow + */ + if (val3 - STACK_INT_FRAME_SIZE <= current->thread.ksp_limit) { + printk(KERN_CRIT "Can't kprobe this since Kernel stack overflow.\n"); + err = -EINVAL; + break; + } +#endif /* CONFIG_PPC32 */ + /* + * Check if we already set since that means we'll + * lose the previous value. + */ + WARN_ON(test_thread_flag(TIF_EMULATE_STACK_STORE)); + set_thread_flag(TIF_EMULATE_STACK_STORE); + err = 0; + } else + err = write_mem(val, val3, 4, regs); + goto ldst_done; + + case 38: /* stb */ + case 39: /* stbu */ + val = regs->gpr[rd]; + err = write_mem(val, dform_ea(instr, regs), 1, regs); + goto ldst_done; + + case 40: /* lhz */ + case 41: /* lhzu */ + err = read_mem(®s->gpr[rd], dform_ea(instr, regs), 2, regs); + goto ldst_done; + + case 42: /* lha */ + case 43: /* lhau */ + err = read_mem(®s->gpr[rd], dform_ea(instr, regs), 2, regs); + if (!err) + regs->gpr[rd] = (signed short) regs->gpr[rd]; + goto ldst_done; + + case 44: /* sth */ + case 45: /* sthu */ + val = regs->gpr[rd]; + err = write_mem(val, dform_ea(instr, regs), 2, regs); + goto ldst_done; + + case 46: /* lmw */ + ra = (instr >> 16) & 0x1f; + if (ra >= rd) + break; /* invalid form, ra in range to load */ + ea = dform_ea(instr, regs); + do { + err = read_mem(®s->gpr[rd], ea, 4, regs); + if (err) + return 0; + ea += 4; + } while (++rd < 32); + goto instr_done; + + case 47: /* stmw */ + ea = dform_ea(instr, regs); + do { + err = write_mem(regs->gpr[rd], ea, 4, regs); + if (err) + return 0; + ea += 4; + } while (++rd < 32); + goto instr_done; + +#ifdef CONFIG_PPC_FPU + case 48: /* lfs */ + case 49: /* lfsu */ + if (!(regs->msr & MSR_FP)) + break; + ea = dform_ea(instr, regs); + err = do_fp_load(rd, do_lfs, ea, 4, regs); + goto ldst_done; + + case 50: /* lfd */ + case 51: /* lfdu */ + if (!(regs->msr & MSR_FP)) + break; + ea = dform_ea(instr, regs); + err = do_fp_load(rd, do_lfd, ea, 8, regs); + goto ldst_done; + + case 52: /* stfs */ + case 53: /* stfsu */ + if (!(regs->msr & MSR_FP)) + break; + ea = dform_ea(instr, regs); + err = do_fp_store(rd, do_stfs, ea, 4, regs); + goto ldst_done; + + case 54: /* stfd */ + case 55: /* stfdu */ + if (!(regs->msr & MSR_FP)) + break; + ea = dform_ea(instr, regs); + err = do_fp_store(rd, do_stfd, ea, 8, regs); + goto ldst_done; +#endif + +#ifdef __powerpc64__ + case 58: /* ld[u], lwa */ + switch (instr & 3) { + case 0: /* ld */ + err = read_mem(®s->gpr[rd], dsform_ea(instr, regs), + 8, regs); + goto ldst_done; + case 1: /* ldu */ + err = read_mem(®s->gpr[rd], dsform_ea(instr, regs), + 8, regs); + goto ldst_done; + case 2: /* lwa */ + err = read_mem(®s->gpr[rd], dsform_ea(instr, regs), + 4, regs); + if (!err) + regs->gpr[rd] = (signed int) regs->gpr[rd]; + goto ldst_done; + } + break; + + case 62: /* std[u] */ + val = regs->gpr[rd]; + switch (instr & 3) { + case 0: /* std */ + err = write_mem(val, dsform_ea(instr, regs), 8, regs); + goto ldst_done; + case 1: /* stdu */ + err = write_mem(val, dsform_ea(instr, regs), 8, regs); + goto ldst_done; + } + break; +#endif /* __powerpc64__ */ + + } + err = -EINVAL; + + ldst_done: + if (err) { + regs->gpr[ra] = old_ra; + return 0; /* invoke DSI if -EFAULT? */ + } + instr_done: + regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); + return 1; + + logical_done: + if (instr & 1) + set_cr0(regs, ra); + goto instr_done; + + arith_done: + if (instr & 1) + set_cr0(regs, rd); + goto instr_done; } diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S index c4c622d8e6a..1b5a0a09d60 100644 --- a/arch/powerpc/lib/string.S +++ b/arch/powerpc/lib/string.S @@ -13,13 +13,7 @@ #include <asm/ppc_asm.h> .section __ex_table,"a" -#ifdef CONFIG_PPC64 - .align 3 -#define EXTBL .llong -#else - .align 2 -#define EXTBL .long -#endif + PPC_LONG_ALIGN .text _GLOBAL(strcpy) @@ -34,7 +28,7 @@ _GLOBAL(strcpy) /* This clears out any unused part of the destination buffer, just as the libc version does. -- paulus */ _GLOBAL(strncpy) - cmpwi 0,r5,0 + PPC_LCMPI 0,r5,0 beqlr mtctr r5 addi r6,r3,-1 @@ -45,7 +39,7 @@ _GLOBAL(strncpy) bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */ bnelr /* if we didn't hit a null char, we're done */ mfctr r5 - cmpwi 0,r5,0 /* any space left in destination buffer? */ + PPC_LCMPI 0,r5,0 /* any space left in destination buffer? */ beqlr /* we know r0 == 0 here */ 2: stbu r0,1(r6) /* clear it out if so */ bdnz 2b @@ -75,6 +69,22 @@ _GLOBAL(strcmp) beq 1b blr +_GLOBAL(strncmp) + PPC_LCMPI 0,r5,0 + beq- 2f + mtctr r5 + addi r5,r3,-1 + addi r4,r4,-1 +1: lbzu r3,1(r5) + cmpwi 1,r3,0 + lbzu r0,1(r4) + subf. r3,r0,r3 + beqlr 1 + bdnzt eq,1b + blr +2: li r3,0 + blr + _GLOBAL(strlen) addi r4,r3,-1 1: lbzu r0,1(r4) @@ -84,8 +94,8 @@ _GLOBAL(strlen) blr _GLOBAL(memcmp) - cmpwi 0,r5,0 - ble- 2f + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r6,r3,-1 addi r4,r4,-1 @@ -98,8 +108,8 @@ _GLOBAL(memcmp) blr _GLOBAL(memchr) - cmpwi 0,r5,0 - ble- 2f + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r3,r3,-1 1: lbzu r0,1(r3) @@ -109,6 +119,7 @@ _GLOBAL(memchr) 2: li r3,0 blr +#ifdef CONFIG_PPC32 _GLOBAL(__clear_user) addi r6,r3,-4 li r3,0 @@ -146,52 +157,8 @@ _GLOBAL(__clear_user) blr .section __ex_table,"a" - EXTBL 11b,90b - EXTBL 1b,91b - EXTBL 8b,92b + PPC_LONG 11b,90b + PPC_LONG 1b,91b + PPC_LONG 8b,92b .text - -_GLOBAL(__strncpy_from_user) - addi r6,r3,-1 - addi r4,r4,-1 - cmpwi 0,r5,0 - beq 2f - mtctr r5 -1: lbzu r0,1(r4) - cmpwi 0,r0,0 - stbu r0,1(r6) - bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */ - beq 3f -2: addi r6,r6,1 -3: subf r3,r3,r6 - blr -99: li r3,-EFAULT - blr - - .section __ex_table,"a" - EXTBL 1b,99b - .text - -/* r3 = str, r4 = len (> 0), r5 = top (highest addr) */ -_GLOBAL(__strnlen_user) - addi r7,r3,-1 - subf r6,r7,r5 /* top+1 - str */ - cmplw 0,r4,r6 - bge 0f - mr r6,r4 -0: mtctr r6 /* ctr = min(len, top - str) */ -1: lbzu r0,1(r7) /* get next byte */ - cmpwi 0,r0,0 - bdnzf 2,1b /* loop if --ctr != 0 && byte != 0 */ - addi r7,r7,1 - subf r3,r3,r7 /* number of bytes we have looked at */ - beqlr /* return if we found a 0 byte */ - cmpw 0,r3,r4 /* did we look at all len bytes? */ - blt 99f /* if not, must have hit top */ - addi r3,r4,1 /* return len + 1 to indicate no null found */ - blr -99: li r3,0 /* bad address, return 0 */ - blr - - .section __ex_table,"a" - EXTBL 1b,99b +#endif diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S new file mode 100644 index 00000000000..7bd9549a90a --- /dev/null +++ b/arch/powerpc/lib/string_64.S @@ -0,0 +1,202 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + + .section ".toc","aw" +PPC64_CACHES: + .tc ppc64_caches[TC],ppc64_caches + .section ".text" + +/** + * __clear_user: - Zero a block of memory in user space, with less checking. + * @to: Destination address, in user space. + * @n: Number of bytes to zero. + * + * Zero a block of memory in user space. Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. + */ + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + +.Ldo_err1: + mr r3,r8 + +.Ldo_err2: + mtctr r4 +1: +err3; stb r0,0(r3) + addi r3,r3,1 + addi r4,r4,-1 + bdnz 1b + +.Ldo_err3: + mr r3,r4 + blr + +_GLOBAL_TOC(__clear_user) + cmpdi r4,32 + neg r6,r3 + li r0,0 + blt .Lshort_clear + mr r8,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + /* Get the destination 8 byte aligned */ + bf cr7*4+3,1f +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r4,r4,r6 + + cmpdi r4,32 + cmpdi cr1,r4,512 + blt .Lshort_clear + bgt cr1,.Llong_clear + +.Lmedium_clear: + srdi r6,r4,5 + mtctr r6 + + /* Do 32 byte chunks */ +4: +err2; std r0,0(r3) +err2; std r0,8(r3) +err2; std r0,16(r3) +err2; std r0,24(r3) + addi r3,r3,32 + addi r4,r4,-32 + bdnz 4b + +.Lshort_clear: + /* up to 31 bytes to go */ + cmpdi r4,16 + blt 6f +err2; std r0,0(r3) +err2; std r0,8(r3) + addi r3,r3,16 + addi r4,r4,-16 + + /* Up to 15 bytes to go */ +6: mr r8,r3 + clrldi r4,r4,(64-4) + mtocrf 0x01,r4 + bf cr7*4+0,7f +err1; std r0,0(r3) + addi r3,r3,8 + +7: bf cr7*4+1,8f +err1; stw r0,0(r3) + addi r3,r3,4 + +8: bf cr7*4+2,9f +err1; sth r0,0(r3) + addi r3,r3,2 + +9: bf cr7*4+3,10f +err1; stb r0,0(r3) + +10: li r3,0 + blr + +.Llong_clear: + ld r5,PPC64_CACHES@toc(r2) + + bf cr7*4+0,11f +err2; std r0,0(r3) + addi r3,r3,8 + addi r4,r4,-8 + + /* Destination is 16 byte aligned, need to get it cacheline aligned */ +11: lwz r7,DCACHEL1LOGLINESIZE(r5) + lwz r9,DCACHEL1LINESIZE(r5) + + /* + * With worst case alignment the long clear loop takes a minimum + * of 1 byte less than 2 cachelines. + */ + sldi r10,r9,2 + cmpd r4,r10 + blt .Lmedium_clear + + neg r6,r3 + addi r10,r9,-1 + and. r5,r6,r10 + beq 13f + + srdi r6,r5,4 + mtctr r6 + mr r8,r3 +12: +err1; std r0,0(r3) +err1; std r0,8(r3) + addi r3,r3,16 + bdnz 12b + + sub r4,r4,r5 + +13: srd r6,r4,r7 + mtctr r6 + mr r8,r3 +14: +err1; dcbz r0,r3 + add r3,r3,r9 + bdnz 14b + + and r4,r4,r10 + + cmpdi r4,32 + blt .Lshort_clear + b .Lmedium_clear diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c new file mode 100644 index 00000000000..3cf529ceec5 --- /dev/null +++ b/arch/powerpc/lib/vmx-helper.c @@ -0,0 +1,74 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Authors: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> + * Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <asm/switch_to.h> + +int enter_vmx_usercopy(void) +{ + if (in_interrupt()) + return 0; + + /* This acts as preempt_disable() as well and will make + * enable_kernel_altivec(). We need to disable page faults + * as they can call schedule and thus make us lose the VMX + * context. So on page faults, we just fail which will cause + * a fallback to the normal non-vmx copy. + */ + pagefault_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * This function must return 0 because we tail call optimise when calling + * from __copy_tofrom_user_power7 which returns 0 on success. + */ +int exit_vmx_usercopy(void) +{ + pagefault_enable(); + return 0; +} + +int enter_vmx_copy(void) +{ + if (in_interrupt()) + return 0; + + preempt_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * All calls to this function will be optimised into tail calls. We are + * passed a pointer to the destination which we return as required by a + * memcpy implementation. + */ +void *exit_vmx_copy(void *dest) +{ + preempt_enable(); + return dest; +} diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c new file mode 100644 index 00000000000..e905f7c2ea7 --- /dev/null +++ b/arch/powerpc/lib/xor_vmx.c @@ -0,0 +1,177 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <altivec.h> + +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <asm/switch_to.h> + +typedef vector signed char unative_t; + +#define DEFINE(V) \ + unative_t *V = (unative_t *)V##_in; \ + unative_t V##_0, V##_1, V##_2, V##_3 + +#define LOAD(V) \ + do { \ + V##_0 = V[0]; \ + V##_1 = V[1]; \ + V##_2 = V[2]; \ + V##_3 = V[3]; \ + } while (0) + +#define STORE(V) \ + do { \ + V[0] = V##_0; \ + V[1] = V##_1; \ + V[2] = V##_2; \ + V[3] = V##_3; \ + } while (0) + +#define XOR(V1, V2) \ + do { \ + V1##_0 = vec_xor(V1##_0, V2##_0); \ + V1##_1 = vec_xor(V1##_1, V2##_1); \ + V1##_2 = vec_xor(V1##_2, V2##_2); \ + V1##_3 = vec_xor(V1##_3, V2##_3); \ + } while (0) + +void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in) +{ + DEFINE(v1); + DEFINE(v2); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + XOR(v1, v2); + STORE(v1); + + v1 += 4; + v2 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_2); + +void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + XOR(v1, v2); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_3); + +void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + DEFINE(v4); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + LOAD(v4); + XOR(v1, v2); + XOR(v3, v4); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + v4 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_4); + +void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long *v5_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + DEFINE(v4); + DEFINE(v5); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + LOAD(v4); + LOAD(v5); + XOR(v1, v2); + XOR(v3, v4); + XOR(v1, v5); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + v4 += 4; + v5 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_5); |
