27 files changed, 7147 insertions, 914 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 4bb023f4c86..59fa2de9546 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -2,24 +2,41 @@
 # Makefile for ppc-specific library files..
 #
 
-ifeq ($(CONFIG_PPC64),y)
-EXTRA_CFLAGS		+= -mno-minimal-toc
-endif
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
+
+CFLAGS_REMOVE_code-patching.o = -pg
+CFLAGS_REMOVE_feature-fixups.o = -pg
 
-ifeq ($(CONFIG_PPC_MERGE),y)
 obj-y			:= string.o alloc.o \
-			   checksum_$(CONFIG_WORD_SIZE).o
+			   crtsavres.o
 obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
-endif
+obj-$(CONFIG_HAS_IOMEM)	+= devres.o
 
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
-			   memcpy_64.o usercopy_64.o mem_64.o string.o
-obj-$(CONFIG_XMON)	+= sstep.o
-obj-$(CONFIG_KPROBES)	+= sstep.o
-obj-$(CONFIG_NOT_COHERENT_CACHE)	+= dma-noncoherent.o
+			   usercopy_64.o mem_64.o string.o \
+			   hweight_64.o \
+			   copyuser_power7.o string_64.o copypage_power7.o
+ifeq ($(CONFIG_GENERIC_CSUM),)
+obj-y			+= checksum_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC64)	+= checksum_wrappers_64.o
+endif
+
+obj-$(CONFIG_PPC64)		+= memcpy_power7.o memcpy_64.o 
+
+obj-$(CONFIG_PPC_EMULATE_SSTEP)	+= sstep.o ldstfp.o
 
 ifeq ($(CONFIG_PPC64),y)
 obj-$(CONFIG_SMP)	+= locks.o
+obj-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 endif
 
 obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
+
+obj-y			+= code-patching.o
+obj-y			+= feature-fixups.o
+obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o
+
+obj-$(CONFIG_ALTIVEC)	+= xor_vmx.o
+CFLAGS_xor_vmx.o += -maltivec -mabi=altivec
diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c
index f53e09c7dac..da22c84a8fe 100644
--- a/arch/powerpc/lib/alloc.c
+++ b/arch/powerpc/lib/alloc.c
@@ -3,16 +3,8 @@
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/string.h>
+#include <asm/setup.h>
 
-#include <asm/system.h>
-
-void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask)
-{
-	if (mem_init_done)
-		return kmalloc(size, mask);
-	else
-		return alloc_bootmem(size);
-}
 
 void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask)
 {
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index ef96c6c58ef..57a07206505 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -69,161 +69,412 @@ _GLOBAL(csum_tcpudp_magic)
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
  *
- * This code assumes at least halfword alignment, though the length
- * can be any number of bytes.  The sum is accumulated in r5.
- *
  * csum_partial(r3=buff, r4=len, r5=sum)
  */
 _GLOBAL(csum_partial)
-        subi	r3,r3,8		/* we'll offset by 8 for the loads */
-        srdi.	r6,r4,3         /* divide by 8 for doubleword count */
-        addic   r5,r5,0         /* clear carry */
-        beq	3f              /* if we're doing < 8 bytes */
-        andi.	r0,r3,2         /* aligned on a word boundary already? */
-        beq+	1f
-        lhz     r6,8(r3)        /* do 2 bytes to get aligned */
-        addi    r3,r3,2
-        subi    r4,r4,2
-        addc    r5,r5,r6
-        srdi.   r6,r4,3         /* recompute number of doublewords */
-        beq     3f              /* any left? */
-1:      mtctr   r6
-2:      ldu     r6,8(r3)        /* main sum loop */
-        adde    r5,r5,r6
-        bdnz    2b
-        andi.	r4,r4,7         /* compute bytes left to sum after doublewords */
-3:	cmpwi	0,r4,4		/* is at least a full word left? */
-	blt	4f
-	lwz	r6,8(r3)	/* sum this word */
+	addic	r0,r5,0			/* clear carry */
+
+	srdi.	r6,r4,3			/* less than 8 bytes? */
+	beq	.Lcsum_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcsum_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+	lhz	r6,0(r3)		/* align to doubleword */
+	subi	r4,r4,2
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	bdnz	1b
+
+.Lcsum_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r4,7
+	beq	.Lcsum_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r4,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+
+	adde	r0,r0,r11
+
+	adde	r0,r0,r12
+
+	adde	r0,r0,r14
+
+	adde	r0,r0,r15
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+	bdnz	2b
+
+
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+	adde	r0,r0,r11
+	adde	r0,r0,r12
+	adde	r0,r0,r14
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r4,r4,63
+
+.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r4,3
+	beq	.Lcsum_tail_word
+
+	mtctr	r6
+3:
+	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+	bdnz	3b
+
+	andi.	r4,r4,7
+
+.Lcsum_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r4,2
+	beq	.Lcsum_tail_halfword
+
+	lwz	r6,0(r3)
 	addi	r3,r3,4
+	adde	r0,r0,r6
 	subi	r4,r4,4
-	adde	r5,r5,r6
-4:	cmpwi	0,r4,2		/* is at least a halfword left? */
-        blt+	5f
-        lhz     r6,8(r3)        /* sum this halfword */
-        addi    r3,r3,2
-        subi    r4,r4,2
-        adde    r5,r5,r6
-5:	cmpwi	0,r4,1		/* is at least a byte left? */
-        bne+    6f
-        lbz     r6,8(r3)        /* sum this byte */
-        slwi    r6,r6,8         /* this byte is assumed to be the upper byte of a halfword */
-        adde    r5,r5,r6
-6:      addze	r5,r5		/* add in final carry */
-	rldicl  r4,r5,32,0      /* fold two 32-bit halves together */
-        add     r3,r4,r5
-        srdi    r3,r3,32
-        blr
+
+.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r4,1
+	beq	.Lcsum_tail_byte
+
+	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	subi	r4,r4,2
+
+.Lcsum_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r4,1
+	beq	.Lcsum_finish
+
+	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+
+.Lcsum_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
+
+
+	.macro srcnr
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Lsrc_error_nr
+	.previous
+	.endm
+
+	.macro source
+150:
+	.section __ex_table,"a"
+	.align 3
+	.llong 150b,.Lsrc_error
+	.previous
+	.endm
+
+	.macro dstnr
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldest_error_nr
+	.previous
+	.endm
+
+	.macro dest
+250:
+	.section __ex_table,"a"
+	.align 3
+	.llong 250b,.Ldest_error
+	.previous
+	.endm
 
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
  * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively, and (for an error on
- * src) zeroes the rest of dst.
- *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky.  For now we'll leave it until we instrument it somehow.
+ * to *src_err or *dst_err respectively. The caller must take any action
+ * required in this case (zeroing memory, recalculating partial checksum etc).
  *
  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
  */
 _GLOBAL(csum_partial_copy_generic)
-	addic	r0,r6,0
-	subi	r3,r3,4
-	subi	r4,r4,4
-	srwi.	r6,r5,2
-	beq	3f		/* if we're doing < 4 bytes */
-	andi.	r9,r4,2		/* Align dst to longword boundary */
-	beq+	1f
-81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
+	addic	r0,r6,0			/* clear carry */
+
+	srdi.	r6,r5,3			/* less than 8 bytes? */
+	beq	.Lcopy_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 *
+	 * If the source and destination are relatively unaligned we only
+	 * align the source. This keeps things simple.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcopy_aligned
+
+	li	r9,4
+	sub	r6,r9,r6
+	mtctr	r6
+
+1:
+srcnr;	lhz	r6,0(r3)		/* align to doubleword */
 	subi	r5,r5,2
-91:	sth	r6,4(r4)
-	addi	r4,r4,2
-	addc	r0,r0,r6
-	srwi.	r6,r5,2		/* # words to do */
-	beq	3f
-1:	mtctr	r6
-82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
-92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
-	adde	r0,r0,r6
-	bdnz	82b
-	andi.	r5,r5,3
-3:	cmpwi	0,r5,2
-	blt+	4f
-83:	lhz	r6,4(r3)
 	addi	r3,r3,2
-	subi	r5,r5,2
-93:	sth	r6,4(r4)
+	adde	r0,r0,r6
+dstnr;	sth	r6,0(r4)
 	addi	r4,r4,2
+	bdnz	1b
+
+.Lcopy_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r5,7
+	beq	.Lcopy_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r5,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
 	adde	r0,r0,r6
-4:	cmpwi	0,r5,1
-	bne+	5f
-84:	lbz	r6,4(r3)
-94:	stb	r6,4(r4)
-	slwi	r6,r6,8		/* Upper byte of word */
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+	bdnz	2b
+
+
 	adde	r0,r0,r6
-5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
-        rldicl  r4,r3,32,0      /* fold 64 bit value */
-        add     r3,r4,r3
-        srdi    r3,r3,32
-	blr
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
 
-/* These shouldn't go in the fixup section, since that would
-   cause the ex_table addresses to get out of order. */
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r5,r5,63
+
+.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r5,3
+	beq	.Lcopy_tail_word
 
-	.globl src_error_1
-src_error_1:
-	li	r6,0
-	subi	r5,r5,2
-95:	sth	r6,4(r4)
-	addi	r4,r4,2
-	srwi.	r6,r5,2
-	beq	3f
 	mtctr	r6
-	.globl src_error_2
-src_error_2:
-	li	r6,0
-96:	stwu	r6,4(r4)
-	bdnz	96b
-3:	andi.	r5,r5,3
-	beq	src_error
-	.globl src_error_3
-src_error_3:
-	li	r6,0
-	mtctr	r5
-	addi	r4,r4,3
-97:	stbu	r6,1(r4)
-	bdnz	97b
-	.globl src_error
-src_error:
+3:
+srcnr;	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+dstnr;	std	r6,0(r4)
+	addi	r4,r4,8
+	bdnz	3b
+
+	andi.	r5,r5,7
+
+.Lcopy_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r5,2
+	beq	.Lcopy_tail_halfword
+
+srcnr;	lwz	r6,0(r3)
+	addi	r3,r3,4
+	adde	r0,r0,r6
+dstnr;	stw	r6,0(r4)
+	addi	r4,r4,4
+	subi	r5,r5,4
+
+.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r5,1
+	beq	.Lcopy_tail_byte
+
+srcnr;	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+dstnr;	sth	r6,0(r4)
+	addi	r4,r4,2
+	subi	r5,r5,2
+
+.Lcopy_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r5,1
+	beq	.Lcopy_finish
+
+srcnr;	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+dstnr;	stb	r6,0(r4)
+
+.Lcopy_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
+
+.Lsrc_error:
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+.Lsrc_error_nr:
 	cmpdi	0,r7,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r7)
-1:	addze	r3,r0
 	blr
 
-	.globl dst_error
-dst_error:
+.Ldest_error:
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+.Ldest_error_nr:
 	cmpdi	0,r8,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r8)
-1:	addze	r3,r0
 	blr
-
-.section __ex_table,"a"
-	.align  3
-	.llong	81b,src_error_1
-	.llong	91b,dst_error
-	.llong	82b,src_error_2
-	.llong	92b,dst_error
-	.llong	83b,src_error_3
-	.llong	93b,dst_error
-	.llong	84b,src_error_3
-	.llong	94b,dst_error
-	.llong	95b,dst_error
-	.llong	96b,dst_error
-	.llong	97b,dst_error
diff --git a/arch/powerpc/lib/checksum_wrappers_64.c b/arch/powerpc/lib/checksum_wrappers_64.c
new file mode 100644
index 00000000000..08e3a3356c4
--- /dev/null
+++ b/arch/powerpc/lib/checksum_wrappers_64.c
@@ -0,0 +1,102 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2010
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <linux/export.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/checksum.h>
+#include <asm/uaccess.h>
+
+__wsum csum_and_copy_from_user(const void __user *src, void *dst,
+			       int len, __wsum sum, int *err_ptr)
+{
+	unsigned int csum;
+
+	might_sleep();
+
+	*err_ptr = 0;
+
+	if (!len) {
+		csum = 0;
+		goto out;
+	}
+
+	if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) {
+		*err_ptr = -EFAULT;
+		csum = (__force unsigned int)sum;
+		goto out;
+	}
+
+	csum = csum_partial_copy_generic((void __force *)src, dst,
+					 len, sum, err_ptr, NULL);
+
+	if (unlikely(*err_ptr)) {
+		int missing = __copy_from_user(dst, src, len);
+
+		if (missing) {
+			memset(dst + len - missing, 0, missing);
+			*err_ptr = -EFAULT;
+		} else {
+			*err_ptr = 0;
+		}
+
+		csum = csum_partial(dst, len, sum);
+	}
+
+out:
+	return (__force __wsum)csum;
+}
+EXPORT_SYMBOL(csum_and_copy_from_user);
+
+__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len,
+			     __wsum sum, int *err_ptr)
+{
+	unsigned int csum;
+
+	might_sleep();
+
+	*err_ptr = 0;
+
+	if (!len) {
+		csum = 0;
+		goto out;
+	}
+
+	if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) {
+		*err_ptr = -EFAULT;
+		csum = -1; /* invalid checksum */
+		goto out;
+	}
+
+	csum = csum_partial_copy_generic(src, (void __force *)dst,
+					 len, sum, NULL, err_ptr);
+
+	if (unlikely(*err_ptr)) {
+		csum = csum_partial(src, len, sum);
+
+		if (copy_to_user(dst, src, len)) {
+			*err_ptr = -EFAULT;
+			csum = -1; /* invalid checksum */
+		}
+	}
+
+out:
+	return (__force __wsum)csum;
+}
+EXPORT_SYMBOL(csum_and_copy_to_user);
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
new file mode 100644
index 00000000000..d5edbeb8eb8
--- /dev/null
+++ b/arch/powerpc/lib/code-patching.c
@@ -0,0 +1,470 @@
+/*
+ *  Copyright 2008 Michael Ellerman, IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <asm/code-patching.h>
+#include <asm/uaccess.h>
+
+
+int patch_instruction(unsigned int *addr, unsigned int instr)
+{
+	int err;
+
+	__put_user_size(instr, addr, 4, err);
+	if (err)
+		return err;
+	asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr));
+	return 0;
+}
+
+int patch_branch(unsigned int *addr, unsigned long target, int flags)
+{
+	return patch_instruction(addr, create_branch(addr, target, flags));
+}
+
+unsigned int create_branch(const unsigned int *addr,
+			   unsigned long target, int flags)
+{
+	unsigned int instruction;
+	long offset;
+
+	offset = target;
+	if (! (flags & BRANCH_ABSOLUTE))
+		offset = offset - (unsigned long)addr;
+
+	/* Check we can represent the target in the instruction format */
+	if (offset < -0x2000000 || offset > 0x1fffffc || offset & 0x3)
+		return 0;
+
+	/* Mask out the flags and target, so they don't step on each other. */
+	instruction = 0x48000000 | (flags & 0x3) | (offset & 0x03FFFFFC);
+
+	return instruction;
+}
+
+unsigned int create_cond_branch(const unsigned int *addr,
+				unsigned long target, int flags)
+{
+	unsigned int instruction;
+	long offset;
+
+	offset = target;
+	if (! (flags & BRANCH_ABSOLUTE))
+		offset = offset - (unsigned long)addr;
+
+	/* Check we can represent the target in the instruction format */
+	if (offset < -0x8000 || offset > 0x7FFF || offset & 0x3)
+		return 0;
+
+	/* Mask out the flags and target, so they don't step on each other. */
+	instruction = 0x40000000 | (flags & 0x3FF0003) | (offset & 0xFFFC);
+
+	return instruction;
+}
+
+static unsigned int branch_opcode(unsigned int instr)
+{
+	return (instr >> 26) & 0x3F;
+}
+
+static int instr_is_branch_iform(unsigned int instr)
+{
+	return branch_opcode(instr) == 18;
+}
+
+static int instr_is_branch_bform(unsigned int instr)
+{
+	return branch_opcode(instr) == 16;
+}
+
+int instr_is_relative_branch(unsigned int instr)
+{
+	if (instr & BRANCH_ABSOLUTE)
+		return 0;
+
+	return instr_is_branch_iform(instr) || instr_is_branch_bform(instr);
+}
+
+static unsigned long branch_iform_target(const unsigned int *instr)
+{
+	signed long imm;
+
+	imm = *instr & 0x3FFFFFC;
+
+	/* If the top bit of the immediate value is set this is negative */
+	if (imm & 0x2000000)
+		imm -= 0x4000000;
+
+	if ((*instr & BRANCH_ABSOLUTE) == 0)
+		imm += (unsigned long)instr;
+
+	return (unsigned long)imm;
+}
+
+static unsigned long branch_bform_target(const unsigned int *instr)
+{
+	signed long imm;
+
+	imm = *instr & 0xFFFC;
+
+	/* If the top bit of the immediate value is set this is negative */
+	if (imm & 0x8000)
+		imm -= 0x10000;
+
+	if ((*instr & BRANCH_ABSOLUTE) == 0)
+		imm += (unsigned long)instr;
+
+	return (unsigned long)imm;
+}
+
+unsigned long branch_target(const unsigned int *instr)
+{
+	if (instr_is_branch_iform(*instr))
+		return branch_iform_target(instr);
+	else if (instr_is_branch_bform(*instr))
+		return branch_bform_target(instr);
+
+	return 0;
+}
+
+int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr)
+{
+	if (instr_is_branch_iform(*instr) || instr_is_branch_bform(*instr))
+		return branch_target(instr) == addr;
+
+	return 0;
+}
+
+unsigned int translate_branch(const unsigned int *dest, const unsigned int *src)
+{
+	unsigned long target;
+
+	target = branch_target(src);
+
+	if (instr_is_branch_iform(*src))
+		return create_branch(dest, target, *src);
+	else if (instr_is_branch_bform(*src))
+		return create_cond_branch(dest, target, *src);
+
+	return 0;
+}
+
+#ifdef CONFIG_PPC_BOOK3E_64
+void __patch_exception(int exc, unsigned long addr)
+{
+	extern unsigned int interrupt_base_book3e;
+	unsigned int *ibase = &interrupt_base_book3e;
+
+	/* Our exceptions vectors start with a NOP and -then- a branch
+	 * to deal with single stepping from userspace which stops on
+	 * the second instruction. Thus we need to patch the second
+	 * instruction of the exception, not the first one
+	 */
+
+	patch_branch(ibase + (exc / 4) + 1, addr, 0);
+}
+#endif
+
+#ifdef CONFIG_CODE_PATCHING_SELFTEST
+
+static void __init test_trampoline(void)
+{
+	asm ("nop;\n");
+}
+
+#define check(x)	\
+	if (!(x)) printk("code-patching: test failed at line %d\n", __LINE__);
+
+static void __init test_branch_iform(void)
+{
+	unsigned int instr;
+	unsigned long addr;
+
+	addr = (unsigned long)&instr;
+
+	/* The simplest case, branch to self, no flags */
+	check(instr_is_branch_iform(0x48000000));
+	/* All bits of target set, and flags */
+	check(instr_is_branch_iform(0x4bffffff));
+	/* High bit of opcode set, which is wrong */
+	check(!instr_is_branch_iform(0xcbffffff));
+	/* Middle bits of opcode set, which is wrong */
+	check(!instr_is_branch_iform(0x7bffffff));
+
+	/* Simplest case, branch to self with link */
+	check(instr_is_branch_iform(0x48000001));
+	/* All bits of targets set */
+	check(instr_is_branch_iform(0x4bfffffd));
+	/* Some bits of targets set */
+	check(instr_is_branch_iform(0x4bff00fd));
+	/* Must be a valid branch to start with */
+	check(!instr_is_branch_iform(0x7bfffffd));
+
+	/* Absolute branch to 0x100 */
+	instr = 0x48000103;
+	check(instr_is_branch_to_addr(&instr, 0x100));
+	/* Absolute branch to 0x420fc */
+	instr = 0x480420ff;
+	check(instr_is_branch_to_addr(&instr, 0x420fc));
+	/* Maximum positive relative branch, + 20MB - 4B */
+	instr = 0x49fffffc;
+	check(instr_is_branch_to_addr(&instr, addr + 0x1FFFFFC));
+	/* Smallest negative relative branch, - 4B */
+	instr = 0x4bfffffc;
+	check(instr_is_branch_to_addr(&instr, addr - 4));
+	/* Largest negative relative branch, - 32 MB */
+	instr = 0x4a000000;
+	check(instr_is_branch_to_addr(&instr, addr - 0x2000000));
+
+	/* Branch to self, with link */
+	instr = create_branch(&instr, addr, BRANCH_SET_LINK);
+	check(instr_is_branch_to_addr(&instr, addr));
+
+	/* Branch to self - 0x100, with link */
+	instr = create_branch(&instr, addr - 0x100, BRANCH_SET_LINK);
+	check(instr_is_branch_to_addr(&instr, addr - 0x100));
+
+	/* Branch to self + 0x100, no link */
+	instr = create_branch(&instr, addr + 0x100, 0);
+	check(instr_is_branch_to_addr(&instr, addr + 0x100));
+
+	/* Maximum relative negative offset, - 32 MB */
+	instr = create_branch(&instr, addr - 0x2000000, BRANCH_SET_LINK);
+	check(instr_is_branch_to_addr(&instr, addr - 0x2000000));
+
+	/* Out of range relative negative offset, - 32 MB + 4*/
+	instr = create_branch(&instr, addr - 0x2000004, BRANCH_SET_LINK);
+	check(instr == 0);
+
+	/* Out of range relative positive offset, + 32 MB */
+	instr = create_branch(&instr, addr + 0x2000000, BRANCH_SET_LINK);
+	check(instr == 0);
+
+	/* Unaligned target */
+	instr = create_branch(&instr, addr + 3, BRANCH_SET_LINK);
+	check(instr == 0);
+
+	/* Check flags are masked correctly */
+	instr = create_branch(&instr, addr, 0xFFFFFFFC);
+	check(instr_is_branch_to_addr(&instr, addr));
+	check(instr == 0x48000000);
+}
+
+static void __init test_create_function_call(void)
+{
+	unsigned int *iptr;
+	unsigned long dest;
+
+	/* Check we can create a function call */
+	iptr = (unsigned int *)ppc_function_entry(test_trampoline);
+	dest = ppc_function_entry(test_create_function_call);
+	patch_instruction(iptr, create_branch(iptr, dest, BRANCH_SET_LINK));
+	check(instr_is_branch_to_addr(iptr, dest));
+}
+
+static void __init test_branch_bform(void)
+{
+	unsigned long addr;
+	unsigned int *iptr, instr, flags;
+
+	iptr = &instr;
+	addr = (unsigned long)iptr;
+
+	/* The simplest case, branch to self, no flags */
+	check(instr_is_branch_bform(0x40000000));
+	/* All bits of target set, and flags */
+	check(instr_is_branch_bform(0x43ffffff));
+	/* High bit of opcode set, which is wrong */
+	check(!instr_is_branch_bform(0xc3ffffff));
+	/* Middle bits of opcode set, which is wrong */
+	check(!instr_is_branch_bform(0x7bffffff));
+
+	/* Absolute conditional branch to 0x100 */
+	instr = 0x43ff0103;
+	check(instr_is_branch_to_addr(&instr, 0x100));
+	/* Absolute conditional branch to 0x20fc */
+	instr = 0x43ff20ff;
+	check(instr_is_branch_to_addr(&instr, 0x20fc));
+	/* Maximum positive relative conditional branch, + 32 KB - 4B */
+	instr = 0x43ff7ffc;
+	check(instr_is_branch_to_addr(&instr, addr + 0x7FFC));
+	/* Smallest negative relative conditional branch, - 4B */
+	instr = 0x43fffffc;
+	check(instr_is_branch_to_addr(&instr, addr - 4));
+	/* Largest negative relative conditional branch, - 32 KB */
+	instr = 0x43ff8000;
+	check(instr_is_branch_to_addr(&instr, addr - 0x8000));
+
+	/* All condition code bits set & link */
+	flags = 0x3ff000 | BRANCH_SET_LINK;
+
+	/* Branch to self */
+	instr = create_cond_branch(iptr, addr, flags);
+	check(instr_is_branch_to_addr(&instr, addr));
+
+	/* Branch to self - 0x100 */
+	instr = create_cond_branch(iptr, addr - 0x100, flags);
+	check(instr_is_branch_to_addr(&instr, addr - 0x100));
+
+	/* Branch to self + 0x100 */
+	instr = create_cond_branch(iptr, addr + 0x100, flags);
+	check(instr_is_branch_to_addr(&instr, addr + 0x100));
+
+	/* Maximum relative negative offset, - 32 KB */
+	instr = create_cond_branch(iptr, addr - 0x8000, flags);
+	check(instr_is_branch_to_addr(&instr, addr - 0x8000));
+
+	/* Out of range relative negative offset, - 32 KB + 4*/
+	instr = create_cond_branch(iptr, addr - 0x8004, flags);
+	check(instr == 0);
+
+	/* Out of range relative positive offset, + 32 KB */
+	instr = create_cond_branch(iptr, addr + 0x8000, flags);
+	check(instr == 0);
+
+	/* Unaligned target */
+	instr = create_cond_branch(iptr, addr + 3, flags);
+	check(instr == 0);
+
+	/* Check flags are masked correctly */
+	instr = create_cond_branch(iptr, addr, 0xFFFFFFFC);
+	check(instr_is_branch_to_addr(&instr, addr));
+	check(instr == 0x43FF0000);
+}
+
+static void __init test_translate_branch(void)
+{
+	unsigned long addr;
+	unsigned int *p, *q;
+	void *buf;
+
+	buf = vmalloc(PAGE_ALIGN(0x2000000 + 1));
+	check(buf);
+	if (!buf)
+		return;
+
+	/* Simple case, branch to self moved a little */
+	p = buf;
+	addr = (unsigned long)p;
+	patch_branch(p, addr, 0);
+	check(instr_is_branch_to_addr(p, addr));
+	q = p + 1;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(q, addr));
+
+	/* Maximum negative case, move b . to addr + 32 MB */
+	p = buf;
+	addr = (unsigned long)p;
+	patch_branch(p, addr, 0);
+	q = buf + 0x2000000;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(p, addr));
+	check(instr_is_branch_to_addr(q, addr));
+	check(*q == 0x4a000000);
+
+	/* Maximum positive case, move x to x - 32 MB + 4 */
+	p = buf + 0x2000000;
+	addr = (unsigned long)p;
+	patch_branch(p, addr, 0);
+	q = buf + 4;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(p, addr));
+	check(instr_is_branch_to_addr(q, addr));
+	check(*q == 0x49fffffc);
+
+	/* Jump to x + 16 MB moved to x + 20 MB */
+	p = buf;
+	addr = 0x1000000 + (unsigned long)buf;
+	patch_branch(p, addr, BRANCH_SET_LINK);
+	q = buf + 0x1400000;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(p, addr));
+	check(instr_is_branch_to_addr(q, addr));
+
+	/* Jump to x + 16 MB moved to x - 16 MB + 4 */
+	p = buf + 0x1000000;
+	addr = 0x2000000 + (unsigned long)buf;
+	patch_branch(p, addr, 0);
+	q = buf + 4;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(p, addr));
+	check(instr_is_branch_to_addr(q, addr));
+
+
+	/* Conditional branch tests */
+
+	/* Simple case, branch to self moved a little */
+	p = buf;
+	addr = (unsigned long)p;
+	patch_instruction(p, create_cond_branch(p, addr, 0));
+	check(instr_is_branch_to_addr(p, addr));
+	q = p + 1;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(q, addr));
+
+	/* Maximum negative case, move b . to addr + 32 KB */
+	p = buf;
+	addr = (unsigned long)p;
+	patch_instruction(p, create_cond_branch(p, addr, 0xFFFFFFFC));
+	q = buf + 0x8000;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(p, addr));
+	check(instr_is_branch_to_addr(q, addr));
+	check(*q == 0x43ff8000);
+
+	/* Maximum positive case, move x to x - 32 KB + 4 */
+	p = buf + 0x8000;
+	addr = (unsigned long)p;
+	patch_instruction(p, create_cond_branch(p, addr, 0xFFFFFFFC));
+	q = buf + 4;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(p, addr));
+	check(instr_is_branch_to_addr(q, addr));
+	check(*q == 0x43ff7ffc);
+
+	/* Jump to x + 12 KB moved to x + 20 KB */
+	p = buf;
+	addr = 0x3000 + (unsigned long)buf;
+	patch_instruction(p, create_cond_branch(p, addr, BRANCH_SET_LINK));
+	q = buf + 0x5000;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(p, addr));
+	check(instr_is_branch_to_addr(q, addr));
+
+	/* Jump to x + 8 KB moved to x - 8 KB + 4 */
+	p = buf + 0x2000;
+	addr = 0x4000 + (unsigned long)buf;
+	patch_instruction(p, create_cond_branch(p, addr, 0));
+	q = buf + 4;
+	patch_instruction(q, translate_branch(q, p));
+	check(instr_is_branch_to_addr(p, addr));
+	check(instr_is_branch_to_addr(q, addr));
+
+	/* Free the buffer we were using */
+	vfree(buf);
+}
+
+static int __init test_code_patching(void)
+{
+	printk(KERN_DEBUG "Running code patching self-tests ...\n");
+
+	test_branch_iform();
+	test_branch_bform();
+	test_create_function_call();
+	test_translate_branch();
+
+	return 0;
+}
+late_initcall(test_code_patching);
+
+#endif /* CONFIG_CODE_PATCHING_SELFTEST */
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index c657de59abc..55f19f9fd70 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -62,7 +62,7 @@
 
 	.text
 	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
-	.stabs	"copy32.S",N_SO,0,0,0f
+	.stabs	"copy_32.S",N_SO,0,0,0f
 0:
 
 CACHELINE_BYTES = L1_CACHE_BYTES
@@ -98,20 +98,7 @@ _GLOBAL(cacheable_memzero)
 	bdnz	4b
 3:	mtctr	r9
 	li	r7,4
-#if !defined(CONFIG_8xx)
 10:	dcbz	r7,r6
-#else
-10:	stw	r4, 4(r6)
-	stw	r4, 8(r6)
-	stw	r4, 12(r6)
-	stw	r4, 16(r6)
-#if CACHE_LINE_SIZE >= 32
-	stw	r4, 20(r6)
-	stw	r4, 24(r6)
-	stw	r4, 28(r6)
-	stw	r4, 32(r6)
-#endif /* CACHE_LINE_SIZE */
-#endif
 	addi	r6,r6,CACHELINE_BYTES
 	bdnz	10b
 	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
@@ -200,9 +187,7 @@ _GLOBAL(cacheable_memcpy)
 	mtctr	r0
 	beq	63f
 53:
-#if !defined(CONFIG_8xx)
 	dcbz	r11,r6
-#endif
 	COPY_16_BYTES
 #if L1_CACHE_BYTES >= 32
 	COPY_16_BYTES
@@ -356,14 +341,6 @@ _GLOBAL(__copy_tofrom_user)
 	li	r11,4
 	beq	63f
 
-#ifdef CONFIG_8xx
-	/* Don't use prefetch on 8xx */
-	mtctr	r0
-	li	r0,0
-53:	COPY_16_BYTES_WITHEX(0)
-	bdnz	53b
-
-#else /* not CONFIG_8xx */
 	/* Here we decide how far ahead to prefetch the source */
 	li	r3,4
 	cmpwi	r0,1
@@ -416,7 +393,6 @@ _GLOBAL(__copy_tofrom_user)
 	li	r3,4
 	li	r7,0
 	bne	114b
-#endif /* CONFIG_8xx */
 
 63:	srwi.	r0,r5,2
 	mtctr	r0
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index f9837f44ac0..a3c4dc4defd 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -1,119 +1,112 @@
 /*
- * Copyright (C) 2002 Paul Mackerras, IBM Corp.
+ * Copyright (C) 2008 Mark Nelson, IBM Corp.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#include <asm/page.h>
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
 
-_GLOBAL(copy_4K_page)
-	std	r31,-8(1)
-	std	r30,-16(1)
-	std	r29,-24(1)
-	std	r28,-32(1)
-	std	r27,-40(1)
-	std	r26,-48(1)
-	std	r25,-56(1)
-	std	r24,-64(1)
-	std	r23,-72(1)
-	std	r22,-80(1)
-	std	r21,-88(1)
-	std	r20,-96(1)
-	li	r5,4096/32 - 1
+        .section        ".toc","aw"
+PPC64_CACHES:
+        .tc             ppc64_caches[TC],ppc64_caches
+        .section        ".text"
+
+_GLOBAL_TOC(copy_page)
+BEGIN_FTR_SECTION
+	lis	r5,PAGE_SIZE@h
+FTR_SECTION_ELSE
+	b	copypage_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
+	ori	r5,r5,PAGE_SIZE@l
+BEGIN_FTR_SECTION
+	ld      r10,PPC64_CACHES@toc(r2)
+	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */
+	lwz     r12,DCACHEL1LINESIZE(r10)	/* get cache line size */
+	li	r9,0
+	srd	r8,r5,r11
+
+	mtctr	r8
+.Lsetup:
+	dcbt	r9,r4
+	dcbz	r9,r3
+	add	r9,r9,r12
+	bdnz	.Lsetup
+END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
 	addi	r3,r3,-8
-	li	r12,5
-0:	addi	r5,r5,-24
-	mtctr	r12
-	ld	r22,640(4)
-	ld	r21,512(4)
-	ld	r20,384(4)
-	ld	r11,256(4)
-	ld	r9,128(4)
-	ld	r7,0(4)
-	ld	r25,648(4)
-	ld	r24,520(4)
-	ld	r23,392(4)
-	ld	r10,264(4)
-	ld	r8,136(4)
-	ldu	r6,8(4)
-	cmpwi	r5,24
-1:	std	r22,648(3)
-	std	r21,520(3)
-	std	r20,392(3)
-	std	r11,264(3)
-	std	r9,136(3)
-	std	r7,8(3)
-	ld	r28,648(4)
-	ld	r27,520(4)
-	ld	r26,392(4)
-	ld	r31,264(4)
-	ld	r30,136(4)
-	ld	r29,8(4)
-	std	r25,656(3)
-	std	r24,528(3)
-	std	r23,400(3)
-	std	r10,272(3)
-	std	r8,144(3)
-	std	r6,16(3)
-	ld	r22,656(4)
-	ld	r21,528(4)
-	ld	r20,400(4)
-	ld	r11,272(4)
-	ld	r9,144(4)
-	ld	r7,16(4)
-	std	r28,664(3)
-	std	r27,536(3)
-	std	r26,408(3)
-	std	r31,280(3)
-	std	r30,152(3)
-	stdu	r29,24(3)
-	ld	r25,664(4)
-	ld	r24,536(4)
-	ld	r23,408(4)
-	ld	r10,280(4)
-	ld	r8,152(4)
-	ldu	r6,24(4)
+	srdi    r8,r5,7		/* page is copied in 128 byte strides */
+	addi	r8,r8,-1	/* one stride copied outside loop */
+
+	mtctr	r8
+
+	ld	r5,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ldu	r8,24(r4)
+1:	std	r5,8(r3)
+	std	r6,16(r3)
+	ld	r9,8(r4)
+	ld	r10,16(r4)
+	std	r7,24(r3)
+	std	r8,32(r3)
+	ld	r11,24(r4)
+	ld	r12,32(r4)
+	std	r9,40(r3)
+	std	r10,48(r3)
+	ld	r5,40(r4)
+	ld	r6,48(r4)
+	std	r11,56(r3)
+	std	r12,64(r3)
+	ld	r7,56(r4)
+	ld	r8,64(r4)
+	std	r5,72(r3)
+	std	r6,80(r3)
+	ld	r9,72(r4)
+	ld	r10,80(r4)
+	std	r7,88(r3)
+	std	r8,96(r3)
+	ld	r11,88(r4)
+	ld	r12,96(r4)
+	std	r9,104(r3)
+	std	r10,112(r3)
+	ld	r5,104(r4)
+	ld	r6,112(r4)
+	std	r11,120(r3)
+	stdu	r12,128(r3)
+	ld	r7,120(r4)
+	ldu	r8,128(r4)
 	bdnz	1b
-	std	r22,648(3)
-	std	r21,520(3)
-	std	r20,392(3)
-	std	r11,264(3)
-	std	r9,136(3)
-	std	r7,8(3)
-	addi	r4,r4,640
-	addi	r3,r3,648
-	bge	0b
-	mtctr	r5
-	ld	r7,0(4)
-	ld	r8,8(4)
-	ldu	r9,16(4)
-3:	ld	r10,8(4)
-	std	r7,8(3)
-	ld	r7,16(4)
-	std	r8,16(3)
-	ld	r8,24(4)
-	std	r9,24(3)
-	ldu	r9,32(4)
-	stdu	r10,32(3)
-	bdnz	3b
-4:	ld	r10,8(4)
-	std	r7,8(3)
-	std	r8,16(3)
-	std	r9,24(3)
-	std	r10,32(3)
-9:	ld	r20,-96(1)
-	ld	r21,-88(1)
-	ld	r22,-80(1)
-	ld	r23,-72(1)
-	ld	r24,-64(1)
-	ld	r25,-56(1)
-	ld	r26,-48(1)
-	ld	r27,-40(1)
-	ld	r28,-32(1)
-	ld	r29,-24(1)
-	ld	r30,-16(1)
-	ld	r31,-8(1)
+
+	std	r5,8(r3)
+	std	r6,16(r3)
+	ld	r9,8(r4)
+	ld	r10,16(r4)
+	std	r7,24(r3)
+	std	r8,32(r3)
+	ld	r11,24(r4)
+	ld	r12,32(r4)
+	std	r9,40(r3)
+	std	r10,48(r3)
+	ld	r5,40(r4)
+	ld	r6,48(r4)
+	std	r11,56(r3)
+	std	r12,64(r3)
+	ld	r7,56(r4)
+	ld	r8,64(r4)
+	std	r5,72(r3)
+	std	r6,80(r3)
+	ld	r9,72(r4)
+	ld	r10,80(r4)
+	std	r7,88(r3)
+	std	r8,96(r3)
+	ld	r11,88(r4)
+	ld	r12,96(r4)
+	std	r9,104(r3)
+	std	r10,112(r3)
+	std	r11,120(r3)
+	std	r12,128(r3)
 	blr
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
new file mode 100644
index 00000000000..d7dafb3777a
--- /dev/null
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -0,0 +1,168 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+
+_GLOBAL(copypage_power7)
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side. Since source and destination are page
+	 * aligned we don't need to clear the bottom 7 bits of either
+	 * address.
+	 */
+	ori	r9,r3,1		/* stream=1 => to */
+
+#ifdef CONFIG_PPC_64K_PAGES
+	lis	r7,0x0E01	/* depth=7
+				 * units/cachelines=512 */
+#else
+	lis	r7,0x0E00	/* depth=7 */
+	ori	r7,r7,0x1000	/* units/cachelines=32 */
+#endif
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	/* setup read stream 0  */
+	dcbt	r0,r4,0b01000  	/* addr from */
+	dcbt	r0,r7,0b01010   /* length and depth from */
+	/* setup write stream 1 */
+	dcbtst	r0,r9,0b01000   /* addr to */
+	dcbtst	r0,r10,0b01010  /* length and depth to */
+	eieio
+	dcbt	r0,r8,0b01010	/* all streams GO */
+.machine pop
+
+#ifdef CONFIG_ALTIVEC
+	mflr	r0
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	enter_vmx_copy
+	cmpwi	r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STK_REG(R31)(r1)
+	ld	r4,STK_REG(R30)(r1)
+	mtlr	r0
+
+	li	r0,(PAGE_SIZE/128)
+	mtctr	r0
+
+	beq	.Lnonvmx_copy
+
+	addi	r1,r1,STACKFRAMESIZE
+
+	li	r6,16
+	li	r7,32
+	li	r8,48
+	li	r9,64
+	li	r10,80
+	li	r11,96
+	li	r12,112
+
+	.align	5
+1:	lvx	vr7,r0,r4
+	lvx	vr6,r4,r6
+	lvx	vr5,r4,r7
+	lvx	vr4,r4,r8
+	lvx	vr3,r4,r9
+	lvx	vr2,r4,r10
+	lvx	vr1,r4,r11
+	lvx	vr0,r4,r12
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r6
+	stvx	vr5,r3,r7
+	stvx	vr4,r3,r8
+	stvx	vr3,r3,r9
+	stvx	vr2,r3,r10
+	stvx	vr1,r3,r11
+	stvx	vr0,r3,r12
+	addi	r3,r3,128
+	bdnz	1b
+
+	b	exit_vmx_copy		/* tail call optimise */
+
+#else
+	li	r0,(PAGE_SIZE/128)
+	mtctr	r0
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+#endif
+
+.Lnonvmx_copy:
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+
+1:	ld	r0,0(r4)
+	ld	r5,8(r4)
+	ld	r6,16(r4)
+	ld	r7,24(r4)
+	ld	r8,32(r4)
+	ld	r9,40(r4)
+	ld	r10,48(r4)
+	ld	r11,56(r4)
+	ld	r12,64(r4)
+	ld	r14,72(r4)
+	ld	r15,80(r4)
+	ld	r16,88(r4)
+	ld	r17,96(r4)
+	ld	r18,104(r4)
+	ld	r19,112(r4)
+	ld	r20,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r5,8(r3)
+	std	r6,16(r3)
+	std	r7,24(r3)
+	std	r8,32(r3)
+	std	r9,40(r3)
+	std	r10,48(r3)
+	std	r11,56(r3)
+	std	r12,64(r3)
+	std	r14,72(r3)
+	std	r15,80(r3)
+	std	r16,88(r3)
+	std	r17,96(r3)
+	std	r18,104(r3)
+	std	r19,112(r3)
+	std	r20,120(r3)
+	addi	r3,r3,128
+	bdnz	1b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	blr
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 25ec5378afa..0860ee46013 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -9,8 +9,22 @@
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 
+#ifdef __BIG_ENDIAN__
+#define sLd sld		/* Shift towards low-numbered address. */
+#define sHd srd		/* Shift towards high-numbered address. */
+#else
+#define sLd srd		/* Shift towards low-numbered address. */
+#define sHd sld		/* Shift towards high-numbered address. */
+#endif
+
 	.align	7
-_GLOBAL(__copy_tofrom_user)
+_GLOBAL_TOC(__copy_tofrom_user)
+BEGIN_FTR_SECTION
+	nop
+FTR_SECTION_ELSE
+	b	__copy_tofrom_user_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
+_GLOBAL(__copy_tofrom_user_base)
 	/* first check for a whole page copy on a page boundary */
 	cmpldi	cr1,r5,16
 	cmpdi	cr6,r5,4096
@@ -24,43 +38,75 @@ _GLOBAL(__copy_tofrom_user)
 	dcbt	0,r4
 	beq	.Lcopy_page_4K
 	andi.	r6,r6,7
-	PPC_MTOCRF	0x01,r5
+	PPC_MTOCRF(0x01,r5)
 	blt	cr1,.Lshort_copy
+/* Below we want to nop out the bne if we're on a CPU that has the
+ * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
+ * cleared.
+ * At the time of writing the only CPU that has this combination of bits
+ * set is Power6.
+ */
+BEGIN_FTR_SECTION
+	nop
+FTR_SECTION_ELSE
 	bne	.Ldst_unaligned
+ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
+		    CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
-	andi.	r0,r4,7
 	addi	r3,r3,-16
+BEGIN_FTR_SECTION
+	andi.	r0,r4,7
 	bne	.Lsrc_unaligned
-	srdi	r7,r5,4
-20:	ld	r9,0(r4)
-	addi	r4,r4,-8
-	mtctr	r7
-	andi.	r5,r5,7
-	bf	cr7*4+0,22f
-	addi	r3,r3,8
-	addi	r4,r4,8
-	mr	r8,r9
-	blt	cr1,72f
-21:	ld	r9,8(r4)
-70:	std	r8,8(r3)
-22:	ldu	r8,16(r4)
-71:	stdu	r9,16(r3)
+END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
+	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
+	srdi	r0,r5,5
+	cmpdi	cr1,r0,0
+20:	ld	r7,0(r4)
+220:	ld	r6,8(r4)
+	addi	r4,r4,16
+	mtctr	r0
+	andi.	r0,r5,0x10
+	beq	22f
+	addi	r3,r3,16
+	addi	r4,r4,-16
+	mr	r9,r7
+	mr	r8,r6
+	beq	cr1,72f
+21:	ld	r7,16(r4)
+221:	ld	r6,24(r4)
+	addi	r4,r4,32
+70:	std	r9,0(r3)
+270:	std	r8,8(r3)
+22:	ld	r9,0(r4)
+222:	ld	r8,8(r4)
+71:	std	r7,16(r3)
+271:	std	r6,24(r3)
+	addi	r3,r3,32
 	bdnz	21b
-72:	std	r8,8(r3)
+72:	std	r9,0(r3)
+272:	std	r8,8(r3)
+	andi.	r5,r5,0xf
 	beq+	3f
-	addi	r3,r3,16
-23:	ld	r9,8(r4)
+	addi	r4,r4,16
 .Ldo_tail:
-	bf	cr7*4+1,1f
-	rotldi	r9,r9,32
+	addi	r3,r3,16
+	bf	cr7*4+0,246f
+244:	ld	r9,0(r4)
+	addi	r4,r4,8
+245:	std	r9,0(r3)
+	addi	r3,r3,8
+246:	bf	cr7*4+1,1f
+23:	lwz	r9,0(r4)
+	addi	r4,r4,4
 73:	stw	r9,0(r3)
 	addi	r3,r3,4
 1:	bf	cr7*4+2,2f
-	rotldi	r9,r9,16
+44:	lhz	r9,0(r4)
+	addi	r4,r4,2
 74:	sth	r9,0(r3)
 	addi	r3,r3,2
 2:	bf	cr7*4+3,3f
-	rotldi	r9,r9,8
+45:	lbz	r9,0(r4)
 75:	stb	r9,0(r3)
 3:	li	r3,0
 	blr
@@ -80,10 +126,10 @@ _GLOBAL(__copy_tofrom_user)
 
 24:	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
 25:	ld	r0,8(r4)
-	sld	r6,r9,r10
+	sLd	r6,r9,r10
 26:	ldu	r9,16(r4)
-	srd	r7,r0,r11
-	sld	r8,r0,r10
+	sHd	r7,r0,r11
+	sLd	r8,r0,r10
 	or	r7,r7,r6
 	blt	cr6,79f
 27:	ld	r0,8(r4)
@@ -91,35 +137,35 @@ _GLOBAL(__copy_tofrom_user)
 
 28:	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
 29:	ldu	r9,8(r4)
-	sld	r8,r0,r10
+	sLd	r8,r0,r10
 	addi	r3,r3,-8
 	blt	cr6,5f
 30:	ld	r0,8(r4)
-	srd	r12,r9,r11
-	sld	r6,r9,r10
+	sHd	r12,r9,r11
+	sLd	r6,r9,r10
 31:	ldu	r9,16(r4)
 	or	r12,r8,r12
-	srd	r7,r0,r11
-	sld	r8,r0,r10
+	sHd	r7,r0,r11
+	sLd	r8,r0,r10
 	addi	r3,r3,16
 	beq	cr6,78f
 
 1:	or	r7,r7,r6
 32:	ld	r0,8(r4)
 76:	std	r12,8(r3)
-2:	srd	r12,r9,r11
-	sld	r6,r9,r10
+2:	sHd	r12,r9,r11
+	sLd	r6,r9,r10
 33:	ldu	r9,16(r4)
 	or	r12,r8,r12
 77:	stdu	r7,16(r3)
-	srd	r7,r0,r11
-	sld	r8,r0,r10
+	sHd	r7,r0,r11
+	sLd	r8,r0,r10
 	bdnz	1b
 
 78:	std	r12,8(r3)
 	or	r7,r7,r6
 79:	std	r7,16(r3)
-5:	srd	r12,r9,r11
+5:	sHd	r12,r9,r11
 	or	r12,r8,r12
 80:	std	r12,24(r3)
 	bne	6f
@@ -127,18 +173,46 @@ _GLOBAL(__copy_tofrom_user)
 	blr
 6:	cmpwi	cr1,r5,8
 	addi	r3,r3,32
-	sld	r9,r9,r10
-	ble	cr1,.Ldo_tail
+	sLd	r9,r9,r10
+	ble	cr1,7f
 34:	ld	r0,8(r4)
-	srd	r7,r0,r11
+	sHd	r7,r0,r11
 	or	r9,r7,r9
-	b	.Ldo_tail
+7:
+	bf	cr7*4+1,1f
+#ifdef __BIG_ENDIAN__
+	rotldi	r9,r9,32
+#endif
+94:	stw	r9,0(r3)
+#ifdef __LITTLE_ENDIAN__
+	rotrdi	r9,r9,32
+#endif
+	addi	r3,r3,4
+1:	bf	cr7*4+2,2f
+#ifdef __BIG_ENDIAN__
+	rotldi	r9,r9,16
+#endif
+95:	sth	r9,0(r3)
+#ifdef __LITTLE_ENDIAN__
+	rotrdi	r9,r9,16
+#endif
+	addi	r3,r3,2
+2:	bf	cr7*4+3,3f
+#ifdef __BIG_ENDIAN__
+	rotldi	r9,r9,8
+#endif
+96:	stb	r9,0(r3)
+#ifdef __LITTLE_ENDIAN__
+	rotrdi	r9,r9,8
+#endif
+3:	li	r3,0
+	blr
 
 .Ldst_unaligned:
-	PPC_MTOCRF	0x01,r6		/* put #bytes to 8B bdry into cr7 */
+	PPC_MTOCRF(0x01,r6)		/* put #bytes to 8B bdry into cr7 */
 	subf	r5,r6,r5
 	li	r7,0
-	cmpldi	r1,r5,16
+	cmpldi	cr1,r5,16
 	bf	cr7*4+3,1f
 35:	lbz	r0,0(r4)
 81:	stb	r0,0(r3)
@@ -150,7 +224,7 @@ _GLOBAL(__copy_tofrom_user)
 2:	bf	cr7*4+1,3f
 37:	lwzx	r0,r7,r4
 83:	stwx	r0,r7,r3
-3:	PPC_MTOCRF	0x01,r5
+3:	PPC_MTOCRF(0x01,r5)
 	add	r4,r6,r4
 	add	r3,r6,r3
 	b	.Ldst_aligned
@@ -193,7 +267,9 @@ _GLOBAL(__copy_tofrom_user)
 131:
 	addi	r3,r3,8
 120:
+320:
 122:
+322:
 124:
 125:
 126:
@@ -202,10 +278,11 @@ _GLOBAL(__copy_tofrom_user)
 129:
 133:
 	addi	r3,r3,8
-121:
 132:
 	addi	r3,r3,8
-123:
+121:
+321:
+344:
 134:
 135:
 138:
@@ -213,6 +290,9 @@ _GLOBAL(__copy_tofrom_user)
 140:
 141:
 142:
+123:
+144:
+145:
 
 /*
  * here we have had a fault on a load and r3 points to the first
@@ -274,18 +354,22 @@ _GLOBAL(__copy_tofrom_user)
 183:
 	add	r3,r3,r7
 	b	1f
+371:
 180:
 	addi	r3,r3,8
 171:
 177:
 	addi	r3,r3,8
-170:
-172:
+370:
+372:
 176:
 178:
 	addi	r3,r3,4
 185:
 	addi	r3,r3,4
+170:
+172:
+345:
 173:
 174:
 175:
@@ -296,6 +380,9 @@ _GLOBAL(__copy_tofrom_user)
 187:
 188:
 189:	
+194:
+195:
+196:
 1:
 	ld	r6,-24(r1)
 	ld	r5,-8(r1)
@@ -309,14 +396,24 @@ _GLOBAL(__copy_tofrom_user)
 	.section __ex_table,"a"
 	.align	3
 	.llong	20b,120b
+	.llong	220b,320b
 	.llong	21b,121b
+	.llong	221b,321b
 	.llong	70b,170b
+	.llong	270b,370b
 	.llong	22b,122b
+	.llong	222b,322b
 	.llong	71b,171b
+	.llong	271b,371b
 	.llong	72b,172b
+	.llong	272b,372b
+	.llong	244b,344b
+	.llong	245b,345b
 	.llong	23b,123b
 	.llong	73b,173b
+	.llong	44b,144b
 	.llong	74b,174b
+	.llong	45b,145b
 	.llong	75b,175b
 	.llong	24b,124b
 	.llong	25b,125b
@@ -334,6 +431,9 @@ _GLOBAL(__copy_tofrom_user)
 	.llong	79b,179b
 	.llong	80b,180b
 	.llong	34b,134b
+	.llong	94b,194b
+	.llong	95b,195b
+	.llong	96b,196b
 	.llong	35b,135b
 	.llong	81b,181b
 	.llong	36b,136b
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
new file mode 100644
index 00000000000..c46c876ac96
--- /dev/null
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -0,0 +1,721 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2011
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
+	.macro err1
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Ldo_err1
+	.previous
+	.endm
+
+	.macro err2
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldo_err2
+	.previous
+	.endm
+
+#ifdef CONFIG_ALTIVEC
+	.macro err3
+300:
+	.section __ex_table,"a"
+	.align 3
+	.llong 300b,.Ldo_err3
+	.previous
+	.endm
+
+	.macro err4
+400:
+	.section __ex_table,"a"
+	.align 3
+	.llong 400b,.Ldo_err4
+	.previous
+	.endm
+
+
+.Ldo_err4:
+	ld	r16,STK_REG(R16)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r14,STK_REG(R14)(r1)
+.Ldo_err3:
+	bl	exit_vmx_usercopy
+	ld	r0,STACKFRAMESIZE+16(r1)
+	mtlr	r0
+	b	.Lexit
+#endif /* CONFIG_ALTIVEC */
+
+.Ldo_err2:
+	ld	r22,STK_REG(R22)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r14,STK_REG(R14)(r1)
+.Lexit:
+	addi	r1,r1,STACKFRAMESIZE
+.Ldo_err1:
+	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
+	b	__copy_tofrom_user_base
+
+
+_GLOBAL(__copy_tofrom_user_power7)
+#ifdef CONFIG_ALTIVEC
+	cmpldi	r5,16
+	cmpldi	cr1,r5,4096
+
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
+
+	blt	.Lshort_copy
+	bgt	cr1,.Lvmx_copy
+#else
+	cmpldi	r5,16
+
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
+
+	blt	.Lshort_copy
+#endif
+
+.Lnonvmx_copy:
+	/* Get the source 8B aligned */
+	neg	r6,r4
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	bf	cr7*4+3,1f
+err1;	lbz	r0,0(r4)
+	addi	r4,r4,1
+err1;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err1;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err1;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r5,r5,r6
+	cmpldi	r5,128
+	blt	5f
+
+	mflr	r0
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+	std	r21,STK_REG(R21)(r1)
+	std	r22,STK_REG(R22)(r1)
+	std	r0,STACKFRAMESIZE+16(r1)
+
+	srdi	r6,r5,7
+	mtctr	r6
+
+	/* Now do cacheline (128B) sized loads and stores. */
+	.align	5
+4:
+err2;	ld	r0,0(r4)
+err2;	ld	r6,8(r4)
+err2;	ld	r7,16(r4)
+err2;	ld	r8,24(r4)
+err2;	ld	r9,32(r4)
+err2;	ld	r10,40(r4)
+err2;	ld	r11,48(r4)
+err2;	ld	r12,56(r4)
+err2;	ld	r14,64(r4)
+err2;	ld	r15,72(r4)
+err2;	ld	r16,80(r4)
+err2;	ld	r17,88(r4)
+err2;	ld	r18,96(r4)
+err2;	ld	r19,104(r4)
+err2;	ld	r20,112(r4)
+err2;	ld	r21,120(r4)
+	addi	r4,r4,128
+err2;	std	r0,0(r3)
+err2;	std	r6,8(r3)
+err2;	std	r7,16(r3)
+err2;	std	r8,24(r3)
+err2;	std	r9,32(r3)
+err2;	std	r10,40(r3)
+err2;	std	r11,48(r3)
+err2;	std	r12,56(r3)
+err2;	std	r14,64(r3)
+err2;	std	r15,72(r3)
+err2;	std	r16,80(r3)
+err2;	std	r17,88(r3)
+err2;	std	r18,96(r3)
+err2;	std	r19,104(r3)
+err2;	std	r20,112(r3)
+err2;	std	r21,120(r3)
+	addi	r3,r3,128
+	bdnz	4b
+
+	clrldi	r5,r5,(64-7)
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r22,STK_REG(R22)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	/* Up to 127B to go */
+5:	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+6:	bf	cr7*4+1,7f
+err1;	ld	r0,0(r4)
+err1;	ld	r6,8(r4)
+err1;	ld	r7,16(r4)
+err1;	ld	r8,24(r4)
+err1;	ld	r9,32(r4)
+err1;	ld	r10,40(r4)
+err1;	ld	r11,48(r4)
+err1;	ld	r12,56(r4)
+	addi	r4,r4,64
+err1;	std	r0,0(r3)
+err1;	std	r6,8(r3)
+err1;	std	r7,16(r3)
+err1;	std	r8,24(r3)
+err1;	std	r9,32(r3)
+err1;	std	r10,40(r3)
+err1;	std	r11,48(r3)
+err1;	std	r12,56(r3)
+	addi	r3,r3,64
+
+	/* Up to 63B to go */
+7:	bf	cr7*4+2,8f
+err1;	ld	r0,0(r4)
+err1;	ld	r6,8(r4)
+err1;	ld	r7,16(r4)
+err1;	ld	r8,24(r4)
+	addi	r4,r4,32
+err1;	std	r0,0(r3)
+err1;	std	r6,8(r3)
+err1;	std	r7,16(r3)
+err1;	std	r8,24(r3)
+	addi	r3,r3,32
+
+	/* Up to 31B to go */
+8:	bf	cr7*4+3,9f
+err1;	ld	r0,0(r4)
+err1;	ld	r6,8(r4)
+	addi	r4,r4,16
+err1;	std	r0,0(r3)
+err1;	std	r6,8(r3)
+	addi	r3,r3,16
+
+9:	clrldi	r5,r5,(64-4)
+
+	/* Up to 15B to go */
+.Lshort_copy:
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+err1;	lwz	r6,4(r4)
+	addi	r4,r4,8
+err1;	stw	r0,0(r3)
+err1;	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+err1;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+err1;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+err1;	lbz	r0,0(r4)
+err1;	stb	r0,0(r3)
+
+15:	li	r3,0
+	blr
+
+.Lunwind_stack_nonvmx_copy:
+	addi	r1,r1,STACKFRAMESIZE
+	b	.Lnonvmx_copy
+
+#ifdef CONFIG_ALTIVEC
+.Lvmx_copy:
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	enter_vmx_usercopy
+	cmpwi	cr1,r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STK_REG(R31)(r1)
+	ld	r4,STK_REG(R30)(r1)
+	ld	r5,STK_REG(R29)(r1)
+	mtlr	r0
+
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	r7,0x3FF
+	ble	1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	/* setup read stream 0 */
+	dcbt	r0,r6,0b01000   /* addr from */
+	dcbt	r0,r7,0b01010   /* length and depth from */
+	/* setup write stream 1 */
+	dcbtst	r0,r9,0b01000   /* addr to */
+	dcbtst	r0,r10,0b01010  /* length and depth to */
+	eieio
+	dcbt	r0,r8,0b01010	/* all streams GO */
+.machine pop
+
+	beq	cr1,.Lunwind_stack_nonvmx_copy
+
+	/*
+	 * If source and destination are not relatively aligned we use a
+	 * slower permute loop.
+	 */
+	xor	r6,r4,r3
+	rldicl.	r6,r6,0,(64-4)
+	bne	.Lvmx_unaligned_copy
+
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+err3;	lbz	r0,0(r4)
+	addi	r4,r4,1
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err3;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err3;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err3;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err3;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+err3;	ld	r0,0(r4)
+	addi	r4,r4,8
+err3;	std	r0,0(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the desination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	bf	cr7*4+3,5f
+err3;	lvx	vr1,r0,r4
+	addi	r4,r4,16
+err3;	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+5:	bf	cr7*4+2,6f
+err3;	lvx	vr1,r0,r4
+err3;	lvx	vr0,r4,r9
+	addi	r4,r4,32
+err3;	stvx	vr1,r0,r3
+err3;	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+err3;	lvx	vr3,r0,r4
+err3;	lvx	vr2,r4,r9
+err3;	lvx	vr1,r4,r10
+err3;	lvx	vr0,r4,r11
+	addi	r4,r4,64
+err3;	stvx	vr3,r0,r3
+err3;	stvx	vr2,r3,r9
+err3;	stvx	vr1,r3,r10
+err3;	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+err4;	lvx	vr7,r0,r4
+err4;	lvx	vr6,r4,r9
+err4;	lvx	vr5,r4,r10
+err4;	lvx	vr4,r4,r11
+err4;	lvx	vr3,r4,r12
+err4;	lvx	vr2,r4,r14
+err4;	lvx	vr1,r4,r15
+err4;	lvx	vr0,r4,r16
+	addi	r4,r4,128
+err4;	stvx	vr7,r0,r3
+err4;	stvx	vr6,r3,r9
+err4;	stvx	vr5,r3,r10
+err4;	stvx	vr4,r3,r11
+err4;	stvx	vr3,r3,r12
+err4;	stvx	vr2,r3,r14
+err4;	stvx	vr1,r3,r15
+err4;	stvx	vr0,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+err3;	lvx	vr3,r0,r4
+err3;	lvx	vr2,r4,r9
+err3;	lvx	vr1,r4,r10
+err3;	lvx	vr0,r4,r11
+	addi	r4,r4,64
+err3;	stvx	vr3,r0,r3
+err3;	stvx	vr2,r3,r9
+err3;	stvx	vr1,r3,r10
+err3;	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+err3;	lvx	vr1,r0,r4
+err3;	lvx	vr0,r4,r9
+	addi	r4,r4,32
+err3;	stvx	vr1,r0,r3
+err3;	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+err3;	lvx	vr1,r0,r4
+	addi	r4,r4,16
+err3;	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+err3;	ld	r0,0(r4)
+	addi	r4,r4,8
+err3;	std	r0,0(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+err3;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err3;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+err3;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err3;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+err3;	lbz	r0,0(r4)
+err3;	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	b	exit_vmx_usercopy	/* tail call optimise */
+
+.Lvmx_unaligned_copy:
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+err3;	lbz	r0,0(r4)
+	addi	r4,r4,1
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err3;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err3;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err3;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err3;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+err3;	lwz	r7,4(r4)
+	addi	r4,r4,8
+err3;	stw	r0,0(r3)
+err3;	stw	r7,4(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the desination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	LVS(vr16,0,r4)		/* Setup permute control vector */
+err3;	lvx	vr0,0,r4
+	addi	r4,r4,16
+
+	bf	cr7*4+3,5f
+err3;	lvx	vr1,r0,r4
+	VPERM(vr8,vr0,vr1,vr16)
+	addi	r4,r4,16
+err3;	stvx	vr8,r0,r3
+	addi	r3,r3,16
+	vor	vr0,vr1,vr1
+
+5:	bf	cr7*4+2,6f
+err3;	lvx	vr1,r0,r4
+	VPERM(vr8,vr0,vr1,vr16)
+err3;	lvx	vr0,r4,r9
+	VPERM(vr9,vr1,vr0,vr16)
+	addi	r4,r4,32
+err3;	stvx	vr8,r0,r3
+err3;	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+err3;	lvx	vr3,r0,r4
+	VPERM(vr8,vr0,vr3,vr16)
+err3;	lvx	vr2,r4,r9
+	VPERM(vr9,vr3,vr2,vr16)
+err3;	lvx	vr1,r4,r10
+	VPERM(vr10,vr2,vr1,vr16)
+err3;	lvx	vr0,r4,r11
+	VPERM(vr11,vr1,vr0,vr16)
+	addi	r4,r4,64
+err3;	stvx	vr8,r0,r3
+err3;	stvx	vr9,r3,r9
+err3;	stvx	vr10,r3,r10
+err3;	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+err4;	lvx	vr7,r0,r4
+	VPERM(vr8,vr0,vr7,vr16)
+err4;	lvx	vr6,r4,r9
+	VPERM(vr9,vr7,vr6,vr16)
+err4;	lvx	vr5,r4,r10
+	VPERM(vr10,vr6,vr5,vr16)
+err4;	lvx	vr4,r4,r11
+	VPERM(vr11,vr5,vr4,vr16)
+err4;	lvx	vr3,r4,r12
+	VPERM(vr12,vr4,vr3,vr16)
+err4;	lvx	vr2,r4,r14
+	VPERM(vr13,vr3,vr2,vr16)
+err4;	lvx	vr1,r4,r15
+	VPERM(vr14,vr2,vr1,vr16)
+err4;	lvx	vr0,r4,r16
+	VPERM(vr15,vr1,vr0,vr16)
+	addi	r4,r4,128
+err4;	stvx	vr8,r0,r3
+err4;	stvx	vr9,r3,r9
+err4;	stvx	vr10,r3,r10
+err4;	stvx	vr11,r3,r11
+err4;	stvx	vr12,r3,r12
+err4;	stvx	vr13,r3,r14
+err4;	stvx	vr14,r3,r15
+err4;	stvx	vr15,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+err3;	lvx	vr3,r0,r4
+	VPERM(vr8,vr0,vr3,vr16)
+err3;	lvx	vr2,r4,r9
+	VPERM(vr9,vr3,vr2,vr16)
+err3;	lvx	vr1,r4,r10
+	VPERM(vr10,vr2,vr1,vr16)
+err3;	lvx	vr0,r4,r11
+	VPERM(vr11,vr1,vr0,vr16)
+	addi	r4,r4,64
+err3;	stvx	vr8,r0,r3
+err3;	stvx	vr9,r3,r9
+err3;	stvx	vr10,r3,r10
+err3;	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+err3;	lvx	vr1,r0,r4
+	VPERM(vr8,vr0,vr1,vr16)
+err3;	lvx	vr0,r4,r9
+	VPERM(vr9,vr1,vr0,vr16)
+	addi	r4,r4,32
+err3;	stvx	vr8,r0,r3
+err3;	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+err3;	lvx	vr1,r0,r4
+	VPERM(vr8,vr0,vr1,vr16)
+	addi	r4,r4,16
+err3;	stvx	vr8,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	addi	r4,r4,-16	/* Unwind the +16 load offset */
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+err3;	lwz	r6,4(r4)
+	addi	r4,r4,8
+err3;	stw	r0,0(r3)
+err3;	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+err3;	lwz	r0,0(r4)
+	addi	r4,r4,4
+err3;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+err3;	lhz	r0,0(r4)
+	addi	r4,r4,2
+err3;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+err3;	lbz	r0,0(r4)
+err3;	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	b	exit_vmx_usercopy	/* tail call optimise */
+#endif /* CONFiG_ALTIVEC */
diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S
new file mode 100644
index 00000000000..a5b30c71a8d
--- /dev/null
+++ b/arch/powerpc/lib/crtsavres.S
@@ -0,0 +1,547 @@
+/*
+ * Special support for eabi and SVR4
+ *
+ *   Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc.
+ *   Copyright 2008 Freescale Semiconductor, Inc.
+ *   Written By Michael Meissner
+ *
+ * Based on gcc/config/rs6000/crtsavres.asm from gcc
+ * 64 bit additions from reading the PPC elf64abi document.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * In addition to the permissions in the GNU General Public License, the
+ * Free Software Foundation gives you unlimited permission to link the
+ * compiled version of this file with other programs, and to distribute
+ * those programs without any restriction coming from the use of this
+ * file.  (The General Public License restrictions do apply in other
+ * respects; for example, they cover modification of the file, and
+ * distribution when not linked into another program.)
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ *    As a special exception, if you link this library with files
+ *    compiled with GCC to produce an executable, this does not cause
+ *    the resulting executable to be covered by the GNU General Public License.
+ *    This exception does not however invalidate any other reasons why
+ *    the executable file might be covered by the GNU General Public License.
+ */
+
+#include <asm/ppc_asm.h>
+
+	.file	"crtsavres.S"
+
+#ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
+
+#ifndef CONFIG_PPC64
+
+	.section ".text"
+
+/* Routines for saving integer registers, called by the compiler.  */
+/* Called with r11 pointing to the stack header word of the caller of the */
+/* function, just beyond the end of the integer save area.  */
+
+_GLOBAL(_savegpr_14)
+_GLOBAL(_save32gpr_14)
+	stw	14,-72(11)	/* save gp registers */
+_GLOBAL(_savegpr_15)
+_GLOBAL(_save32gpr_15)
+	stw	15,-68(11)
+_GLOBAL(_savegpr_16)
+_GLOBAL(_save32gpr_16)
+	stw	16,-64(11)
+_GLOBAL(_savegpr_17)
+_GLOBAL(_save32gpr_17)
+	stw	17,-60(11)
+_GLOBAL(_savegpr_18)
+_GLOBAL(_save32gpr_18)
+	stw	18,-56(11)
+_GLOBAL(_savegpr_19)
+_GLOBAL(_save32gpr_19)
+	stw	19,-52(11)
+_GLOBAL(_savegpr_20)
+_GLOBAL(_save32gpr_20)
+	stw	20,-48(11)
+_GLOBAL(_savegpr_21)
+_GLOBAL(_save32gpr_21)
+	stw	21,-44(11)
+_GLOBAL(_savegpr_22)
+_GLOBAL(_save32gpr_22)
+	stw	22,-40(11)
+_GLOBAL(_savegpr_23)
+_GLOBAL(_save32gpr_23)
+	stw	23,-36(11)
+_GLOBAL(_savegpr_24)
+_GLOBAL(_save32gpr_24)
+	stw	24,-32(11)
+_GLOBAL(_savegpr_25)
+_GLOBAL(_save32gpr_25)
+	stw	25,-28(11)
+_GLOBAL(_savegpr_26)
+_GLOBAL(_save32gpr_26)
+	stw	26,-24(11)
+_GLOBAL(_savegpr_27)
+_GLOBAL(_save32gpr_27)
+	stw	27,-20(11)
+_GLOBAL(_savegpr_28)
+_GLOBAL(_save32gpr_28)
+	stw	28,-16(11)
+_GLOBAL(_savegpr_29)
+_GLOBAL(_save32gpr_29)
+	stw	29,-12(11)
+_GLOBAL(_savegpr_30)
+_GLOBAL(_save32gpr_30)
+	stw	30,-8(11)
+_GLOBAL(_savegpr_31)
+_GLOBAL(_save32gpr_31)
+	stw	31,-4(11)
+	blr
+
+/* Routines for restoring integer registers, called by the compiler.  */
+/* Called with r11 pointing to the stack header word of the caller of the */
+/* function, just beyond the end of the integer restore area.  */
+
+_GLOBAL(_restgpr_14)
+_GLOBAL(_rest32gpr_14)
+	lwz	14,-72(11)	/* restore gp registers */
+_GLOBAL(_restgpr_15)
+_GLOBAL(_rest32gpr_15)
+	lwz	15,-68(11)
+_GLOBAL(_restgpr_16)
+_GLOBAL(_rest32gpr_16)
+	lwz	16,-64(11)
+_GLOBAL(_restgpr_17)
+_GLOBAL(_rest32gpr_17)
+	lwz	17,-60(11)
+_GLOBAL(_restgpr_18)
+_GLOBAL(_rest32gpr_18)
+	lwz	18,-56(11)
+_GLOBAL(_restgpr_19)
+_GLOBAL(_rest32gpr_19)
+	lwz	19,-52(11)
+_GLOBAL(_restgpr_20)
+_GLOBAL(_rest32gpr_20)
+	lwz	20,-48(11)
+_GLOBAL(_restgpr_21)
+_GLOBAL(_rest32gpr_21)
+	lwz	21,-44(11)
+_GLOBAL(_restgpr_22)
+_GLOBAL(_rest32gpr_22)
+	lwz	22,-40(11)
+_GLOBAL(_restgpr_23)
+_GLOBAL(_rest32gpr_23)
+	lwz	23,-36(11)
+_GLOBAL(_restgpr_24)
+_GLOBAL(_rest32gpr_24)
+	lwz	24,-32(11)
+_GLOBAL(_restgpr_25)
+_GLOBAL(_rest32gpr_25)
+	lwz	25,-28(11)
+_GLOBAL(_restgpr_26)
+_GLOBAL(_rest32gpr_26)
+	lwz	26,-24(11)
+_GLOBAL(_restgpr_27)
+_GLOBAL(_rest32gpr_27)
+	lwz	27,-20(11)
+_GLOBAL(_restgpr_28)
+_GLOBAL(_rest32gpr_28)
+	lwz	28,-16(11)
+_GLOBAL(_restgpr_29)
+_GLOBAL(_rest32gpr_29)
+	lwz	29,-12(11)
+_GLOBAL(_restgpr_30)
+_GLOBAL(_rest32gpr_30)
+	lwz	30,-8(11)
+_GLOBAL(_restgpr_31)
+_GLOBAL(_rest32gpr_31)
+	lwz	31,-4(11)
+	blr
+
+/* Routines for restoring integer registers, called by the compiler.  */
+/* Called with r11 pointing to the stack header word of the caller of the */
+/* function, just beyond the end of the integer restore area.  */
+
+_GLOBAL(_restgpr_14_x)
+_GLOBAL(_rest32gpr_14_x)
+	lwz	14,-72(11)	/* restore gp registers */
+_GLOBAL(_restgpr_15_x)
+_GLOBAL(_rest32gpr_15_x)
+	lwz	15,-68(11)
+_GLOBAL(_restgpr_16_x)
+_GLOBAL(_rest32gpr_16_x)
+	lwz	16,-64(11)
+_GLOBAL(_restgpr_17_x)
+_GLOBAL(_rest32gpr_17_x)
+	lwz	17,-60(11)
+_GLOBAL(_restgpr_18_x)
+_GLOBAL(_rest32gpr_18_x)
+	lwz	18,-56(11)
+_GLOBAL(_restgpr_19_x)
+_GLOBAL(_rest32gpr_19_x)
+	lwz	19,-52(11)
+_GLOBAL(_restgpr_20_x)
+_GLOBAL(_rest32gpr_20_x)
+	lwz	20,-48(11)
+_GLOBAL(_restgpr_21_x)
+_GLOBAL(_rest32gpr_21_x)
+	lwz	21,-44(11)
+_GLOBAL(_restgpr_22_x)
+_GLOBAL(_rest32gpr_22_x)
+	lwz	22,-40(11)
+_GLOBAL(_restgpr_23_x)
+_GLOBAL(_rest32gpr_23_x)
+	lwz	23,-36(11)
+_GLOBAL(_restgpr_24_x)
+_GLOBAL(_rest32gpr_24_x)
+	lwz	24,-32(11)
+_GLOBAL(_restgpr_25_x)
+_GLOBAL(_rest32gpr_25_x)
+	lwz	25,-28(11)
+_GLOBAL(_restgpr_26_x)
+_GLOBAL(_rest32gpr_26_x)
+	lwz	26,-24(11)
+_GLOBAL(_restgpr_27_x)
+_GLOBAL(_rest32gpr_27_x)
+	lwz	27,-20(11)
+_GLOBAL(_restgpr_28_x)
+_GLOBAL(_rest32gpr_28_x)
+	lwz	28,-16(11)
+_GLOBAL(_restgpr_29_x)
+_GLOBAL(_rest32gpr_29_x)
+	lwz	29,-12(11)
+_GLOBAL(_restgpr_30_x)
+_GLOBAL(_rest32gpr_30_x)
+	lwz	30,-8(11)
+_GLOBAL(_restgpr_31_x)
+_GLOBAL(_rest32gpr_31_x)
+	lwz	0,4(11)
+	lwz	31,-4(11)
+	mtlr	0
+	mr	1,11
+	blr
+
+#ifdef CONFIG_ALTIVEC
+/* Called with r0 pointing just beyond the end of the vector save area.  */
+
+_GLOBAL(_savevr_20)
+	li	r11,-192
+	stvx	vr20,r11,r0
+_GLOBAL(_savevr_21)
+	li	r11,-176
+	stvx	vr21,r11,r0
+_GLOBAL(_savevr_22)
+	li	r11,-160
+	stvx	vr22,r11,r0
+_GLOBAL(_savevr_23)
+	li	r11,-144
+	stvx	vr23,r11,r0
+_GLOBAL(_savevr_24)
+	li	r11,-128
+	stvx	vr24,r11,r0
+_GLOBAL(_savevr_25)
+	li	r11,-112
+	stvx	vr25,r11,r0
+_GLOBAL(_savevr_26)
+	li	r11,-96
+	stvx	vr26,r11,r0
+_GLOBAL(_savevr_27)
+	li	r11,-80
+	stvx	vr27,r11,r0
+_GLOBAL(_savevr_28)
+	li	r11,-64
+	stvx	vr28,r11,r0
+_GLOBAL(_savevr_29)
+	li	r11,-48
+	stvx	vr29,r11,r0
+_GLOBAL(_savevr_30)
+	li	r11,-32
+	stvx	vr30,r11,r0
+_GLOBAL(_savevr_31)
+	li	r11,-16
+	stvx	vr31,r11,r0
+	blr
+
+_GLOBAL(_restvr_20)
+	li	r11,-192
+	lvx	vr20,r11,r0
+_GLOBAL(_restvr_21)
+	li	r11,-176
+	lvx	vr21,r11,r0
+_GLOBAL(_restvr_22)
+	li	r11,-160
+	lvx	vr22,r11,r0
+_GLOBAL(_restvr_23)
+	li	r11,-144
+	lvx	vr23,r11,r0
+_GLOBAL(_restvr_24)
+	li	r11,-128
+	lvx	vr24,r11,r0
+_GLOBAL(_restvr_25)
+	li	r11,-112
+	lvx	vr25,r11,r0
+_GLOBAL(_restvr_26)
+	li	r11,-96
+	lvx	vr26,r11,r0
+_GLOBAL(_restvr_27)
+	li	r11,-80
+	lvx	vr27,r11,r0
+_GLOBAL(_restvr_28)
+	li	r11,-64
+	lvx	vr28,r11,r0
+_GLOBAL(_restvr_29)
+	li	r11,-48
+	lvx	vr29,r11,r0
+_GLOBAL(_restvr_30)
+	li	r11,-32
+	lvx	vr30,r11,r0
+_GLOBAL(_restvr_31)
+	li	r11,-16
+	lvx	vr31,r11,r0
+	blr
+
+#endif /* CONFIG_ALTIVEC */
+
+#else /* CONFIG_PPC64 */
+
+	.section ".text.save.restore","ax",@progbits
+
+.globl	_savegpr0_14
+_savegpr0_14:
+	std	r14,-144(r1)
+.globl	_savegpr0_15
+_savegpr0_15:
+	std	r15,-136(r1)
+.globl	_savegpr0_16
+_savegpr0_16:
+	std	r16,-128(r1)
+.globl	_savegpr0_17
+_savegpr0_17:
+	std	r17,-120(r1)
+.globl	_savegpr0_18
+_savegpr0_18:
+	std	r18,-112(r1)
+.globl	_savegpr0_19
+_savegpr0_19:
+	std	r19,-104(r1)
+.globl	_savegpr0_20
+_savegpr0_20:
+	std	r20,-96(r1)
+.globl	_savegpr0_21
+_savegpr0_21:
+	std	r21,-88(r1)
+.globl	_savegpr0_22
+_savegpr0_22:
+	std	r22,-80(r1)
+.globl	_savegpr0_23
+_savegpr0_23:
+	std	r23,-72(r1)
+.globl	_savegpr0_24
+_savegpr0_24:
+	std	r24,-64(r1)
+.globl	_savegpr0_25
+_savegpr0_25:
+	std	r25,-56(r1)
+.globl	_savegpr0_26
+_savegpr0_26:
+	std	r26,-48(r1)
+.globl	_savegpr0_27
+_savegpr0_27:
+	std	r27,-40(r1)
+.globl	_savegpr0_28
+_savegpr0_28:
+	std	r28,-32(r1)
+.globl	_savegpr0_29
+_savegpr0_29:
+	std	r29,-24(r1)
+.globl	_savegpr0_30
+_savegpr0_30:
+	std	r30,-16(r1)
+.globl	_savegpr0_31
+_savegpr0_31:
+	std	r31,-8(r1)
+	std	r0,16(r1)
+	blr
+
+.globl	_restgpr0_14
+_restgpr0_14:
+	ld	r14,-144(r1)
+.globl	_restgpr0_15
+_restgpr0_15:
+	ld	r15,-136(r1)
+.globl	_restgpr0_16
+_restgpr0_16:
+	ld	r16,-128(r1)
+.globl	_restgpr0_17
+_restgpr0_17:
+	ld	r17,-120(r1)
+.globl	_restgpr0_18
+_restgpr0_18:
+	ld	r18,-112(r1)
+.globl	_restgpr0_19
+_restgpr0_19:
+	ld	r19,-104(r1)
+.globl	_restgpr0_20
+_restgpr0_20:
+	ld	r20,-96(r1)
+.globl	_restgpr0_21
+_restgpr0_21:
+	ld	r21,-88(r1)
+.globl	_restgpr0_22
+_restgpr0_22:
+	ld	r22,-80(r1)
+.globl	_restgpr0_23
+_restgpr0_23:
+	ld	r23,-72(r1)
+.globl	_restgpr0_24
+_restgpr0_24:
+	ld	r24,-64(r1)
+.globl	_restgpr0_25
+_restgpr0_25:
+	ld	r25,-56(r1)
+.globl	_restgpr0_26
+_restgpr0_26:
+	ld	r26,-48(r1)
+.globl	_restgpr0_27
+_restgpr0_27:
+	ld	r27,-40(r1)
+.globl	_restgpr0_28
+_restgpr0_28:
+	ld	r28,-32(r1)
+.globl	_restgpr0_29
+_restgpr0_29:
+	ld	r0,16(r1)
+	ld	r29,-24(r1)
+	mtlr	r0
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+.globl	_restgpr0_30
+_restgpr0_30:
+	ld	r30,-16(r1)
+.globl	_restgpr0_31
+_restgpr0_31:
+	ld	r0,16(r1)
+	ld	r31,-8(r1)
+	mtlr	r0
+	blr
+
+#ifdef CONFIG_ALTIVEC
+/* Called with r0 pointing just beyond the end of the vector save area.  */
+
+.globl	_savevr_20
+_savevr_20:
+	li	r12,-192
+	stvx	vr20,r12,r0
+.globl	_savevr_21
+_savevr_21:
+	li	r12,-176
+	stvx	vr21,r12,r0
+.globl	_savevr_22
+_savevr_22:
+	li	r12,-160
+	stvx	vr22,r12,r0
+.globl	_savevr_23
+_savevr_23:
+	li	r12,-144
+	stvx	vr23,r12,r0
+.globl	_savevr_24
+_savevr_24:
+	li	r12,-128
+	stvx	vr24,r12,r0
+.globl	_savevr_25
+_savevr_25:
+	li	r12,-112
+	stvx	vr25,r12,r0
+.globl	_savevr_26
+_savevr_26:
+	li	r12,-96
+	stvx	vr26,r12,r0
+.globl	_savevr_27
+_savevr_27:
+	li	r12,-80
+	stvx	vr27,r12,r0
+.globl	_savevr_28
+_savevr_28:
+	li	r12,-64
+	stvx	vr28,r12,r0
+.globl	_savevr_29
+_savevr_29:
+	li	r12,-48
+	stvx	vr29,r12,r0
+.globl	_savevr_30
+_savevr_30:
+	li	r12,-32
+	stvx	vr30,r12,r0
+.globl	_savevr_31
+_savevr_31:
+	li	r12,-16
+	stvx	vr31,r12,r0
+	blr
+
+.globl	_restvr_20
+_restvr_20:
+	li	r12,-192
+	lvx	vr20,r12,r0
+.globl	_restvr_21
+_restvr_21:
+	li	r12,-176
+	lvx	vr21,r12,r0
+.globl	_restvr_22
+_restvr_22:
+	li	r12,-160
+	lvx	vr22,r12,r0
+.globl	_restvr_23
+_restvr_23:
+	li	r12,-144
+	lvx	vr23,r12,r0
+.globl	_restvr_24
+_restvr_24:
+	li	r12,-128
+	lvx	vr24,r12,r0
+.globl	_restvr_25
+_restvr_25:
+	li	r12,-112
+	lvx	vr25,r12,r0
+.globl	_restvr_26
+_restvr_26:
+	li	r12,-96
+	lvx	vr26,r12,r0
+.globl	_restvr_27
+_restvr_27:
+	li	r12,-80
+	lvx	vr27,r12,r0
+.globl	_restvr_28
+_restvr_28:
+	li	r12,-64
+	lvx	vr28,r12,r0
+.globl	_restvr_29
+_restvr_29:
+	li	r12,-48
+	lvx	vr29,r12,r0
+.globl	_restvr_30
+_restvr_30:
+	li	r12,-32
+	lvx	vr30,r12,r0
+.globl	_restvr_31
+_restvr_31:
+	li	r12,-16
+	lvx	vr31,r12,r0
+	blr
+
+#endif /* CONFIG_ALTIVEC */
+
+#endif /* CONFIG_PPC64 */
+
+#endif
diff --git a/arch/powerpc/lib/devres.c b/arch/powerpc/lib/devres.c
new file mode 100644
index 00000000000..8df55fc3aad
--- /dev/null
+++ b/arch/powerpc/lib/devres.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/device.h>	/* devres_*(), devm_ioremap_release() */
+#include <linux/gfp.h>
+#include <linux/io.h>		/* ioremap_prot() */
+#include <linux/export.h>	/* EXPORT_SYMBOL() */
+
+/**
+ * devm_ioremap_prot - Managed ioremap_prot()
+ * @dev: Generic device to remap IO address for
+ * @offset: BUS offset to map
+ * @size: Size of map
+ * @flags: Page flags
+ *
+ * Managed ioremap_prot().  Map is automatically unmapped on driver
+ * detach.
+ */
+void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset,
+				 size_t size, unsigned long flags)
+{
+	void __iomem **ptr, *addr;
+
+	ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return NULL;
+
+	addr = ioremap_prot(offset, size, flags);
+	if (addr) {
+		*ptr = addr;
+		devres_add(dev, ptr);
+	} else
+		devres_free(ptr);
+
+	return addr;
+}
+EXPORT_SYMBOL(devm_ioremap_prot);
diff --git a/arch/powerpc/lib/dma-noncoherent.c b/arch/powerpc/lib/dma-noncoherent.c
deleted file mode 100644
index 6656d47841d..00000000000
--- a/arch/powerpc/lib/dma-noncoherent.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- *  PowerPC version derived from arch/arm/mm/consistent.c
- *    Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
- *
- *  Copyright (C) 2000 Russell King
- *
- * Consistent memory allocators.  Used for DMA devices that want to
- * share uncached memory with the processor core.  The function return
- * is the virtual address and 'dma_handle' is the physical address.
- * Mostly stolen from the ARM port, with some changes for PowerPC.
- *						-- Dan
- *
- * Reorganized to get rid of the arch-specific consistent_* functions
- * and provide non-coherent implementations for the DMA API. -Matt
- *
- * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
- * implementation. This is pulled straight from ARM and barely
- * modified. -Matt
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/highmem.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/tlbflush.h>
-
-/*
- * This address range defaults to a value that is safe for all
- * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
- * can be further configured for specific applications under
- * the "Advanced Setup" menu. -Matt
- */
-#define CONSISTENT_BASE	(CONFIG_CONSISTENT_START)
-#define CONSISTENT_END	(CONFIG_CONSISTENT_START + CONFIG_CONSISTENT_SIZE)
-#define CONSISTENT_OFFSET(x)	(((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
-
-/*
- * This is the page table (2MB) covering uncached, DMA consistent allocations
- */
-static pte_t *consistent_pte;
-static DEFINE_SPINLOCK(consistent_lock);
-
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- *  struct vm_struct {
- *    struct vm_region	region;
- *    unsigned long	flags;
- *    struct page	**pages;
- *    unsigned int	nr_pages;
- *    unsigned long	phys_addr;
- *  };
- *
- * get_vm_area() would then call vm_region_alloc with an appropriate
- * struct vm_region head (eg):
- *
- *  struct vm_region vmalloc_head = {
- *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
- *	.vm_start	= VMALLOC_START,
- *	.vm_end		= VMALLOC_END,
- *  };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vm_region_alloc().
- */
-struct vm_region {
-	struct list_head	vm_list;
-	unsigned long		vm_start;
-	unsigned long		vm_end;
-};
-
-static struct vm_region consistent_head = {
-	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
-	.vm_start	= CONSISTENT_BASE,
-	.vm_end		= CONSISTENT_END,
-};
-
-static struct vm_region *
-vm_region_alloc(struct vm_region *head, size_t size, gfp_t gfp)
-{
-	unsigned long addr = head->vm_start, end = head->vm_end - size;
-	unsigned long flags;
-	struct vm_region *c, *new;
-
-	new = kmalloc(sizeof(struct vm_region), gfp);
-	if (!new)
-		goto out;
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if ((addr + size) < addr)
-			goto nospc;
-		if ((addr + size) <= c->vm_start)
-			goto found;
-		addr = c->vm_end;
-		if (addr > end)
-			goto nospc;
-	}
-
- found:
-	/*
-	 * Insert this entry _before_ the one we found.
-	 */
-	list_add_tail(&new->vm_list, &c->vm_list);
-	new->vm_start = addr;
-	new->vm_end = addr + size;
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	return new;
-
- nospc:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	kfree(new);
- out:
-	return NULL;
-}
-
-static struct vm_region *vm_region_find(struct vm_region *head, unsigned long addr)
-{
-	struct vm_region *c;
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if (c->vm_start == addr)
-			goto out;
-	}
-	c = NULL;
- out:
-	return c;
-}
-
-/*
- * Allocate DMA-coherent memory space and return both the kernel remapped
- * virtual and bus address for that space.
- */
-void *
-__dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
-{
-	struct page *page;
-	struct vm_region *c;
-	unsigned long order;
-	u64 mask = 0x00ffffff, limit; /* ISA default */
-
-	if (!consistent_pte) {
-		printk(KERN_ERR "%s: not initialised\n", __func__);
-		dump_stack();
-		return NULL;
-	}
-
-	size = PAGE_ALIGN(size);
-	limit = (mask + 1) & ~mask;
-	if ((limit && size >= limit) || size >= (CONSISTENT_END - CONSISTENT_BASE)) {
-		printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
-		       size, mask);
-		return NULL;
-	}
-
-	order = get_order(size);
-
-	if (mask != 0xffffffff)
-		gfp |= GFP_DMA;
-
-	page = alloc_pages(gfp, order);
-	if (!page)
-		goto no_page;
-
-	/*
-	 * Invalidate any data that might be lurking in the
-	 * kernel direct-mapped region for device DMA.
-	 */
-	{
-		unsigned long kaddr = (unsigned long)page_address(page);
-		memset(page_address(page), 0, size);
-		flush_dcache_range(kaddr, kaddr + size);
-	}
-
-	/*
-	 * Allocate a virtual address in the consistent mapping region.
-	 */
-	c = vm_region_alloc(&consistent_head, size,
-			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
-	if (c) {
-		unsigned long vaddr = c->vm_start;
-		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
-		struct page *end = page + (1 << order);
-
-		split_page(page, order);
-
-		/*
-		 * Set the "dma handle"
-		 */
-		*handle = page_to_bus(page);
-
-		do {
-			BUG_ON(!pte_none(*pte));
-
-			SetPageReserved(page);
-			set_pte_at(&init_mm, vaddr,
-				   pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
-			page++;
-			pte++;
-			vaddr += PAGE_SIZE;
-		} while (size -= PAGE_SIZE);
-
-		/*
-		 * Free the otherwise unused pages.
-		 */
-		while (page < end) {
-			__free_page(page);
-			page++;
-		}
-
-		return (void *)c->vm_start;
-	}
-
-	if (page)
-		__free_pages(page, order);
- no_page:
-	return NULL;
-}
-EXPORT_SYMBOL(__dma_alloc_coherent);
-
-/*
- * free a page as defined by the above mapping.
- */
-void __dma_free_coherent(size_t size, void *vaddr)
-{
-	struct vm_region *c;
-	unsigned long flags, addr;
-	pte_t *ptep;
-
-	size = PAGE_ALIGN(size);
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	c = vm_region_find(&consistent_head, (unsigned long)vaddr);
-	if (!c)
-		goto no_area;
-
-	if ((c->vm_end - c->vm_start) != size) {
-		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
-		       __func__, c->vm_end - c->vm_start, size);
-		dump_stack();
-		size = c->vm_end - c->vm_start;
-	}
-
-	ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start);
-	addr = c->vm_start;
-	do {
-		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
-		unsigned long pfn;
-
-		ptep++;
-		addr += PAGE_SIZE;
-
-		if (!pte_none(pte) && pte_present(pte)) {
-			pfn = pte_pfn(pte);
-
-			if (pfn_valid(pfn)) {
-				struct page *page = pfn_to_page(pfn);
-				ClearPageReserved(page);
-
-				__free_page(page);
-				continue;
-			}
-		}
-
-		printk(KERN_CRIT "%s: bad page in kernel page table\n",
-		       __func__);
-	} while (size -= PAGE_SIZE);
-
-	flush_tlb_kernel_range(c->vm_start, c->vm_end);
-
-	list_del(&c->vm_list);
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
-	kfree(c);
-	return;
-
- no_area:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
-	       __func__, vaddr);
-	dump_stack();
-}
-EXPORT_SYMBOL(__dma_free_coherent);
-
-/*
- * Initialise the consistent memory allocation.
- */
-static int __init dma_alloc_init(void)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int ret = 0;
-
-	do {
-		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
-		pud = pud_alloc(&init_mm, pgd, CONSISTENT_BASE);
-		pmd = pmd_alloc(&init_mm, pud, CONSISTENT_BASE);
-		if (!pmd) {
-			printk(KERN_ERR "%s: no pmd tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-		WARN_ON(!pmd_none(*pmd));
-
-		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
-		if (!pte) {
-			printk(KERN_ERR "%s: no pte tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-
-		consistent_pte = pte;
-	} while (0);
-
-	return ret;
-}
-
-core_initcall(dma_alloc_init);
-
-/*
- * make an area consistent.
- */
-void __dma_sync(void *vaddr, size_t size, int direction)
-{
-	unsigned long start = (unsigned long)vaddr;
-	unsigned long end   = start + size;
-
-	switch (direction) {
-	case DMA_NONE:
-		BUG();
-	case DMA_FROM_DEVICE:	/* invalidate only */
-		invalidate_dcache_range(start, end);
-		break;
-	case DMA_TO_DEVICE:		/* writeback only */
-		clean_dcache_range(start, end);
-		break;
-	case DMA_BIDIRECTIONAL:	/* writeback and invalidate */
-		flush_dcache_range(start, end);
-		break;
-	}
-}
-EXPORT_SYMBOL(__dma_sync);
-
-#ifdef CONFIG_HIGHMEM
-/*
- * __dma_sync_page() implementation for systems using highmem.
- * In this case, each page of a buffer must be kmapped/kunmapped
- * in order to have a virtual address for __dma_sync(). This must
- * not sleep so kmap_atomic()/kunmap_atomic() are used.
- *
- * Note: yes, it is possible and correct to have a buffer extend
- * beyond the first page.
- */
-static inline void __dma_sync_page_highmem(struct page *page,
-		unsigned long offset, size_t size, int direction)
-{
-	size_t seg_size = min((size_t)(PAGE_SIZE - offset), size);
-	size_t cur_size = seg_size;
-	unsigned long flags, start, seg_offset = offset;
-	int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE;
-	int seg_nr = 0;
-
-	local_irq_save(flags);
-
-	do {
-		start = (unsigned long)kmap_atomic(page + seg_nr,
-				KM_PPC_SYNC_PAGE) + seg_offset;
-
-		/* Sync this buffer segment */
-		__dma_sync((void *)start, seg_size, direction);
-		kunmap_atomic((void *)start, KM_PPC_SYNC_PAGE);
-		seg_nr++;
-
-		/* Calculate next buffer segment size */
-		seg_size = min((size_t)PAGE_SIZE, size - cur_size);
-
-		/* Add the segment size to our running total */
-		cur_size += seg_size;
-		seg_offset = 0;
-	} while (seg_nr < nr_segs);
-
-	local_irq_restore(flags);
-}
-#endif /* CONFIG_HIGHMEM */
-
-/*
- * __dma_sync_page makes memory consistent. identical to __dma_sync, but
- * takes a struct page instead of a virtual address
- */
-void __dma_sync_page(struct page *page, unsigned long offset,
-	size_t size, int direction)
-{
-#ifdef CONFIG_HIGHMEM
-	__dma_sync_page_highmem(page, offset, size, direction);
-#else
-	unsigned long start = (unsigned long)page_address(page) + offset;
-	__dma_sync((void *)start, size, direction);
-#endif
-}
-EXPORT_SYMBOL(__dma_sync_page);
diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S
new file mode 100644
index 00000000000..f4613118132
--- /dev/null
+++ b/arch/powerpc/lib/feature-fixups-test.S
@@ -0,0 +1,761 @@
+/*
+ * Copyright 2008 Michael Ellerman, IBM Corporation.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/feature-fixups.h>
+#include <asm/ppc_asm.h>
+#include <asm/synch.h>
+
+	.text
+
+#define globl(x)		\
+	.globl x;	\
+x:
+
+globl(ftr_fixup_test1)
+	or	1,1,1
+	or	2,2,2	/* fixup will nop out this instruction */
+	or	3,3,3
+
+globl(end_ftr_fixup_test1)
+
+globl(ftr_fixup_test1_orig)
+	or	1,1,1
+	or	2,2,2
+	or	3,3,3
+
+globl(ftr_fixup_test1_expected)
+	or	1,1,1
+	nop
+	or	3,3,3
+
+globl(ftr_fixup_test2)
+	or	1,1,1
+	or	2,2,2	/* fixup will replace this with ftr_fixup_test2_alt */
+	or	3,3,3
+
+globl(end_ftr_fixup_test2)
+
+globl(ftr_fixup_test2_orig)
+	or	1,1,1
+	or	2,2,2
+	or	3,3,3
+
+globl(ftr_fixup_test2_alt)
+	or	31,31,31
+
+globl(ftr_fixup_test2_expected)
+	or	1,1,1
+	or	31,31,31
+	or	3,3,3
+
+globl(ftr_fixup_test3)
+	or	1,1,1
+	or	2,2,2	/* fixup will fail to replace this */
+	or	3,3,3
+
+globl(end_ftr_fixup_test3)
+
+globl(ftr_fixup_test3_orig)
+	or	1,1,1
+	or	2,2,2
+	or	3,3,3
+
+globl(ftr_fixup_test3_alt)
+	or	31,31,31
+	or	31,31,31
+
+globl(ftr_fixup_test4)
+	or	1,1,1
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+	or	3,3,3
+
+globl(end_ftr_fixup_test4)
+
+globl(ftr_fixup_test4_expected)
+	or	1,1,1
+	or	31,31,31
+	or	31,31,31
+	nop
+	nop
+	or	3,3,3
+
+globl(ftr_fixup_test4_orig)
+	or	1,1,1
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+	or	3,3,3
+
+globl(ftr_fixup_test4_alt)
+	or	31,31,31
+	or	31,31,31
+
+
+globl(ftr_fixup_test5)
+	or	1,1,1
+BEGIN_FTR_SECTION
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+FTR_SECTION_ELSE
+2:	b	3f
+3:	or	5,5,5
+	beq	3b
+	b	1f
+	or	6,6,6
+	b	2b
+1:	bdnz	3b
+ALT_FTR_SECTION_END(0, 1)
+	or	1,1,1
+
+globl(end_ftr_fixup_test5)
+
+globl(ftr_fixup_test5_expected)
+	or	1,1,1
+2:	b	3f
+3:	or	5,5,5
+	beq	3b
+	b	1f
+	or	6,6,6
+	b	2b
+1:	bdnz	3b
+	or	1,1,1
+
+globl(ftr_fixup_test6)
+1:	or	1,1,1
+BEGIN_FTR_SECTION
+	or	5,5,5
+2:	PPC_LCMPI	r3,0
+	beq	4f
+	blt	2b
+	b	1b
+	b	4f
+FTR_SECTION_ELSE
+2:	or	2,2,2
+	PPC_LCMPI	r3,1
+	beq	3f
+	blt	2b
+	b	3f
+	b	1b
+ALT_FTR_SECTION_END(0, 1)
+3:	or	1,1,1
+	or	2,2,2
+4:	or	3,3,3
+
+globl(end_ftr_fixup_test6)
+
+globl(ftr_fixup_test6_expected)
+1:	or	1,1,1
+2:	or	2,2,2
+	PPC_LCMPI	r3,1
+	beq	3f
+	blt	2b
+	b	3f
+	b	1b
+2:	or	1,1,1
+	or	2,2,2
+3:	or	3,3,3
+
+
+#if 0
+/* Test that if we have a larger else case the assembler spots it and
+ * reports an error. #if 0'ed so as not to break the build normally.
+ */
+ftr_fixup_test7:
+	or	1,1,1
+BEGIN_FTR_SECTION
+	or	2,2,2
+	or	2,2,2
+	or	2,2,2
+FTR_SECTION_ELSE
+	or	3,3,3
+	or	3,3,3
+	or	3,3,3
+	or	3,3,3
+ALT_FTR_SECTION_END(0, 1)
+	or	1,1,1
+#endif
+
+#define	MAKE_MACRO_TEST(TYPE)						\
+globl(ftr_fixup_test_ ##TYPE##_macros)					\
+	or	1,1,1;							\
+	/* Basic test, this section should all be nop'ed */		\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+END_##TYPE##_SECTION(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Basic test, this section should NOT be nop'ed */		\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+END_##TYPE##_SECTION(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nesting test, inner section should be nop'ed */		\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(80)					\
+	or	3,3,3;							\
+	or	3,3,3;							\
+END_##TYPE##_SECTION_NESTED(0, 1, 80)					\
+	or	2,2,2;							\
+	or	2,2,2;							\
+END_##TYPE##_SECTION(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nesting test, whole section should be nop'ed */		\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(80)					\
+	or	3,3,3;							\
+	or	3,3,3;							\
+END_##TYPE##_SECTION_NESTED(0, 0, 80)					\
+	or	2,2,2;							\
+	or	2,2,2;							\
+END_##TYPE##_SECTION(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nesting test, none should be nop'ed */			\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(80)					\
+	or	3,3,3;							\
+	or	3,3,3;							\
+END_##TYPE##_SECTION_NESTED(0, 0, 80)					\
+	or	2,2,2;							\
+	or	2,2,2;							\
+END_##TYPE##_SECTION(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Basic alt section test, default case should be taken */	\
+BEGIN_##TYPE##_SECTION							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+##TYPE##_SECTION_ELSE							\
+	or	5,5,5;							\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Basic alt section test, else case should be taken */		\
+BEGIN_##TYPE##_SECTION							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+##TYPE##_SECTION_ELSE							\
+	or	31,31,31;						\
+	or	31,31,31;						\
+	or	31,31,31;						\
+ALT_##TYPE##_SECTION_END(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Alt with smaller else case, should be padded with nops */	\
+BEGIN_##TYPE##_SECTION							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+##TYPE##_SECTION_ELSE							\
+	or	31,31,31;						\
+ALT_##TYPE##_SECTION_END(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Alt section with nested section in default case */		\
+	/* Default case should be taken, with nop'ed inner section */	\
+BEGIN_##TYPE##_SECTION							\
+	or	3,3,3;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	3,3,3;							\
+	or	3,3,3;							\
+END_##TYPE##_SECTION_NESTED(0, 1, 95)					\
+	or	3,3,3;							\
+##TYPE##_SECTION_ELSE							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+ALT_##TYPE##_SECTION_END(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Alt section with nested section in else, default taken */	\
+BEGIN_##TYPE##_SECTION							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+##TYPE##_SECTION_ELSE							\
+	or	5,5,5;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	3,3,3;							\
+END_##TYPE##_SECTION_NESTED(0, 1, 95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Alt section with nested section in else, else taken & nop */	\
+BEGIN_##TYPE##_SECTION							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+##TYPE##_SECTION_ELSE							\
+	or	5,5,5;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	3,3,3;							\
+END_##TYPE##_SECTION_NESTED(0, 1, 95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Feature section with nested alt section, default taken */	\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	1,1,1;							\
+##TYPE##_SECTION_ELSE_NESTED(95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95)				\
+	or	2,2,2;							\
+END_##TYPE##_SECTION(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Feature section with nested alt section, else taken */	\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	1,1,1;							\
+##TYPE##_SECTION_ELSE_NESTED(95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95)				\
+	or	2,2,2;							\
+END_##TYPE##_SECTION(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Feature section with nested alt section, all nop'ed */	\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	1,1,1;							\
+##TYPE##_SECTION_ELSE_NESTED(95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95)				\
+	or	2,2,2;							\
+END_##TYPE##_SECTION(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, default with inner default taken */	\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	1,1,1;							\
+##TYPE##_SECTION_ELSE_NESTED(95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95)				\
+	or	2,2,2;							\
+##TYPE##_SECTION_ELSE							\
+	or	31,31,31;						\
+BEGIN_##TYPE##_SECTION_NESTED(94)					\
+	or	5,5,5;							\
+##TYPE##_SECTION_ELSE_NESTED(94)					\
+	or	1,1,1;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94)				\
+	or	31,31,31;						\
+ALT_##TYPE##_SECTION_END(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, default with inner else taken */	\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	1,1,1;							\
+##TYPE##_SECTION_ELSE_NESTED(95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95)				\
+	or	2,2,2;							\
+##TYPE##_SECTION_ELSE							\
+	or	31,31,31;						\
+BEGIN_##TYPE##_SECTION_NESTED(94)					\
+	or	5,5,5;							\
+##TYPE##_SECTION_ELSE_NESTED(94)					\
+	or	1,1,1;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94)				\
+	or	31,31,31;						\
+ALT_##TYPE##_SECTION_END(0, 0)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, else with inner default taken */	\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	1,1,1;							\
+##TYPE##_SECTION_ELSE_NESTED(95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95)				\
+	or	2,2,2;							\
+##TYPE##_SECTION_ELSE							\
+	or	31,31,31;						\
+BEGIN_##TYPE##_SECTION_NESTED(94)					\
+	or	5,5,5;							\
+##TYPE##_SECTION_ELSE_NESTED(94)					\
+	or	1,1,1;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94)				\
+	or	31,31,31;						\
+ALT_##TYPE##_SECTION_END(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, else with inner else taken */		\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+BEGIN_##TYPE##_SECTION_NESTED(95)					\
+	or	1,1,1;							\
+##TYPE##_SECTION_ELSE_NESTED(95)					\
+	or	5,5,5;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95)				\
+	or	2,2,2;							\
+##TYPE##_SECTION_ELSE							\
+	or	31,31,31;						\
+BEGIN_##TYPE##_SECTION_NESTED(94)					\
+	or	5,5,5;							\
+##TYPE##_SECTION_ELSE_NESTED(94)					\
+	or	1,1,1;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94)				\
+	or	31,31,31;						\
+ALT_##TYPE##_SECTION_END(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, else can have large else case */	\
+BEGIN_##TYPE##_SECTION							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+##TYPE##_SECTION_ELSE 							\
+BEGIN_##TYPE##_SECTION_NESTED(94) 					\
+	or	5,5,5;							\
+	or	5,5,5;							\
+	or	5,5,5;							\
+	or	5,5,5;							\
+##TYPE##_SECTION_ELSE_NESTED(94) 					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	or	1,1,1;							\
+ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94)				\
+ALT_##TYPE##_SECTION_END(0, 1)						\
+	or	1,1,1;							\
+	or	1,1,1;
+
+#define	MAKE_MACRO_TEST_EXPECTED(TYPE)					\
+globl(ftr_fixup_test_ ##TYPE##_macros_expected)				\
+	or	1,1,1;							\
+	/* Basic test, this section should all be nop'ed */		\
+/* BEGIN_##TYPE##_SECTION */						\
+	nop;								\
+	nop;								\
+	nop;								\
+/* END_##TYPE##_SECTION(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Basic test, this section should NOT be nop'ed */		\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	2,2,2;							\
+	or	2,2,2;							\
+	or	2,2,2;							\
+/* END_##TYPE##_SECTION(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nesting test, inner section should be nop'ed */		\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	2,2,2;							\
+	or	2,2,2;							\
+/* BEGIN_##TYPE##_SECTION_NESTED(80) */					\
+	nop;								\
+	nop;								\
+/* END_##TYPE##_SECTION_NESTED(0, 1, 80) */				\
+	or	2,2,2;							\
+	or	2,2,2;							\
+/* END_##TYPE##_SECTION(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nesting test, whole section should be nop'ed */		\
+	/* NB. inner section is not nop'ed, but then entire outer is */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	nop;								\
+	nop;								\
+/* BEGIN_##TYPE##_SECTION_NESTED(80) */					\
+	nop;								\
+	nop;								\
+/* END_##TYPE##_SECTION_NESTED(0, 0, 80) */				\
+	nop;								\
+	nop;								\
+/* END_##TYPE##_SECTION(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nesting test, none should be nop'ed */			\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	2,2,2;							\
+	or	2,2,2;							\
+/* BEGIN_##TYPE##_SECTION_NESTED(80) */					\
+	or	3,3,3;							\
+	or	3,3,3;							\
+/* END_##TYPE##_SECTION_NESTED(0, 0, 80) */				\
+	or	2,2,2;							\
+	or	2,2,2;							\
+/* END_##TYPE##_SECTION(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Basic alt section test, default case should be taken */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	3,3,3;							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+/* ##TYPE##_SECTION_ELSE */						\
+	/* or	5,5,5; */						\
+	/* or	5,5,5; */						\
+/* ALT_##TYPE##_SECTION_END(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Basic alt section test, else case should be taken */		\
+/* BEGIN_##TYPE##_SECTION */						\
+	/* or	3,3,3; */						\
+	/* or	3,3,3; */						\
+	/* or	3,3,3; */						\
+/* ##TYPE##_SECTION_ELSE */						\
+	or	31,31,31;						\
+	or	31,31,31;						\
+	or	31,31,31;						\
+/* ALT_##TYPE##_SECTION_END(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Alt with smaller else case, should be padded with nops */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	/* or	3,3,3; */						\
+	/* or	3,3,3; */						\
+	/* or	3,3,3; */						\
+/* ##TYPE##_SECTION_ELSE */						\
+	or	31,31,31;						\
+	nop;								\
+	nop;								\
+/* ALT_##TYPE##_SECTION_END(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Alt section with nested section in default case */		\
+	/* Default case should be taken, with nop'ed inner section */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	3,3,3;							\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	nop;								\
+	nop;								\
+/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */				\
+	or	3,3,3;							\
+/* ##TYPE##_SECTION_ELSE */						\
+	/* or	2,2,2; */						\
+	/* or	2,2,2; */						\
+/* ALT_##TYPE##_SECTION_END(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Alt section with nested section in else, default taken */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	3,3,3;							\
+	or	3,3,3;							\
+	or	3,3,3;							\
+/* ##TYPE##_SECTION_ELSE */						\
+	/* or	5,5,5; */						\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	/* or	3,3,3; */						\
+/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */				\
+	/* or	5,5,5; */						\
+/* ALT_##TYPE##_SECTION_END(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Alt section with nested section in else, else taken & nop */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	/* or	3,3,3; */						\
+	/* or	3,3,3; */						\
+	/* or	3,3,3; */						\
+/* ##TYPE##_SECTION_ELSE */						\
+	or	5,5,5;							\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	nop;								\
+/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */				\
+	or	5,5,5;							\
+/* ALT_##TYPE##_SECTION_END(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Feature section with nested alt section, default taken */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	2,2,2;							\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	or	1,1,1;							\
+/* ##TYPE##_SECTION_ELSE_NESTED(95) */					\
+	/* or	5,5,5; */						\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */				\
+	or	2,2,2;							\
+/* END_##TYPE##_SECTION(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Feature section with nested alt section, else taken */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	2,2,2;							\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	/* or	1,1,1; */						\
+/* ##TYPE##_SECTION_ELSE_NESTED(95) */					\
+	or	5,5,5;							\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */				\
+	or	2,2,2;							\
+/* END_##TYPE##_SECTION(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Feature section with nested alt section, all nop'ed */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	nop;								\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	nop;								\
+/* ##TYPE##_SECTION_ELSE_NESTED(95) */					\
+	/* or	5,5,5; */						\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */				\
+	nop;								\
+/* END_##TYPE##_SECTION(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, default with inner default taken */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	2,2,2;							\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	or	1,1,1;							\
+/* ##TYPE##_SECTION_ELSE_NESTED(95) */					\
+	/* or	5,5,5; */						\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */				\
+	or	2,2,2;							\
+/* ##TYPE##_SECTION_ELSE */						\
+	/* or	31,31,31; */						\
+/* BEGIN_##TYPE##_SECTION_NESTED(94) */					\
+	/* or	5,5,5; */						\
+/* ##TYPE##_SECTION_ELSE_NESTED(94) */					\
+	/* or	1,1,1; */						\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */				\
+	/* or	31,31,31; */						\
+/* ALT_##TYPE##_SECTION_END(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, default with inner else taken */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	or	2,2,2;							\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	/* or	1,1,1; */						\
+/* ##TYPE##_SECTION_ELSE_NESTED(95) */					\
+	or	5,5,5;							\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */				\
+	or	2,2,2;							\
+/* ##TYPE##_SECTION_ELSE */						\
+	/* or	31,31,31; */						\
+/* BEGIN_##TYPE##_SECTION_NESTED(94) */					\
+	/* or	5,5,5; */						\
+/* ##TYPE##_SECTION_ELSE_NESTED(94) */					\
+	/* or	1,1,1; */						\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */				\
+	/* or	31,31,31; */						\
+/* ALT_##TYPE##_SECTION_END(0, 0) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, else with inner default taken */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	/* or	2,2,2; */						\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	/* or	1,1,1; */						\
+/* ##TYPE##_SECTION_ELSE_NESTED(95) */					\
+	/* or	5,5,5; */						\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */				\
+	/* or	2,2,2; */						\
+/* ##TYPE##_SECTION_ELSE */						\
+	or	31,31,31;						\
+/* BEGIN_##TYPE##_SECTION_NESTED(94) */					\
+	or	5,5,5;							\
+/* ##TYPE##_SECTION_ELSE_NESTED(94) */					\
+	/* or	1,1,1; */						\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */				\
+	or	31,31,31;						\
+/* ALT_##TYPE##_SECTION_END(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, else with inner else taken */		\
+/* BEGIN_##TYPE##_SECTION */						\
+	/* or	2,2,2; */						\
+/* BEGIN_##TYPE##_SECTION_NESTED(95) */					\
+	/* or	1,1,1; */						\
+/* ##TYPE##_SECTION_ELSE_NESTED(95) */					\
+	/* or	5,5,5; */						\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */				\
+	/* or	2,2,2; */						\
+/* ##TYPE##_SECTION_ELSE */						\
+	or	31,31,31;						\
+/* BEGIN_##TYPE##_SECTION_NESTED(94) */					\
+	/* or	5,5,5; */						\
+/* ##TYPE##_SECTION_ELSE_NESTED(94) */					\
+	or	1,1,1;							\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) */				\
+	or	31,31,31;						\
+/* ALT_##TYPE##_SECTION_END(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	/* Nested alt sections, else can have large else case */	\
+/* BEGIN_##TYPE##_SECTION */						\
+	/* or	2,2,2; */						\
+	/* or	2,2,2; */						\
+	/* or	2,2,2; */						\
+	/* or	2,2,2; */						\
+/* ##TYPE##_SECTION_ELSE */						\
+/* BEGIN_##TYPE##_SECTION_NESTED(94) */					\
+	/* or	5,5,5; */						\
+	/* or	5,5,5; */						\
+	/* or	5,5,5; */						\
+	/* or	5,5,5; */						\
+/* ##TYPE##_SECTION_ELSE_NESTED(94) */					\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	or	1,1,1;							\
+	or	1,1,1;							\
+/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) */				\
+/* ALT_##TYPE##_SECTION_END(0, 1) */					\
+	or	1,1,1;							\
+	or	1,1,1;
+
+MAKE_MACRO_TEST(FTR);
+MAKE_MACRO_TEST_EXPECTED(FTR);
+
+#ifdef CONFIG_PPC64
+MAKE_MACRO_TEST(FW_FTR);
+MAKE_MACRO_TEST_EXPECTED(FW_FTR);
+#endif
+
+globl(lwsync_fixup_test)
+1:	or	1,1,1
+	LWSYNC
+globl(end_lwsync_fixup_test)
+
+globl(lwsync_fixup_test_expected_LWSYNC)
+1:	or	1,1,1
+	lwsync
+
+globl(lwsync_fixup_test_expected_SYNC)
+1:	or	1,1,1
+	sync
+
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
new file mode 100644
index 00000000000..7a8a7487cee
--- /dev/null
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -0,0 +1,376 @@
+/*
+ *  Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  Modifications for ppc64:
+ *      Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ *
+ *  Copyright 2008 Michael Ellerman, IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <asm/cputable.h>
+#include <asm/code-patching.h>
+#include <asm/page.h>
+#include <asm/sections.h>
+
+
+struct fixup_entry {
+	unsigned long	mask;
+	unsigned long	value;
+	long		start_off;
+	long		end_off;
+	long		alt_start_off;
+	long		alt_end_off;
+};
+
+static unsigned int *calc_addr(struct fixup_entry *fcur, long offset)
+{
+	/*
+	 * We store the offset to the code as a negative offset from
+	 * the start of the alt_entry, to support the VDSO. This
+	 * routine converts that back into an actual address.
+	 */
+	return (unsigned int *)((unsigned long)fcur + offset);
+}
+
+static int patch_alt_instruction(unsigned int *src, unsigned int *dest,
+				 unsigned int *alt_start, unsigned int *alt_end)
+{
+	unsigned int instr;
+
+	instr = *src;
+
+	if (instr_is_relative_branch(*src)) {
+		unsigned int *target = (unsigned int *)branch_target(src);
+
+		/* Branch within the section doesn't need translating */
+		if (target < alt_start || target >= alt_end) {
+			instr = translate_branch(dest, src);
+			if (!instr)
+				return 1;
+		}
+	}
+
+	patch_instruction(dest, instr);
+
+	return 0;
+}
+
+static int patch_feature_section(unsigned long value, struct fixup_entry *fcur)
+{
+	unsigned int *start, *end, *alt_start, *alt_end, *src, *dest;
+
+	start = calc_addr(fcur, fcur->start_off);
+	end = calc_addr(fcur, fcur->end_off);
+	alt_start = calc_addr(fcur, fcur->alt_start_off);
+	alt_end = calc_addr(fcur, fcur->alt_end_off);
+
+	if ((alt_end - alt_start) > (end - start))
+		return 1;
+
+	if ((value & fcur->mask) == fcur->value)
+		return 0;
+
+	src = alt_start;
+	dest = start;
+
+	for (; src < alt_end; src++, dest++) {
+		if (patch_alt_instruction(src, dest, alt_start, alt_end))
+			return 1;
+	}
+
+	for (; dest < end; dest++)
+		patch_instruction(dest, PPC_INST_NOP);
+
+	return 0;
+}
+
+void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
+{
+	struct fixup_entry *fcur, *fend;
+
+	fcur = fixup_start;
+	fend = fixup_end;
+
+	for (; fcur < fend; fcur++) {
+		if (patch_feature_section(value, fcur)) {
+			WARN_ON(1);
+			printk("Unable to patch feature section at %p - %p" \
+				" with %p - %p\n",
+				calc_addr(fcur, fcur->start_off),
+				calc_addr(fcur, fcur->end_off),
+				calc_addr(fcur, fcur->alt_start_off),
+				calc_addr(fcur, fcur->alt_end_off));
+		}
+	}
+}
+
+void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
+{
+	long *start, *end;
+	unsigned int *dest;
+
+	if (!(value & CPU_FTR_LWSYNC))
+		return ;
+
+	start = fixup_start;
+	end = fixup_end;
+
+	for (; start < end; start++) {
+		dest = (void *)start + *start;
+		patch_instruction(dest, PPC_INST_LWSYNC);
+	}
+}
+
+void do_final_fixups(void)
+{
+#if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE)
+	int *src, *dest;
+	unsigned long length;
+
+	if (PHYSICAL_START == 0)
+		return;
+
+	src = (int *)(KERNELBASE + PHYSICAL_START);
+	dest = (int *)KERNELBASE;
+	length = (__end_interrupts - _stext) / sizeof(int);
+
+	while (length--) {
+		patch_instruction(dest, *src);
+		src++;
+		dest++;
+	}
+#endif
+}
+
+#ifdef CONFIG_FTR_FIXUP_SELFTEST
+
+#define check(x)	\
+	if (!(x)) printk("feature-fixups: test failed at line %d\n", __LINE__);
+
+/* This must be after the text it fixes up, vmlinux.lds.S enforces that atm */
+static struct fixup_entry fixup;
+
+static long calc_offset(struct fixup_entry *entry, unsigned int *p)
+{
+	return (unsigned long)p - (unsigned long)entry;
+}
+
+void test_basic_patching(void)
+{
+	extern unsigned int ftr_fixup_test1;
+	extern unsigned int end_ftr_fixup_test1;
+	extern unsigned int ftr_fixup_test1_orig;
+	extern unsigned int ftr_fixup_test1_expected;
+	int size = &end_ftr_fixup_test1 - &ftr_fixup_test1;
+
+	fixup.value = fixup.mask = 8;
+	fixup.start_off = calc_offset(&fixup, &ftr_fixup_test1 + 1);
+	fixup.end_off = calc_offset(&fixup, &ftr_fixup_test1 + 2);
+	fixup.alt_start_off = fixup.alt_end_off = 0;
+
+	/* Sanity check */
+	check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0);
+
+	/* Check we don't patch if the value matches */
+	patch_feature_section(8, &fixup);
+	check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0);
+
+	/* Check we do patch if the value doesn't match */
+	patch_feature_section(0, &fixup);
+	check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_expected, size) == 0);
+
+	/* Check we do patch if the mask doesn't match */
+	memcpy(&ftr_fixup_test1, &ftr_fixup_test1_orig, size);
+	check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0);
+	patch_feature_section(~8, &fixup);
+	check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_expected, size) == 0);
+}
+
+static void test_alternative_patching(void)
+{
+	extern unsigned int ftr_fixup_test2;
+	extern unsigned int end_ftr_fixup_test2;
+	extern unsigned int ftr_fixup_test2_orig;
+	extern unsigned int ftr_fixup_test2_alt;
+	extern unsigned int ftr_fixup_test2_expected;
+	int size = &end_ftr_fixup_test2 - &ftr_fixup_test2;
+
+	fixup.value = fixup.mask = 0xF;
+	fixup.start_off = calc_offset(&fixup, &ftr_fixup_test2 + 1);
+	fixup.end_off = calc_offset(&fixup, &ftr_fixup_test2 + 2);
+	fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test2_alt);
+	fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test2_alt + 1);
+
+	/* Sanity check */
+	check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0);
+
+	/* Check we don't patch if the value matches */
+	patch_feature_section(0xF, &fixup);
+	check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0);
+
+	/* Check we do patch if the value doesn't match */
+	patch_feature_section(0, &fixup);
+	check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_expected, size) == 0);
+
+	/* Check we do patch if the mask doesn't match */
+	memcpy(&ftr_fixup_test2, &ftr_fixup_test2_orig, size);
+	check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0);
+	patch_feature_section(~0xF, &fixup);
+	check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_expected, size) == 0);
+}
+
+static void test_alternative_case_too_big(void)
+{
+	extern unsigned int ftr_fixup_test3;
+	extern unsigned int end_ftr_fixup_test3;
+	extern unsigned int ftr_fixup_test3_orig;
+	extern unsigned int ftr_fixup_test3_alt;
+	int size = &end_ftr_fixup_test3 - &ftr_fixup_test3;
+
+	fixup.value = fixup.mask = 0xC;
+	fixup.start_off = calc_offset(&fixup, &ftr_fixup_test3 + 1);
+	fixup.end_off = calc_offset(&fixup, &ftr_fixup_test3 + 2);
+	fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test3_alt);
+	fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test3_alt + 2);
+
+	/* Sanity check */
+	check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0);
+
+	/* Expect nothing to be patched, and the error returned to us */
+	check(patch_feature_section(0xF, &fixup) == 1);
+	check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0);
+	check(patch_feature_section(0, &fixup) == 1);
+	check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0);
+	check(patch_feature_section(~0xF, &fixup) == 1);
+	check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0);
+}
+
+static void test_alternative_case_too_small(void)
+{
+	extern unsigned int ftr_fixup_test4;
+	extern unsigned int end_ftr_fixup_test4;
+	extern unsigned int ftr_fixup_test4_orig;
+	extern unsigned int ftr_fixup_test4_alt;
+	extern unsigned int ftr_fixup_test4_expected;
+	int size = &end_ftr_fixup_test4 - &ftr_fixup_test4;
+	unsigned long flag;
+
+	/* Check a high-bit flag */
+	flag = 1UL << ((sizeof(unsigned long) - 1) * 8);
+	fixup.value = fixup.mask = flag;
+	fixup.start_off = calc_offset(&fixup, &ftr_fixup_test4 + 1);
+	fixup.end_off = calc_offset(&fixup, &ftr_fixup_test4 + 5);
+	fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test4_alt);
+	fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test4_alt + 2);
+
+	/* Sanity check */
+	check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0);
+
+	/* Check we don't patch if the value matches */
+	patch_feature_section(flag, &fixup);
+	check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0);
+
+	/* Check we do patch if the value doesn't match */
+	patch_feature_section(0, &fixup);
+	check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_expected, size) == 0);
+
+	/* Check we do patch if the mask doesn't match */
+	memcpy(&ftr_fixup_test4, &ftr_fixup_test4_orig, size);
+	check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0);
+	patch_feature_section(~flag, &fixup);
+	check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_expected, size) == 0);
+}
+
+static void test_alternative_case_with_branch(void)
+{
+	extern unsigned int ftr_fixup_test5;
+	extern unsigned int end_ftr_fixup_test5;
+	extern unsigned int ftr_fixup_test5_expected;
+	int size = &end_ftr_fixup_test5 - &ftr_fixup_test5;
+
+	check(memcmp(&ftr_fixup_test5, &ftr_fixup_test5_expected, size) == 0);
+}
+
+static void test_alternative_case_with_external_branch(void)
+{
+	extern unsigned int ftr_fixup_test6;
+	extern unsigned int end_ftr_fixup_test6;
+	extern unsigned int ftr_fixup_test6_expected;
+	int size = &end_ftr_fixup_test6 - &ftr_fixup_test6;
+
+	check(memcmp(&ftr_fixup_test6, &ftr_fixup_test6_expected, size) == 0);
+}
+
+static void test_cpu_macros(void)
+{
+	extern u8 ftr_fixup_test_FTR_macros;
+	extern u8 ftr_fixup_test_FTR_macros_expected;
+	unsigned long size = &ftr_fixup_test_FTR_macros_expected -
+			     &ftr_fixup_test_FTR_macros;
+
+	/* The fixups have already been done for us during boot */
+	check(memcmp(&ftr_fixup_test_FTR_macros,
+		     &ftr_fixup_test_FTR_macros_expected, size) == 0);
+}
+
+static void test_fw_macros(void)
+{
+#ifdef CONFIG_PPC64
+	extern u8 ftr_fixup_test_FW_FTR_macros;
+	extern u8 ftr_fixup_test_FW_FTR_macros_expected;
+	unsigned long size = &ftr_fixup_test_FW_FTR_macros_expected -
+			     &ftr_fixup_test_FW_FTR_macros;
+
+	/* The fixups have already been done for us during boot */
+	check(memcmp(&ftr_fixup_test_FW_FTR_macros,
+		     &ftr_fixup_test_FW_FTR_macros_expected, size) == 0);
+#endif
+}
+
+static void test_lwsync_macros(void)
+{
+	extern u8 lwsync_fixup_test;
+	extern u8 end_lwsync_fixup_test;
+	extern u8 lwsync_fixup_test_expected_LWSYNC;
+	extern u8 lwsync_fixup_test_expected_SYNC;
+	unsigned long size = &end_lwsync_fixup_test -
+			     &lwsync_fixup_test;
+
+	/* The fixups have already been done for us during boot */
+	if (cur_cpu_spec->cpu_features & CPU_FTR_LWSYNC) {
+		check(memcmp(&lwsync_fixup_test,
+			     &lwsync_fixup_test_expected_LWSYNC, size) == 0);
+	} else {
+		check(memcmp(&lwsync_fixup_test,
+			     &lwsync_fixup_test_expected_SYNC, size) == 0);
+	}
+}
+
+static int __init test_feature_fixups(void)
+{
+	printk(KERN_DEBUG "Running feature fixup self-tests ...\n");
+
+	test_basic_patching();
+	test_alternative_patching();
+	test_alternative_case_too_big();
+	test_alternative_case_too_small();
+	test_alternative_case_with_branch();
+	test_alternative_case_with_external_branch();
+	test_cpu_macros();
+	test_fw_macros();
+	test_lwsync_macros();
+
+	return 0;
+}
+late_initcall(test_feature_fixups);
+
+#endif /* CONFIG_FTR_FIXUP_SELFTEST */
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
new file mode 100644
index 00000000000..19e66001a4f
--- /dev/null
+++ b/arch/powerpc/lib/hweight_64.S
@@ -0,0 +1,110 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2010
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+
+/* Note: This code relies on -mminimal-toc */
+
+_GLOBAL(__arch_hweight8)
+BEGIN_FTR_SECTION
+	b __sw_hweight8
+	nop
+	nop
+FTR_SECTION_ELSE
+	PPC_POPCNTB(R3,R3)
+	clrldi	r3,r3,64-8
+	blr
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight16)
+BEGIN_FTR_SECTION
+	b __sw_hweight16
+	nop
+	nop
+	nop
+	nop
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(50)
+	PPC_POPCNTB(R3,R3)
+	srdi	r4,r3,8
+	add	r3,r4,r3
+	clrldi	r3,r3,64-8
+	blr
+  FTR_SECTION_ELSE_NESTED(50)
+	clrlwi  r3,r3,16
+	PPC_POPCNTW(R3,R3)
+	clrldi	r3,r3,64-8
+	blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight32)
+BEGIN_FTR_SECTION
+	b __sw_hweight32
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(51)
+	PPC_POPCNTB(R3,R3)
+	srdi	r4,r3,16
+	add	r3,r4,r3
+	srdi	r4,r3,8
+	add	r3,r4,r3
+	clrldi	r3,r3,64-8
+	blr
+  FTR_SECTION_ELSE_NESTED(51)
+	PPC_POPCNTW(R3,R3)
+	clrldi	r3,r3,64-8
+	blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
+
+_GLOBAL(__arch_hweight64)
+BEGIN_FTR_SECTION
+	b __sw_hweight64
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(52)
+	PPC_POPCNTB(R3,R3)
+	srdi	r4,r3,32
+	add	r3,r4,r3
+	srdi	r4,r3,16
+	add	r3,r4,r3
+	srdi	r4,r3,8
+	add	r3,r4,r3
+	clrldi	r3,r3,64-8
+	blr
+  FTR_SECTION_ELSE_NESTED(52)
+	PPC_POPCNTD(R3,R3)
+	clrldi	r3,r3,64-8
+	blr
+  ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
new file mode 100644
index 00000000000..85aec08ab23
--- /dev/null
+++ b/arch/powerpc/lib/ldstfp.S
@@ -0,0 +1,379 @@
+/*
+ * Floating-point, VMX/Altivec and VSX loads and stores
+ * for use in instruction emulation.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+#include <asm/reg.h>
+#include <asm/asm-offsets.h>
+#include <linux/errno.h>
+
+#ifdef CONFIG_PPC_FPU
+
+#define STKFRM	(PPC_MIN_STKFRM + 16)
+
+	.macro	extab	instr,handler
+	.section __ex_table,"a"
+	PPC_LONG \instr,\handler
+	.previous
+	.endm
+
+	.macro	inst32	op
+reg = 0
+	.rept	32
+20:	\op	reg,0,r4
+	b	3f
+	extab	20b,99f
+reg = reg + 1
+	.endr
+	.endm
+
+/* Get the contents of frN into fr0; N is in r3. */
+_GLOBAL(get_fpr)
+	mflr	r0
+	rlwinm	r3,r3,3,0xf8
+	bcl	20,31,1f
+	blr			/* fr0 is already in fr0 */
+	nop
+reg = 1
+	.rept	31
+	fmr	fr0,reg
+	blr
+reg = reg + 1
+	.endr
+1:	mflr	r5
+	add	r5,r3,r5
+	mtctr	r5
+	mtlr	r0
+	bctr
+
+/* Put the contents of fr0 into frN; N is in r3. */
+_GLOBAL(put_fpr)
+	mflr	r0
+	rlwinm	r3,r3,3,0xf8
+	bcl	20,31,1f
+	blr			/* fr0 is already in fr0 */
+	nop
+reg = 1
+	.rept	31
+	fmr	reg,fr0
+	blr
+reg = reg + 1
+	.endr
+1:	mflr	r5
+	add	r5,r3,r5
+	mtctr	r5
+	mtlr	r0
+	bctr
+
+/* Load FP reg N from float at *p.  N is in r3, p in r4. */
+_GLOBAL(do_lfs)
+	PPC_STLU r1,-STKFRM(r1)
+	mflr	r0
+	PPC_STL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mfmsr	r6
+	ori	r7,r6,MSR_FP
+	cmpwi	cr7,r3,0
+	MTMSRD(r7)
+	isync
+	beq	cr7,1f
+	stfd	fr0,STKFRM-16(r1)
+1:	li	r9,-EFAULT
+2:	lfs	fr0,0(r4)
+	li	r9,0
+3:	bl	put_fpr
+	beq	cr7,4f
+	lfd	fr0,STKFRM-16(r1)
+4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mtlr	r0
+	MTMSRD(r6)
+	isync
+	mr	r3,r9
+	addi	r1,r1,STKFRM
+	blr
+	extab	2b,3b
+
+/* Load FP reg N from double at *p.  N is in r3, p in r4. */
+_GLOBAL(do_lfd)
+	PPC_STLU r1,-STKFRM(r1)
+	mflr	r0
+	PPC_STL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mfmsr	r6
+	ori	r7,r6,MSR_FP
+	cmpwi	cr7,r3,0
+	MTMSRD(r7)
+	isync
+	beq	cr7,1f
+	stfd	fr0,STKFRM-16(r1)
+1:	li	r9,-EFAULT
+2:	lfd	fr0,0(r4)
+	li	r9,0
+3:	beq	cr7,4f
+	bl	put_fpr
+	lfd	fr0,STKFRM-16(r1)
+4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mtlr	r0
+	MTMSRD(r6)
+	isync
+	mr	r3,r9
+	addi	r1,r1,STKFRM
+	blr
+	extab	2b,3b
+
+/* Store FP reg N to float at *p.  N is in r3, p in r4. */
+_GLOBAL(do_stfs)
+	PPC_STLU r1,-STKFRM(r1)
+	mflr	r0
+	PPC_STL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mfmsr	r6
+	ori	r7,r6,MSR_FP
+	cmpwi	cr7,r3,0
+	MTMSRD(r7)
+	isync
+	beq	cr7,1f
+	stfd	fr0,STKFRM-16(r1)
+	bl	get_fpr
+1:	li	r9,-EFAULT
+2:	stfs	fr0,0(r4)
+	li	r9,0
+3:	beq	cr7,4f
+	lfd	fr0,STKFRM-16(r1)
+4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mtlr	r0
+	MTMSRD(r6)
+	isync
+	mr	r3,r9
+	addi	r1,r1,STKFRM
+	blr
+	extab	2b,3b
+
+/* Store FP reg N to double at *p.  N is in r3, p in r4. */
+_GLOBAL(do_stfd)
+	PPC_STLU r1,-STKFRM(r1)
+	mflr	r0
+	PPC_STL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mfmsr	r6
+	ori	r7,r6,MSR_FP
+	cmpwi	cr7,r3,0
+	MTMSRD(r7)
+	isync
+	beq	cr7,1f
+	stfd	fr0,STKFRM-16(r1)
+	bl	get_fpr
+1:	li	r9,-EFAULT
+2:	stfd	fr0,0(r4)
+	li	r9,0
+3:	beq	cr7,4f
+	lfd	fr0,STKFRM-16(r1)
+4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mtlr	r0
+	MTMSRD(r6)
+	isync
+	mr	r3,r9
+	addi	r1,r1,STKFRM
+	blr
+	extab	2b,3b
+
+#ifdef CONFIG_ALTIVEC
+/* Get the contents of vrN into vr0; N is in r3. */
+_GLOBAL(get_vr)
+	mflr	r0
+	rlwinm	r3,r3,3,0xf8
+	bcl	20,31,1f
+	blr			/* vr0 is already in vr0 */
+	nop
+reg = 1
+	.rept	31
+	vor	vr0,reg,reg	/* assembler doesn't know vmr? */
+	blr
+reg = reg + 1
+	.endr
+1:	mflr	r5
+	add	r5,r3,r5
+	mtctr	r5
+	mtlr	r0
+	bctr
+
+/* Put the contents of vr0 into vrN; N is in r3. */
+_GLOBAL(put_vr)
+	mflr	r0
+	rlwinm	r3,r3,3,0xf8
+	bcl	20,31,1f
+	blr			/* vr0 is already in vr0 */
+	nop
+reg = 1
+	.rept	31
+	vor	reg,vr0,vr0
+	blr
+reg = reg + 1
+	.endr
+1:	mflr	r5
+	add	r5,r3,r5
+	mtctr	r5
+	mtlr	r0
+	bctr
+
+/* Load vector reg N from *p.  N is in r3, p in r4. */
+_GLOBAL(do_lvx)
+	PPC_STLU r1,-STKFRM(r1)
+	mflr	r0
+	PPC_STL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mfmsr	r6
+	oris	r7,r6,MSR_VEC@h
+	cmpwi	cr7,r3,0
+	li	r8,STKFRM-16
+	MTMSRD(r7)
+	isync
+	beq	cr7,1f
+	stvx	vr0,r1,r8
+1:	li	r9,-EFAULT
+2:	lvx	vr0,0,r4
+	li	r9,0
+3:	beq	cr7,4f
+	bl	put_vr
+	lvx	vr0,r1,r8
+4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mtlr	r0
+	MTMSRD(r6)
+	isync
+	mr	r3,r9
+	addi	r1,r1,STKFRM
+	blr
+	extab	2b,3b
+
+/* Store vector reg N to *p.  N is in r3, p in r4. */
+_GLOBAL(do_stvx)
+	PPC_STLU r1,-STKFRM(r1)
+	mflr	r0
+	PPC_STL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mfmsr	r6
+	oris	r7,r6,MSR_VEC@h
+	cmpwi	cr7,r3,0
+	li	r8,STKFRM-16
+	MTMSRD(r7)
+	isync
+	beq	cr7,1f
+	stvx	vr0,r1,r8
+	bl	get_vr
+1:	li	r9,-EFAULT
+2:	stvx	vr0,0,r4
+	li	r9,0
+3:	beq	cr7,4f
+	lvx	vr0,r1,r8
+4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mtlr	r0
+	MTMSRD(r6)
+	isync
+	mr	r3,r9
+	addi	r1,r1,STKFRM
+	blr
+	extab	2b,3b
+#endif /* CONFIG_ALTIVEC */
+
+#ifdef CONFIG_VSX
+/* Get the contents of vsrN into vsr0; N is in r3. */
+_GLOBAL(get_vsr)
+	mflr	r0
+	rlwinm	r3,r3,3,0x1f8
+	bcl	20,31,1f
+	blr			/* vsr0 is already in vsr0 */
+	nop
+reg = 1
+	.rept	63
+	XXLOR(0,reg,reg)
+	blr
+reg = reg + 1
+	.endr
+1:	mflr	r5
+	add	r5,r3,r5
+	mtctr	r5
+	mtlr	r0
+	bctr
+
+/* Put the contents of vsr0 into vsrN; N is in r3. */
+_GLOBAL(put_vsr)
+	mflr	r0
+	rlwinm	r3,r3,3,0x1f8
+	bcl	20,31,1f
+	blr			/* vr0 is already in vr0 */
+	nop
+reg = 1
+	.rept	63
+	XXLOR(reg,0,0)
+	blr
+reg = reg + 1
+	.endr
+1:	mflr	r5
+	add	r5,r3,r5
+	mtctr	r5
+	mtlr	r0
+	bctr
+
+/* Load VSX reg N from vector doubleword *p.  N is in r3, p in r4. */
+_GLOBAL(do_lxvd2x)
+	PPC_STLU r1,-STKFRM(r1)
+	mflr	r0
+	PPC_STL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mfmsr	r6
+	oris	r7,r6,MSR_VSX@h
+	cmpwi	cr7,r3,0
+	li	r8,STKFRM-16
+	MTMSRD(r7)
+	isync
+	beq	cr7,1f
+	STXVD2X(0,R1,R8)
+1:	li	r9,-EFAULT
+2:	LXVD2X(0,R0,R4)
+	li	r9,0
+3:	beq	cr7,4f
+	bl	put_vsr
+	LXVD2X(0,R1,R8)
+4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mtlr	r0
+	MTMSRD(r6)
+	isync
+	mr	r3,r9
+	addi	r1,r1,STKFRM
+	blr
+	extab	2b,3b
+
+/* Store VSX reg N to vector doubleword *p.  N is in r3, p in r4. */
+_GLOBAL(do_stxvd2x)
+	PPC_STLU r1,-STKFRM(r1)
+	mflr	r0
+	PPC_STL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mfmsr	r6
+	oris	r7,r6,MSR_VSX@h
+	cmpwi	cr7,r3,0
+	li	r8,STKFRM-16
+	MTMSRD(r7)
+	isync
+	beq	cr7,1f
+	STXVD2X(0,R1,R8)
+	bl	get_vsr
+1:	li	r9,-EFAULT
+2:	STXVD2X(0,R0,R4)
+	li	r9,0
+3:	beq	cr7,4f
+	LXVD2X(0,R1,R8)
+4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
+	mtlr	r0
+	MTMSRD(r6)
+	isync
+	mr	r3,r9
+	addi	r1,r1,STKFRM
+	blr
+	extab	2b,3b
+
+#endif /* CONFIG_VSX */
+
+#endif	/* CONFIG_PPC_FPU */
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 79d0fa3a470..0c9c8d7d073 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -14,18 +14,16 @@
 
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/stringify.h>
 #include <linux/smp.h>
 
 /* waiting for a spinlock... */
-#if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)
+#if defined(CONFIG_PPC_SPLPAR)
 #include <asm/hvcall.h>
-#include <asm/iseries/hv_call.h>
 #include <asm/smp.h>
-#include <asm/firmware.h>
 
-void __spin_yield(raw_spinlock_t *lock)
+void __spin_yield(arch_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
 
@@ -34,20 +32,14 @@ void __spin_yield(raw_spinlock_t *lock)
 		return;
 	holder_cpu = lock_value & 0xffff;
 	BUG_ON(holder_cpu >= NR_CPUS);
-	yield_count = lppaca[holder_cpu].yield_count;
+	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
 	if (lock->slock != lock_value)
 		return;		/* something has changed */
-	if (firmware_has_feature(FW_FEATURE_ISERIES))
-		HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc,
-			((u64)holder_cpu << 32) | yield_count);
-#ifdef CONFIG_PPC_SPLPAR
-	else
-		plpar_hcall_norets(H_CONFER,
-			get_hard_smp_processor_id(holder_cpu), yield_count);
-#endif
+	plpar_hcall_norets(H_CONFER,
+		get_hard_smp_processor_id(holder_cpu), yield_count);
 }
 
 /*
@@ -55,7 +47,7 @@ void __spin_yield(raw_spinlock_t *lock)
  * This turns out to be the same for read and write locks, since
  * we only know the holder if it is write-locked.
  */
-void __rw_yield(raw_rwlock_t *rw)
+void __rw_yield(arch_rwlock_t *rw)
 {
 	int lock_value;
 	unsigned int holder_cpu, yield_count;
@@ -65,24 +57,18 @@ void __rw_yield(raw_rwlock_t *rw)
 		return;		/* no write lock at present */
 	holder_cpu = lock_value & 0xffff;
 	BUG_ON(holder_cpu >= NR_CPUS);
-	yield_count = lppaca[holder_cpu].yield_count;
+	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
 	if (rw->lock != lock_value)
 		return;		/* something has changed */
-	if (firmware_has_feature(FW_FEATURE_ISERIES))
-		HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc,
-			((u64)holder_cpu << 32) | yield_count);
-#ifdef CONFIG_PPC_SPLPAR
-	else
-		plpar_hcall_norets(H_CONFER,
-			get_hard_smp_processor_id(holder_cpu), yield_count);
-#endif
+	plpar_hcall_norets(H_CONFER,
+		get_hard_smp_processor_id(holder_cpu), yield_count);
 }
 #endif
 
-void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
 	while (lock->slock) {
 		HMT_low();
@@ -92,4 +78,4 @@ void __raw_spin_unlock_wait(raw_spinlock_t *lock)
 	HMT_medium();
 }
 
-EXPORT_SYMBOL(__raw_spin_unlock_wait);
+EXPORT_SYMBOL(arch_spin_unlock_wait);
diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S
index 11ce045e21f..43435c6892f 100644
--- a/arch/powerpc/lib/mem_64.S
+++ b/arch/powerpc/lib/mem_64.S
@@ -19,7 +19,7 @@ _GLOBAL(memset)
 	rlwimi	r4,r4,16,0,15
 	cmplw	cr1,r5,r0		/* do we get that far? */
 	rldimi	r4,r4,32,0
-	PPC_MTOCRF	1,r0
+	PPC_MTOCRF(1,r0)
 	mr	r6,r3
 	blt	cr1,8f
 	beq+	3f			/* if already 8-byte aligned */
@@ -49,7 +49,7 @@ _GLOBAL(memset)
 	bdnz	4b
 5:	srwi.	r0,r5,3
 	clrlwi	r5,r5,29
-	PPC_MTOCRF	1,r0
+	PPC_MTOCRF(1,r0)
 	beq	8f
 	bf	29,6f
 	std	r4,0(r6)
@@ -65,7 +65,7 @@ _GLOBAL(memset)
 	std	r4,0(r6)
 	addi	r6,r6,8
 8:	cmpwi	r5,0
-	PPC_MTOCRF	1,r5
+	PPC_MTOCRF(1,r5)
 	beqlr+
 	bf	29,9f
 	stw	r4,0(r6)
@@ -77,10 +77,10 @@ _GLOBAL(memset)
 	stb	r4,0(r6)
 	blr
 
-_GLOBAL(memmove)
+_GLOBAL_TOC(memmove)
 	cmplw	0,r3,r4
-	bgt	.backwards_memcpy
-	b	.memcpy
+	bgt	backwards_memcpy
+	b	memcpy
 
 _GLOBAL(backwards_memcpy)
 	rlwinm.	r7,r5,32-3,3,31		/* r0 = r5 >> 3 */
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 3f131129d1c..32a06ec395d 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -10,19 +10,52 @@
 #include <asm/ppc_asm.h>
 
 	.align	7
-_GLOBAL(memcpy)
-	std	r3,48(r1)	/* save destination pointer for return value */
-	PPC_MTOCRF	0x01,r5
+_GLOBAL_TOC(memcpy)
+BEGIN_FTR_SECTION
+#ifdef __LITTLE_ENDIAN__
+	cmpdi	cr7,r5,0
+#else
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */
+#endif
+FTR_SECTION_ELSE
+#ifndef SELFTEST
+	b	memcpy_power7
+#endif
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
+#ifdef __LITTLE_ENDIAN__
+	/* dumb little-endian memcpy that will get replaced at runtime */
+	addi r9,r3,-1
+	addi r4,r4,-1
+	beqlr cr7
+	mtctr r5
+1:	lbzu r10,1(r4)
+	stbu r10,1(r9)
+	bdnz 1b
+	blr
+#else
+	PPC_MTOCRF(0x01,r5)
 	cmpldi	cr1,r5,16
 	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
 	andi.	r6,r6,7
 	dcbt	0,r4
 	blt	cr1,.Lshort_copy
+/* Below we want to nop out the bne if we're on a CPU that has the
+   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
+   cleared.
+   At the time of writing the only CPU that has this combination of bits
+   set is Power6. */
+BEGIN_FTR_SECTION
+	nop
+FTR_SECTION_ELSE
 	bne	.Ldst_unaligned
+ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
+                    CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
-	andi.	r0,r4,7
 	addi	r3,r3,-16
+BEGIN_FTR_SECTION
+	andi.	r0,r4,7
 	bne	.Lsrc_unaligned
+END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	srdi	r7,r5,4
 	ld	r9,0(r4)
 	addi	r4,r4,-8
@@ -41,20 +74,21 @@ _GLOBAL(memcpy)
 3:	std	r8,8(r3)
 	beq	3f
 	addi	r3,r3,16
-	ld	r9,8(r4)
 .Ldo_tail:
 	bf	cr7*4+1,1f
-	rotldi	r9,r9,32
+	lwz	r9,8(r4)
+	addi	r4,r4,4
 	stw	r9,0(r3)
 	addi	r3,r3,4
 1:	bf	cr7*4+2,2f
-	rotldi	r9,r9,16
+	lhz	r9,8(r4)
+	addi	r4,r4,2
 	sth	r9,0(r3)
 	addi	r3,r3,2
 2:	bf	cr7*4+3,3f
-	rotldi	r9,r9,8
+	lbz	r9,8(r4)
 	stb	r9,0(r3)
-3:	ld	r3,48(r1)	/* return dest pointer */
+3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
 	blr
 
 .Lsrc_unaligned:
@@ -121,17 +155,30 @@ _GLOBAL(memcpy)
 	cmpwi	cr1,r5,8
 	addi	r3,r3,32
 	sld	r9,r9,r10
-	ble	cr1,.Ldo_tail
+	ble	cr1,6f
 	ld	r0,8(r4)
 	srd	r7,r0,r11
 	or	r9,r7,r9
-	b	.Ldo_tail
+6:
+	bf	cr7*4+1,1f
+	rotldi	r9,r9,32
+	stw	r9,0(r3)
+	addi	r3,r3,4
+1:	bf	cr7*4+2,2f
+	rotldi	r9,r9,16
+	sth	r9,0(r3)
+	addi	r3,r3,2
+2:	bf	cr7*4+3,3f
+	rotldi	r9,r9,8
+	stb	r9,0(r3)
+3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
+	blr
 
 .Ldst_unaligned:
-	PPC_MTOCRF	0x01,r6		# put #bytes to 8B bdry into cr7
+	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
 	subf	r5,r6,r5
 	li	r7,0
-	cmpldi	r1,r5,16
+	cmpldi	cr1,r5,16
 	bf	cr7*4+3,1f
 	lbz	r0,0(r4)
 	stb	r0,0(r3)
@@ -143,7 +190,7 @@ _GLOBAL(memcpy)
 2:	bf	cr7*4+1,3f
 	lwzx	r0,r7,r4
 	stwx	r0,r7,r3
-3:	PPC_MTOCRF	0x01,r5
+3:	PPC_MTOCRF(0x01,r5)
 	add	r4,r6,r4
 	add	r3,r6,r3
 	b	.Ldst_aligned
@@ -169,5 +216,6 @@ _GLOBAL(memcpy)
 3:	bf	cr7*4+3,4f
 	lbz	r0,0(r4)
 	stb	r0,0(r3)
-4:	ld	r3,48(r1)	/* return dest pointer */
+4:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
 	blr
+#endif
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
new file mode 100644
index 00000000000..2ff5c142f87
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -0,0 +1,656 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+
+_GLOBAL(memcpy_power7)
+
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	cmpldi	r5,16
+	cmpldi	cr1,r5,4096
+
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+
+	blt	.Lshort_copy
+	bgt	cr1,.Lvmx_copy
+#else
+	cmpldi	r5,16
+
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+
+	blt	.Lshort_copy
+#endif
+
+.Lnonvmx_copy:
+	/* Get the source 8B aligned */
+	neg	r6,r4
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r5,r5,r6
+	cmpldi	r5,128
+	blt	5f
+
+	mflr	r0
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+	std	r21,STK_REG(R21)(r1)
+	std	r22,STK_REG(R22)(r1)
+	std	r0,STACKFRAMESIZE+16(r1)
+
+	srdi	r6,r5,7
+	mtctr	r6
+
+	/* Now do cacheline (128B) sized loads and stores. */
+	.align	5
+4:
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	ld	r14,64(r4)
+	ld	r15,72(r4)
+	ld	r16,80(r4)
+	ld	r17,88(r4)
+	ld	r18,96(r4)
+	ld	r19,104(r4)
+	ld	r20,112(r4)
+	ld	r21,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	std	r14,64(r3)
+	std	r15,72(r3)
+	std	r16,80(r3)
+	std	r17,88(r3)
+	std	r18,96(r3)
+	std	r19,104(r3)
+	std	r20,112(r3)
+	std	r21,120(r3)
+	addi	r3,r3,128
+	bdnz	4b
+
+	clrldi	r5,r5,(64-7)
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r22,STK_REG(R22)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	/* Up to 127B to go */
+5:	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+6:	bf	cr7*4+1,7f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	addi	r4,r4,64
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	addi	r3,r3,64
+
+	/* Up to 63B to go */
+7:	bf	cr7*4+2,8f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	addi	r4,r4,32
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	addi	r3,r3,32
+
+	/* Up to 31B to go */
+8:	bf	cr7*4+3,9f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	addi	r4,r4,16
+	std	r0,0(r3)
+	std	r6,8(r3)
+	addi	r3,r3,16
+
+9:	clrldi	r5,r5,(64-4)
+
+	/* Up to 15B to go */
+.Lshort_copy:
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	blr
+
+.Lunwind_stack_nonvmx_copy:
+	addi	r1,r1,STACKFRAMESIZE
+	b	.Lnonvmx_copy
+
+#ifdef CONFIG_ALTIVEC
+.Lvmx_copy:
+	mflr	r0
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	enter_vmx_copy
+	cmpwi	cr1,r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STK_REG(R31)(r1)
+	ld	r4,STK_REG(R30)(r1)
+	ld	r5,STK_REG(R29)(r1)
+	mtlr	r0
+
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	r7,0x3FF
+	ble	1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+	beq	cr1,.Lunwind_stack_nonvmx_copy
+
+	/*
+	 * If source and destination are not relatively aligned we use a
+	 * slower permute loop.
+	 */
+	xor	r6,r4,r3
+	rldicl.	r6,r6,0,(64-4)
+	bne	.Lvmx_unaligned_copy
+
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the desination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+	lvx	vr7,r0,r4
+	lvx	vr6,r4,r9
+	lvx	vr5,r4,r10
+	lvx	vr4,r4,r11
+	lvx	vr3,r4,r12
+	lvx	vr2,r4,r14
+	lvx	vr1,r4,r15
+	lvx	vr0,r4,r16
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r9
+	stvx	vr5,r3,r10
+	stvx	vr4,r3,r11
+	stvx	vr3,r3,r12
+	stvx	vr2,r3,r14
+	stvx	vr1,r3,r15
+	stvx	vr0,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	b	exit_vmx_copy		/* tail call optimise */
+
+.Lvmx_unaligned_copy:
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r7,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r7,4(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the desination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	LVS(vr16,0,r4)		/* Setup permute control vector */
+	lvx	vr0,0,r4
+	addi	r4,r4,16
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	VPERM(vr8,vr0,vr1,vr16)
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+	vor	vr0,vr1,vr1
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	VPERM(vr8,vr0,vr1,vr16)
+	lvx	vr0,r4,r9
+	VPERM(vr9,vr1,vr0,vr16)
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	VPERM(vr8,vr0,vr3,vr16)
+	lvx	vr2,r4,r9
+	VPERM(vr9,vr3,vr2,vr16)
+	lvx	vr1,r4,r10
+	VPERM(vr10,vr2,vr1,vr16)
+	lvx	vr0,r4,r11
+	VPERM(vr11,vr1,vr0,vr16)
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+	lvx	vr7,r0,r4
+	VPERM(vr8,vr0,vr7,vr16)
+	lvx	vr6,r4,r9
+	VPERM(vr9,vr7,vr6,vr16)
+	lvx	vr5,r4,r10
+	VPERM(vr10,vr6,vr5,vr16)
+	lvx	vr4,r4,r11
+	VPERM(vr11,vr5,vr4,vr16)
+	lvx	vr3,r4,r12
+	VPERM(vr12,vr4,vr3,vr16)
+	lvx	vr2,r4,r14
+	VPERM(vr13,vr3,vr2,vr16)
+	lvx	vr1,r4,r15
+	VPERM(vr14,vr2,vr1,vr16)
+	lvx	vr0,r4,r16
+	VPERM(vr15,vr1,vr0,vr16)
+	addi	r4,r4,128
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	stvx	vr12,r3,r12
+	stvx	vr13,r3,r14
+	stvx	vr14,r3,r15
+	stvx	vr15,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	VPERM(vr8,vr0,vr3,vr16)
+	lvx	vr2,r4,r9
+	VPERM(vr9,vr3,vr2,vr16)
+	lvx	vr1,r4,r10
+	VPERM(vr10,vr2,vr1,vr16)
+	lvx	vr0,r4,r11
+	VPERM(vr11,vr1,vr0,vr16)
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	VPERM(vr8,vr0,vr1,vr16)
+	lvx	vr0,r4,r9
+	VPERM(vr9,vr1,vr0,vr16)
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	VPERM(vr8,vr0,vr1,vr16)
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	addi	r4,r4,-16	/* Unwind the +16 load offset */
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	b	exit_vmx_copy		/* tail call optimise */
+#endif /* CONFiG_ALTIVEC */
diff --git a/arch/powerpc/lib/rheap.c b/arch/powerpc/lib/rheap.c
index 22c3b4f53de..a1060a868e6 100644
--- a/arch/powerpc/lib/rheap.c
+++ b/arch/powerpc/lib/rheap.c
@@ -15,7 +15,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/slab.h>
@@ -54,7 +54,7 @@ static int grow(rh_info_t * info, int max_blocks)
 
 	new_blocks = max_blocks - info->max_blocks;
 
-	block = kmalloc(sizeof(rh_block_t) * max_blocks, GFP_KERNEL);
+	block = kmalloc(sizeof(rh_block_t) * max_blocks, GFP_ATOMIC);
 	if (block == NULL)
 		return -ENOMEM;
 
@@ -258,7 +258,7 @@ rh_info_t *rh_create(unsigned int alignment)
 	if ((alignment & (alignment - 1)) != 0)
 		return ERR_PTR(-EINVAL);
 
-	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	info = kmalloc(sizeof(*info), GFP_ATOMIC);
 	if (info == NULL)
 		return ERR_PTR(-ENOMEM);
 
@@ -556,6 +556,7 @@ unsigned long rh_alloc_fixed(rh_info_t * info, unsigned long start, int size, co
 		be = blk->start + blk->size;
 		if (s >= bs && e <= be)
 			break;
+		blk = NULL;
 	}
 
 	if (blk == NULL)
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 4aae0c38764..5c09f365c84 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -11,8 +11,11 @@
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
 #include <linux/ptrace.h>
+#include <linux/prefetch.h>
 #include <asm/sstep.h>
 #include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/cputable.h>
 
 extern char system_call_common[];
 
@@ -23,6 +26,37 @@ extern char system_call_common[];
 #define MSR_MASK	0x87c0ffff
 #endif
 
+/* Bits in XER */
+#define XER_SO		0x80000000U
+#define XER_OV		0x40000000U
+#define XER_CA		0x20000000U
+
+#ifdef CONFIG_PPC_FPU
+/*
+ * Functions in ldstfp.S
+ */
+extern int do_lfs(int rn, unsigned long ea);
+extern int do_lfd(int rn, unsigned long ea);
+extern int do_stfs(int rn, unsigned long ea);
+extern int do_stfd(int rn, unsigned long ea);
+extern int do_lvx(int rn, unsigned long ea);
+extern int do_stvx(int rn, unsigned long ea);
+extern int do_lxvd2x(int rn, unsigned long ea);
+extern int do_stxvd2x(int rn, unsigned long ea);
+#endif
+
+/*
+ * Emulate the truncation of 64 bit values in 32-bit mode.
+ */
+static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
+{
+#ifdef __powerpc64__
+	if ((msr & MSR_64BIT) == 0)
+		val &= 0xffffffffUL;
+#endif
+	return val;
+}
+
 /*
  * Determine whether a conditional branch instruction would branch.
  */
@@ -46,16 +80,569 @@ static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
 	return 1;
 }
 
+
+static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
+{
+	if (!user_mode(regs))
+		return 1;
+	return __access_ok(ea, nb, USER_DS);
+}
+
+/*
+ * Calculate effective address for a D-form instruction
+ */
+static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs)
+{
+	int ra;
+	unsigned long ea;
+
+	ra = (instr >> 16) & 0x1f;
+	ea = (signed short) instr;		/* sign-extend */
+	if (ra) {
+		ea += regs->gpr[ra];
+		if (instr & 0x04000000) {		/* update forms */
+			if ((instr>>26) != 47) 		/* stmw is not an update form */
+				regs->gpr[ra] = ea;
+		}
+	}
+
+	return truncate_if_32bit(regs->msr, ea);
+}
+
+#ifdef __powerpc64__
+/*
+ * Calculate effective address for a DS-form instruction
+ */
+static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs)
+{
+	int ra;
+	unsigned long ea;
+
+	ra = (instr >> 16) & 0x1f;
+	ea = (signed short) (instr & ~3);	/* sign-extend */
+	if (ra) {
+		ea += regs->gpr[ra];
+		if ((instr & 3) == 1)		/* update forms */
+			regs->gpr[ra] = ea;
+	}
+
+	return truncate_if_32bit(regs->msr, ea);
+}
+#endif /* __powerpc64 */
+
+/*
+ * Calculate effective address for an X-form instruction
+ */
+static unsigned long __kprobes xform_ea(unsigned int instr, struct pt_regs *regs,
+				     int do_update)
+{
+	int ra, rb;
+	unsigned long ea;
+
+	ra = (instr >> 16) & 0x1f;
+	rb = (instr >> 11) & 0x1f;
+	ea = regs->gpr[rb];
+	if (ra) {
+		ea += regs->gpr[ra];
+		if (do_update)		/* update forms */
+			regs->gpr[ra] = ea;
+	}
+
+	return truncate_if_32bit(regs->msr, ea);
+}
+
+/*
+ * Return the largest power of 2, not greater than sizeof(unsigned long),
+ * such that x is a multiple of it.
+ */
+static inline unsigned long max_align(unsigned long x)
+{
+	x |= sizeof(unsigned long);
+	return x & -x;		/* isolates rightmost bit */
+}
+
+
+static inline unsigned long byterev_2(unsigned long x)
+{
+	return ((x >> 8) & 0xff) | ((x & 0xff) << 8);
+}
+
+static inline unsigned long byterev_4(unsigned long x)
+{
+	return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
+		((x & 0xff00) << 8) | ((x & 0xff) << 24);
+}
+
+#ifdef __powerpc64__
+static inline unsigned long byterev_8(unsigned long x)
+{
+	return (byterev_4(x) << 32) | byterev_4(x >> 32);
+}
+#endif
+
+static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
+				      int nb)
+{
+	int err = 0;
+	unsigned long x = 0;
+
+	switch (nb) {
+	case 1:
+		err = __get_user(x, (unsigned char __user *) ea);
+		break;
+	case 2:
+		err = __get_user(x, (unsigned short __user *) ea);
+		break;
+	case 4:
+		err = __get_user(x, (unsigned int __user *) ea);
+		break;
+#ifdef __powerpc64__
+	case 8:
+		err = __get_user(x, (unsigned long __user *) ea);
+		break;
+#endif
+	}
+	if (!err)
+		*dest = x;
+	return err;
+}
+
+static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
+					int nb, struct pt_regs *regs)
+{
+	int err;
+	unsigned long x, b, c;
+#ifdef __LITTLE_ENDIAN__
+	int len = nb; /* save a copy of the length for byte reversal */
+#endif
+
+	/* unaligned, do this in pieces */
+	x = 0;
+	for (; nb > 0; nb -= c) {
+#ifdef __LITTLE_ENDIAN__
+		c = 1;
+#endif
+#ifdef __BIG_ENDIAN__
+		c = max_align(ea);
+#endif
+		if (c > nb)
+			c = max_align(nb);
+		err = read_mem_aligned(&b, ea, c);
+		if (err)
+			return err;
+		x = (x << (8 * c)) + b;
+		ea += c;
+	}
+#ifdef __LITTLE_ENDIAN__
+	switch (len) {
+	case 2:
+		*dest = byterev_2(x);
+		break;
+	case 4:
+		*dest = byterev_4(x);
+		break;
+#ifdef __powerpc64__
+	case 8:
+		*dest = byterev_8(x);
+		break;
+#endif
+	}
+#endif
+#ifdef __BIG_ENDIAN__
+	*dest = x;
+#endif
+	return 0;
+}
+
 /*
- * Emulate instructions that cause a transfer of control.
+ * Read memory at address ea for nb bytes, return 0 for success
+ * or -EFAULT if an error occurred.
+ */
+static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
+			      struct pt_regs *regs)
+{
+	if (!address_ok(regs, ea, nb))
+		return -EFAULT;
+	if ((ea & (nb - 1)) == 0)
+		return read_mem_aligned(dest, ea, nb);
+	return read_mem_unaligned(dest, ea, nb, regs);
+}
+
+static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
+				       int nb)
+{
+	int err = 0;
+
+	switch (nb) {
+	case 1:
+		err = __put_user(val, (unsigned char __user *) ea);
+		break;
+	case 2:
+		err = __put_user(val, (unsigned short __user *) ea);
+		break;
+	case 4:
+		err = __put_user(val, (unsigned int __user *) ea);
+		break;
+#ifdef __powerpc64__
+	case 8:
+		err = __put_user(val, (unsigned long __user *) ea);
+		break;
+#endif
+	}
+	return err;
+}
+
+static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
+					 int nb, struct pt_regs *regs)
+{
+	int err;
+	unsigned long c;
+
+#ifdef __LITTLE_ENDIAN__
+	switch (nb) {
+	case 2:
+		val = byterev_2(val);
+		break;
+	case 4:
+		val = byterev_4(val);
+		break;
+#ifdef __powerpc64__
+	case 8:
+		val = byterev_8(val);
+		break;
+#endif
+	}
+#endif
+	/* unaligned or little-endian, do this in pieces */
+	for (; nb > 0; nb -= c) {
+#ifdef __LITTLE_ENDIAN__
+		c = 1;
+#endif
+#ifdef __BIG_ENDIAN__
+		c = max_align(ea);
+#endif
+		if (c > nb)
+			c = max_align(nb);
+		err = write_mem_aligned(val >> (nb - c) * 8, ea, c);
+		if (err)
+			return err;
+		ea += c;
+	}
+	return 0;
+}
+
+/*
+ * Write memory at address ea for nb bytes, return 0 for success
+ * or -EFAULT if an error occurred.
+ */
+static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
+			       struct pt_regs *regs)
+{
+	if (!address_ok(regs, ea, nb))
+		return -EFAULT;
+	if ((ea & (nb - 1)) == 0)
+		return write_mem_aligned(val, ea, nb);
+	return write_mem_unaligned(val, ea, nb, regs);
+}
+
+#ifdef CONFIG_PPC_FPU
+/*
+ * Check the address and alignment, and call func to do the actual
+ * load or store.
+ */
+static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
+				unsigned long ea, int nb,
+				struct pt_regs *regs)
+{
+	int err;
+	union {
+		double dbl;
+		unsigned long ul[2];
+		struct {
+#ifdef __BIG_ENDIAN__
+			unsigned _pad_;
+			unsigned word;
+#endif
+#ifdef __LITTLE_ENDIAN__
+			unsigned word;
+			unsigned _pad_;
+#endif
+		} single;
+	} data;
+	unsigned long ptr;
+
+	if (!address_ok(regs, ea, nb))
+		return -EFAULT;
+	if ((ea & 3) == 0)
+		return (*func)(rn, ea);
+	ptr = (unsigned long) &data.ul;
+	if (sizeof(unsigned long) == 8 || nb == 4) {
+		err = read_mem_unaligned(&data.ul[0], ea, nb, regs);
+		if (nb == 4)
+			ptr = (unsigned long)&(data.single.word);
+	} else {
+		/* reading a double on 32-bit */
+		err = read_mem_unaligned(&data.ul[0], ea, 4, regs);
+		if (!err)
+			err = read_mem_unaligned(&data.ul[1], ea + 4, 4, regs);
+	}
+	if (err)
+		return err;
+	return (*func)(rn, ptr);
+}
+
+static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
+				 unsigned long ea, int nb,
+				 struct pt_regs *regs)
+{
+	int err;
+	union {
+		double dbl;
+		unsigned long ul[2];
+		struct {
+#ifdef __BIG_ENDIAN__
+			unsigned _pad_;
+			unsigned word;
+#endif
+#ifdef __LITTLE_ENDIAN__
+			unsigned word;
+			unsigned _pad_;
+#endif
+		} single;
+	} data;
+	unsigned long ptr;
+
+	if (!address_ok(regs, ea, nb))
+		return -EFAULT;
+	if ((ea & 3) == 0)
+		return (*func)(rn, ea);
+	ptr = (unsigned long) &data.ul[0];
+	if (sizeof(unsigned long) == 8 || nb == 4) {
+		if (nb == 4)
+			ptr = (unsigned long)&(data.single.word);
+		err = (*func)(rn, ptr);
+		if (err)
+			return err;
+		err = write_mem_unaligned(data.ul[0], ea, nb, regs);
+	} else {
+		/* writing a double on 32-bit */
+		err = (*func)(rn, ptr);
+		if (err)
+			return err;
+		err = write_mem_unaligned(data.ul[0], ea, 4, regs);
+		if (!err)
+			err = write_mem_unaligned(data.ul[1], ea + 4, 4, regs);
+	}
+	return err;
+}
+#endif
+
+#ifdef CONFIG_ALTIVEC
+/* For Altivec/VMX, no need to worry about alignment */
+static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
+				 unsigned long ea, struct pt_regs *regs)
+{
+	if (!address_ok(regs, ea & ~0xfUL, 16))
+		return -EFAULT;
+	return (*func)(rn, ea);
+}
+
+static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
+				  unsigned long ea, struct pt_regs *regs)
+{
+	if (!address_ok(regs, ea & ~0xfUL, 16))
+		return -EFAULT;
+	return (*func)(rn, ea);
+}
+#endif /* CONFIG_ALTIVEC */
+
+#ifdef CONFIG_VSX
+static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
+				 unsigned long ea, struct pt_regs *regs)
+{
+	int err;
+	unsigned long val[2];
+
+	if (!address_ok(regs, ea, 16))
+		return -EFAULT;
+	if ((ea & 3) == 0)
+		return (*func)(rn, ea);
+	err = read_mem_unaligned(&val[0], ea, 8, regs);
+	if (!err)
+		err = read_mem_unaligned(&val[1], ea + 8, 8, regs);
+	if (!err)
+		err = (*func)(rn, (unsigned long) &val[0]);
+	return err;
+}
+
+static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
+				 unsigned long ea, struct pt_regs *regs)
+{
+	int err;
+	unsigned long val[2];
+
+	if (!address_ok(regs, ea, 16))
+		return -EFAULT;
+	if ((ea & 3) == 0)
+		return (*func)(rn, ea);
+	err = (*func)(rn, (unsigned long) &val[0]);
+	if (err)
+		return err;
+	err = write_mem_unaligned(val[0], ea, 8, regs);
+	if (!err)
+		err = write_mem_unaligned(val[1], ea + 8, 8, regs);
+	return err;
+}
+#endif /* CONFIG_VSX */
+
+#define __put_user_asmx(x, addr, err, op, cr)		\
+	__asm__ __volatile__(				\
+		"1:	" op " %2,0,%3\n"		\
+		"	mfcr	%1\n"			\
+		"2:\n"					\
+		".section .fixup,\"ax\"\n"		\
+		"3:	li	%0,%4\n"		\
+		"	b	2b\n"			\
+		".previous\n"				\
+		".section __ex_table,\"a\"\n"		\
+			PPC_LONG_ALIGN "\n"		\
+			PPC_LONG "1b,3b\n"		\
+		".previous"				\
+		: "=r" (err), "=r" (cr)			\
+		: "r" (x), "r" (addr), "i" (-EFAULT), "0" (err))
+
+#define __get_user_asmx(x, addr, err, op)		\
+	__asm__ __volatile__(				\
+		"1:	"op" %1,0,%2\n"			\
+		"2:\n"					\
+		".section .fixup,\"ax\"\n"		\
+		"3:	li	%0,%3\n"		\
+		"	b	2b\n"			\
+		".previous\n"				\
+		".section __ex_table,\"a\"\n"		\
+			PPC_LONG_ALIGN "\n"		\
+			PPC_LONG "1b,3b\n"		\
+		".previous"				\
+		: "=r" (err), "=r" (x)			\
+		: "r" (addr), "i" (-EFAULT), "0" (err))
+
+#define __cacheop_user_asmx(addr, err, op)		\
+	__asm__ __volatile__(				\
+		"1:	"op" 0,%1\n"			\
+		"2:\n"					\
+		".section .fixup,\"ax\"\n"		\
+		"3:	li	%0,%3\n"		\
+		"	b	2b\n"			\
+		".previous\n"				\
+		".section __ex_table,\"a\"\n"		\
+			PPC_LONG_ALIGN "\n"		\
+			PPC_LONG "1b,3b\n"		\
+		".previous"				\
+		: "=r" (err)				\
+		: "r" (addr), "i" (-EFAULT), "0" (err))
+
+static void __kprobes set_cr0(struct pt_regs *regs, int rd)
+{
+	long val = regs->gpr[rd];
+
+	regs->ccr = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000);
+#ifdef __powerpc64__
+	if (!(regs->msr & MSR_64BIT))
+		val = (int) val;
+#endif
+	if (val < 0)
+		regs->ccr |= 0x80000000;
+	else if (val > 0)
+		regs->ccr |= 0x40000000;
+	else
+		regs->ccr |= 0x20000000;
+}
+
+static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
+				     unsigned long val1, unsigned long val2,
+				     unsigned long carry_in)
+{
+	unsigned long val = val1 + val2;
+
+	if (carry_in)
+		++val;
+	regs->gpr[rd] = val;
+#ifdef __powerpc64__
+	if (!(regs->msr & MSR_64BIT)) {
+		val = (unsigned int) val;
+		val1 = (unsigned int) val1;
+	}
+#endif
+	if (val < val1 || (carry_in && val == val1))
+		regs->xer |= XER_CA;
+	else
+		regs->xer &= ~XER_CA;
+}
+
+static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
+				    int crfld)
+{
+	unsigned int crval, shift;
+
+	crval = (regs->xer >> 31) & 1;		/* get SO bit */
+	if (v1 < v2)
+		crval |= 8;
+	else if (v1 > v2)
+		crval |= 4;
+	else
+		crval |= 2;
+	shift = (7 - crfld) * 4;
+	regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
+}
+
+static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
+				      unsigned long v2, int crfld)
+{
+	unsigned int crval, shift;
+
+	crval = (regs->xer >> 31) & 1;		/* get SO bit */
+	if (v1 < v2)
+		crval |= 8;
+	else if (v1 > v2)
+		crval |= 4;
+	else
+		crval |= 2;
+	shift = (7 - crfld) * 4;
+	regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
+}
+
+/*
+ * Elements of 32-bit rotate and mask instructions.
+ */
+#define MASK32(mb, me)	((0xffffffffUL >> (mb)) + \
+			 ((signed long)-0x80000000L >> (me)) + ((me) >= (mb)))
+#ifdef __powerpc64__
+#define MASK64_L(mb)	(~0UL >> (mb))
+#define MASK64_R(me)	((signed long)-0x8000000000000000L >> (me))
+#define MASK64(mb, me)	(MASK64_L(mb) + MASK64_R(me) + ((me) >= (mb)))
+#define DATA32(x)	(((x) & 0xffffffffUL) | (((x) & 0xffffffffUL) << 32))
+#else
+#define DATA32(x)	(x)
+#endif
+#define ROTATE(x, n)	((n) ? (((x) << (n)) | ((x) >> (8 * sizeof(long) - (n)))) : (x))
+
+/*
+ * Emulate instructions that cause a transfer of control,
+ * loads and stores, and a few other instructions.
  * Returns 1 if the step was emulated, 0 if not,
  * or -1 if the instruction is one that should not be stepped,
  * such as an rfid, or a mtmsrd that would clear MSR_RI.
  */
 int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 {
-	unsigned int opcode, rs, rb, rd, spr;
+	unsigned int opcode, ra, rb, rd, spr, u;
 	unsigned long int imm;
+	unsigned long int val, val2;
+	unsigned long int ea;
+	unsigned int cr, mb, me, sh;
+	int err;
+	unsigned long old_ra, val3;
+	long ival;
 
 	opcode = instr >> 26;
 	switch (opcode) {
@@ -64,12 +651,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		if ((instr & 2) == 0)
 			imm += regs->nip;
 		regs->nip += 4;
-		if ((regs->msr & MSR_SF) == 0)
-			regs->nip &= 0xffffffffUL;
+		regs->nip = truncate_if_32bit(regs->msr, regs->nip);
 		if (instr & 1)
 			regs->link = regs->nip;
 		if (branch_taken(instr, regs))
-			regs->nip = imm;
+			regs->nip = truncate_if_32bit(regs->msr, imm);
 		return 1;
 #ifdef CONFIG_PPC64
 	case 17:	/* sc */
@@ -78,7 +664,13 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		 * entry code works.  If that is changed, this will
 		 * need to be changed also.
 		 */
+		if (regs->gpr[0] == 0x1ebe &&
+		    cpu_has_feature(CPU_FTR_REAL_LE)) {
+			regs->msr ^= MSR_LE;
+			goto instr_done;
+		}
 		regs->gpr[9] = regs->gpr[13];
+		regs->gpr[10] = MSR_KERNEL;
 		regs->gpr[11] = regs->nip + 4;
 		regs->gpr[12] = regs->msr & MSR_MASK;
 		regs->gpr[13] = (unsigned long) get_paca();
@@ -92,54 +684,250 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 			imm -= 0x04000000;
 		if ((instr & 2) == 0)
 			imm += regs->nip;
-		if (instr & 1) {
-			regs->link = regs->nip + 4;
-			if ((regs->msr & MSR_SF) == 0)
-				regs->link &= 0xffffffffUL;
-		}
-		if ((regs->msr & MSR_SF) == 0)
-			imm &= 0xffffffffUL;
+		if (instr & 1)
+			regs->link = truncate_if_32bit(regs->msr, regs->nip + 4);
+		imm = truncate_if_32bit(regs->msr, imm);
 		regs->nip = imm;
 		return 1;
 	case 19:
-		switch (instr & 0x7fe) {
-		case 0x20:	/* bclr */
-		case 0x420:	/* bcctr */
+		switch ((instr >> 1) & 0x3ff) {
+		case 16:	/* bclr */
+		case 528:	/* bcctr */
 			imm = (instr & 0x400)? regs->ctr: regs->link;
-			regs->nip += 4;
-			if ((regs->msr & MSR_SF) == 0) {
-				regs->nip &= 0xffffffffUL;
-				imm &= 0xffffffffUL;
-			}
+			regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
+			imm = truncate_if_32bit(regs->msr, imm);
 			if (instr & 1)
 				regs->link = regs->nip;
 			if (branch_taken(instr, regs))
 				regs->nip = imm;
 			return 1;
-		case 0x24:	/* rfid, scary */
+
+		case 18:	/* rfid, scary */
 			return -1;
+
+		case 150:	/* isync */
+			isync();
+			goto instr_done;
+
+		case 33:	/* crnor */
+		case 129:	/* crandc */
+		case 193:	/* crxor */
+		case 225:	/* crnand */
+		case 257:	/* crand */
+		case 289:	/* creqv */
+		case 417:	/* crorc */
+		case 449:	/* cror */
+			ra = (instr >> 16) & 0x1f;
+			rb = (instr >> 11) & 0x1f;
+			rd = (instr >> 21) & 0x1f;
+			ra = (regs->ccr >> (31 - ra)) & 1;
+			rb = (regs->ccr >> (31 - rb)) & 1;
+			val = (instr >> (6 + ra * 2 + rb)) & 1;
+			regs->ccr = (regs->ccr & ~(1UL << (31 - rd))) |
+				(val << (31 - rd));
+			goto instr_done;
 		}
+		break;
 	case 31:
-		rd = (instr >> 21) & 0x1f;
-		switch (instr & 0x7fe) {
-		case 0xa6:	/* mfmsr */
+		switch ((instr >> 1) & 0x3ff) {
+		case 598:	/* sync */
+#ifdef __powerpc64__
+			switch ((instr >> 21) & 3) {
+			case 1:		/* lwsync */
+				asm volatile("lwsync" : : : "memory");
+				goto instr_done;
+			case 2:		/* ptesync */
+				asm volatile("ptesync" : : : "memory");
+				goto instr_done;
+			}
+#endif
+			mb();
+			goto instr_done;
+
+		case 854:	/* eieio */
+			eieio();
+			goto instr_done;
+		}
+		break;
+	}
+
+	/* Following cases refer to regs->gpr[], so we need all regs */
+	if (!FULL_REGS(regs))
+		return 0;
+
+	rd = (instr >> 21) & 0x1f;
+	ra = (instr >> 16) & 0x1f;
+	rb = (instr >> 11) & 0x1f;
+
+	switch (opcode) {
+	case 7:		/* mulli */
+		regs->gpr[rd] = regs->gpr[ra] * (short) instr;
+		goto instr_done;
+
+	case 8:		/* subfic */
+		imm = (short) instr;
+		add_with_carry(regs, rd, ~regs->gpr[ra], imm, 1);
+		goto instr_done;
+
+	case 10:	/* cmpli */
+		imm = (unsigned short) instr;
+		val = regs->gpr[ra];
+#ifdef __powerpc64__
+		if ((rd & 1) == 0)
+			val = (unsigned int) val;
+#endif
+		do_cmp_unsigned(regs, val, imm, rd >> 2);
+		goto instr_done;
+
+	case 11:	/* cmpi */
+		imm = (short) instr;
+		val = regs->gpr[ra];
+#ifdef __powerpc64__
+		if ((rd & 1) == 0)
+			val = (int) val;
+#endif
+		do_cmp_signed(regs, val, imm, rd >> 2);
+		goto instr_done;
+
+	case 12:	/* addic */
+		imm = (short) instr;
+		add_with_carry(regs, rd, regs->gpr[ra], imm, 0);
+		goto instr_done;
+
+	case 13:	/* addic. */
+		imm = (short) instr;
+		add_with_carry(regs, rd, regs->gpr[ra], imm, 0);
+		set_cr0(regs, rd);
+		goto instr_done;
+
+	case 14:	/* addi */
+		imm = (short) instr;
+		if (ra)
+			imm += regs->gpr[ra];
+		regs->gpr[rd] = imm;
+		goto instr_done;
+
+	case 15:	/* addis */
+		imm = ((short) instr) << 16;
+		if (ra)
+			imm += regs->gpr[ra];
+		regs->gpr[rd] = imm;
+		goto instr_done;
+
+	case 20:	/* rlwimi */
+		mb = (instr >> 6) & 0x1f;
+		me = (instr >> 1) & 0x1f;
+		val = DATA32(regs->gpr[rd]);
+		imm = MASK32(mb, me);
+		regs->gpr[ra] = (regs->gpr[ra] & ~imm) | (ROTATE(val, rb) & imm);
+		goto logical_done;
+
+	case 21:	/* rlwinm */
+		mb = (instr >> 6) & 0x1f;
+		me = (instr >> 1) & 0x1f;
+		val = DATA32(regs->gpr[rd]);
+		regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me);
+		goto logical_done;
+
+	case 23:	/* rlwnm */
+		mb = (instr >> 6) & 0x1f;
+		me = (instr >> 1) & 0x1f;
+		rb = regs->gpr[rb] & 0x1f;
+		val = DATA32(regs->gpr[rd]);
+		regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me);
+		goto logical_done;
+
+	case 24:	/* ori */
+		imm = (unsigned short) instr;
+		regs->gpr[ra] = regs->gpr[rd] | imm;
+		goto instr_done;
+
+	case 25:	/* oris */
+		imm = (unsigned short) instr;
+		regs->gpr[ra] = regs->gpr[rd] | (imm << 16);
+		goto instr_done;
+
+	case 26:	/* xori */
+		imm = (unsigned short) instr;
+		regs->gpr[ra] = regs->gpr[rd] ^ imm;
+		goto instr_done;
+
+	case 27:	/* xoris */
+		imm = (unsigned short) instr;
+		regs->gpr[ra] = regs->gpr[rd] ^ (imm << 16);
+		goto instr_done;
+
+	case 28:	/* andi. */
+		imm = (unsigned short) instr;
+		regs->gpr[ra] = regs->gpr[rd] & imm;
+		set_cr0(regs, ra);
+		goto instr_done;
+
+	case 29:	/* andis. */
+		imm = (unsigned short) instr;
+		regs->gpr[ra] = regs->gpr[rd] & (imm << 16);
+		set_cr0(regs, ra);
+		goto instr_done;
+
+#ifdef __powerpc64__
+	case 30:	/* rld* */
+		mb = ((instr >> 6) & 0x1f) | (instr & 0x20);
+		val = regs->gpr[rd];
+		if ((instr & 0x10) == 0) {
+			sh = rb | ((instr & 2) << 4);
+			val = ROTATE(val, sh);
+			switch ((instr >> 2) & 3) {
+			case 0:		/* rldicl */
+				regs->gpr[ra] = val & MASK64_L(mb);
+				goto logical_done;
+			case 1:		/* rldicr */
+				regs->gpr[ra] = val & MASK64_R(mb);
+				goto logical_done;
+			case 2:		/* rldic */
+				regs->gpr[ra] = val & MASK64(mb, 63 - sh);
+				goto logical_done;
+			case 3:		/* rldimi */
+				imm = MASK64(mb, 63 - sh);
+				regs->gpr[ra] = (regs->gpr[ra] & ~imm) |
+					(val & imm);
+				goto logical_done;
+			}
+		} else {
+			sh = regs->gpr[rb] & 0x3f;
+			val = ROTATE(val, sh);
+			switch ((instr >> 1) & 7) {
+			case 0:		/* rldcl */
+				regs->gpr[ra] = val & MASK64_L(mb);
+				goto logical_done;
+			case 1:		/* rldcr */
+				regs->gpr[ra] = val & MASK64_R(mb);
+				goto logical_done;
+			}
+		}
+#endif
+
+	case 31:
+		switch ((instr >> 1) & 0x3ff) {
+		case 83:	/* mfmsr */
+			if (regs->msr & MSR_PR)
+				break;
 			regs->gpr[rd] = regs->msr & MSR_MASK;
-			regs->nip += 4;
-			if ((regs->msr & MSR_SF) == 0)
-				regs->nip &= 0xffffffffUL;
-			return 1;
-		case 0x124:	/* mtmsr */
+			goto instr_done;
+		case 146:	/* mtmsr */
+			if (regs->msr & MSR_PR)
+				break;
 			imm = regs->gpr[rd];
 			if ((imm & MSR_RI) == 0)
 				/* can't step mtmsr that would clear MSR_RI */
 				return -1;
 			regs->msr = imm;
-			regs->nip += 4;
-			return 1;
+			goto instr_done;
 #ifdef CONFIG_PPC64
-		case 0x164:	/* mtmsrd */
+		case 178:	/* mtmsrd */
 			/* only MSR_EE and MSR_RI get changed if bit 15 set */
 			/* mtmsrd doesn't change MSR_HV and MSR_ME */
+			if (regs->msr & MSR_PR)
+				break;
 			imm = (instr & 0x10000)? 0x8002: 0xefffffffffffefffUL;
 			imm = (regs->msr & MSR_MASK & ~imm)
 				| (regs->gpr[rd] & imm);
@@ -147,55 +935,803 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 				/* can't step mtmsrd that would clear MSR_RI */
 				return -1;
 			regs->msr = imm;
-			regs->nip += 4;
-			if ((imm & MSR_SF) == 0)
-				regs->nip &= 0xffffffffUL;
-			return 1;
+			goto instr_done;
 #endif
-		case 0x26:	/* mfcr */
+		case 19:	/* mfcr */
 			regs->gpr[rd] = regs->ccr;
 			regs->gpr[rd] &= 0xffffffffUL;
-			goto mtspr_out;
-		case 0x2a6:	/* mfspr */
+			goto instr_done;
+
+		case 144:	/* mtcrf */
+			imm = 0xf0000000UL;
+			val = regs->gpr[rd];
+			for (sh = 0; sh < 8; ++sh) {
+				if (instr & (0x80000 >> sh))
+					regs->ccr = (regs->ccr & ~imm) |
+						(val & imm);
+				imm >>= 4;
+			}
+			goto instr_done;
+
+		case 339:	/* mfspr */
 			spr = (instr >> 11) & 0x3ff;
 			switch (spr) {
 			case 0x20:	/* mfxer */
 				regs->gpr[rd] = regs->xer;
 				regs->gpr[rd] &= 0xffffffffUL;
-				goto mtspr_out;
+				goto instr_done;
 			case 0x100:	/* mflr */
 				regs->gpr[rd] = regs->link;
-				goto mtspr_out;
+				goto instr_done;
 			case 0x120:	/* mfctr */
 				regs->gpr[rd] = regs->ctr;
-				goto mtspr_out;
-			}
-			break;
-		case 0x378:	/* orx */
-			rs = (instr >> 21) & 0x1f;
-			rb = (instr >> 11) & 0x1f;
-			if (rs == rb) {		/* mr */
-				rd = (instr >> 16) & 0x1f;
-				regs->gpr[rd] = regs->gpr[rs];
-				goto mtspr_out;
+				goto instr_done;
 			}
 			break;
-		case 0x3a6:	/* mtspr */
+
+		case 467:	/* mtspr */
 			spr = (instr >> 11) & 0x3ff;
 			switch (spr) {
 			case 0x20:	/* mtxer */
 				regs->xer = (regs->gpr[rd] & 0xffffffffUL);
-				goto mtspr_out;
+				goto instr_done;
 			case 0x100:	/* mtlr */
 				regs->link = regs->gpr[rd];
-				goto mtspr_out;
+				goto instr_done;
 			case 0x120:	/* mtctr */
 				regs->ctr = regs->gpr[rd];
-mtspr_out:
-				regs->nip += 4;
-				return 1;
+				goto instr_done;
+			}
+			break;
+
+/*
+ * Compare instructions
+ */
+		case 0:	/* cmp */
+			val = regs->gpr[ra];
+			val2 = regs->gpr[rb];
+#ifdef __powerpc64__
+			if ((rd & 1) == 0) {
+				/* word (32-bit) compare */
+				val = (int) val;
+				val2 = (int) val2;
+			}
+#endif
+			do_cmp_signed(regs, val, val2, rd >> 2);
+			goto instr_done;
+
+		case 32:	/* cmpl */
+			val = regs->gpr[ra];
+			val2 = regs->gpr[rb];
+#ifdef __powerpc64__
+			if ((rd & 1) == 0) {
+				/* word (32-bit) compare */
+				val = (unsigned int) val;
+				val2 = (unsigned int) val2;
+			}
+#endif
+			do_cmp_unsigned(regs, val, val2, rd >> 2);
+			goto instr_done;
+
+/*
+ * Arithmetic instructions
+ */
+		case 8:	/* subfc */
+			add_with_carry(regs, rd, ~regs->gpr[ra],
+				       regs->gpr[rb], 1);
+			goto arith_done;
+#ifdef __powerpc64__
+		case 9:	/* mulhdu */
+			asm("mulhdu %0,%1,%2" : "=r" (regs->gpr[rd]) :
+			    "r" (regs->gpr[ra]), "r" (regs->gpr[rb]));
+			goto arith_done;
+#endif
+		case 10:	/* addc */
+			add_with_carry(regs, rd, regs->gpr[ra],
+				       regs->gpr[rb], 0);
+			goto arith_done;
+
+		case 11:	/* mulhwu */
+			asm("mulhwu %0,%1,%2" : "=r" (regs->gpr[rd]) :
+			    "r" (regs->gpr[ra]), "r" (regs->gpr[rb]));
+			goto arith_done;
+
+		case 40:	/* subf */
+			regs->gpr[rd] = regs->gpr[rb] - regs->gpr[ra];
+			goto arith_done;
+#ifdef __powerpc64__
+		case 73:	/* mulhd */
+			asm("mulhd %0,%1,%2" : "=r" (regs->gpr[rd]) :
+			    "r" (regs->gpr[ra]), "r" (regs->gpr[rb]));
+			goto arith_done;
+#endif
+		case 75:	/* mulhw */
+			asm("mulhw %0,%1,%2" : "=r" (regs->gpr[rd]) :
+			    "r" (regs->gpr[ra]), "r" (regs->gpr[rb]));
+			goto arith_done;
+
+		case 104:	/* neg */
+			regs->gpr[rd] = -regs->gpr[ra];
+			goto arith_done;
+
+		case 136:	/* subfe */
+			add_with_carry(regs, rd, ~regs->gpr[ra], regs->gpr[rb],
+				       regs->xer & XER_CA);
+			goto arith_done;
+
+		case 138:	/* adde */
+			add_with_carry(regs, rd, regs->gpr[ra], regs->gpr[rb],
+				       regs->xer & XER_CA);
+			goto arith_done;
+
+		case 200:	/* subfze */
+			add_with_carry(regs, rd, ~regs->gpr[ra], 0L,
+				       regs->xer & XER_CA);
+			goto arith_done;
+
+		case 202:	/* addze */
+			add_with_carry(regs, rd, regs->gpr[ra], 0L,
+				       regs->xer & XER_CA);
+			goto arith_done;
+
+		case 232:	/* subfme */
+			add_with_carry(regs, rd, ~regs->gpr[ra], -1L,
+				       regs->xer & XER_CA);
+			goto arith_done;
+#ifdef __powerpc64__
+		case 233:	/* mulld */
+			regs->gpr[rd] = regs->gpr[ra] * regs->gpr[rb];
+			goto arith_done;
+#endif
+		case 234:	/* addme */
+			add_with_carry(regs, rd, regs->gpr[ra], -1L,
+				       regs->xer & XER_CA);
+			goto arith_done;
+
+		case 235:	/* mullw */
+			regs->gpr[rd] = (unsigned int) regs->gpr[ra] *
+				(unsigned int) regs->gpr[rb];
+			goto arith_done;
+
+		case 266:	/* add */
+			regs->gpr[rd] = regs->gpr[ra] + regs->gpr[rb];
+			goto arith_done;
+#ifdef __powerpc64__
+		case 457:	/* divdu */
+			regs->gpr[rd] = regs->gpr[ra] / regs->gpr[rb];
+			goto arith_done;
+#endif
+		case 459:	/* divwu */
+			regs->gpr[rd] = (unsigned int) regs->gpr[ra] /
+				(unsigned int) regs->gpr[rb];
+			goto arith_done;
+#ifdef __powerpc64__
+		case 489:	/* divd */
+			regs->gpr[rd] = (long int) regs->gpr[ra] /
+				(long int) regs->gpr[rb];
+			goto arith_done;
+#endif
+		case 491:	/* divw */
+			regs->gpr[rd] = (int) regs->gpr[ra] /
+				(int) regs->gpr[rb];
+			goto arith_done;
+
+
+/*
+ * Logical instructions
+ */
+		case 26:	/* cntlzw */
+			asm("cntlzw %0,%1" : "=r" (regs->gpr[ra]) :
+			    "r" (regs->gpr[rd]));
+			goto logical_done;
+#ifdef __powerpc64__
+		case 58:	/* cntlzd */
+			asm("cntlzd %0,%1" : "=r" (regs->gpr[ra]) :
+			    "r" (regs->gpr[rd]));
+			goto logical_done;
+#endif
+		case 28:	/* and */
+			regs->gpr[ra] = regs->gpr[rd] & regs->gpr[rb];
+			goto logical_done;
+
+		case 60:	/* andc */
+			regs->gpr[ra] = regs->gpr[rd] & ~regs->gpr[rb];
+			goto logical_done;
+
+		case 124:	/* nor */
+			regs->gpr[ra] = ~(regs->gpr[rd] | regs->gpr[rb]);
+			goto logical_done;
+
+		case 284:	/* xor */
+			regs->gpr[ra] = ~(regs->gpr[rd] ^ regs->gpr[rb]);
+			goto logical_done;
+
+		case 316:	/* xor */
+			regs->gpr[ra] = regs->gpr[rd] ^ regs->gpr[rb];
+			goto logical_done;
+
+		case 412:	/* orc */
+			regs->gpr[ra] = regs->gpr[rd] | ~regs->gpr[rb];
+			goto logical_done;
+
+		case 444:	/* or */
+			regs->gpr[ra] = regs->gpr[rd] | regs->gpr[rb];
+			goto logical_done;
+
+		case 476:	/* nand */
+			regs->gpr[ra] = ~(regs->gpr[rd] & regs->gpr[rb]);
+			goto logical_done;
+
+		case 922:	/* extsh */
+			regs->gpr[ra] = (signed short) regs->gpr[rd];
+			goto logical_done;
+
+		case 954:	/* extsb */
+			regs->gpr[ra] = (signed char) regs->gpr[rd];
+			goto logical_done;
+#ifdef __powerpc64__
+		case 986:	/* extsw */
+			regs->gpr[ra] = (signed int) regs->gpr[rd];
+			goto logical_done;
+#endif
+
+/*
+ * Shift instructions
+ */
+		case 24:	/* slw */
+			sh = regs->gpr[rb] & 0x3f;
+			if (sh < 32)
+				regs->gpr[ra] = (regs->gpr[rd] << sh) & 0xffffffffUL;
+			else
+				regs->gpr[ra] = 0;
+			goto logical_done;
+
+		case 536:	/* srw */
+			sh = regs->gpr[rb] & 0x3f;
+			if (sh < 32)
+				regs->gpr[ra] = (regs->gpr[rd] & 0xffffffffUL) >> sh;
+			else
+				regs->gpr[ra] = 0;
+			goto logical_done;
+
+		case 792:	/* sraw */
+			sh = regs->gpr[rb] & 0x3f;
+			ival = (signed int) regs->gpr[rd];
+			regs->gpr[ra] = ival >> (sh < 32 ? sh : 31);
+			if (ival < 0 && (sh >= 32 || (ival & ((1ul << sh) - 1)) != 0))
+				regs->xer |= XER_CA;
+			else
+				regs->xer &= ~XER_CA;
+			goto logical_done;
+
+		case 824:	/* srawi */
+			sh = rb;
+			ival = (signed int) regs->gpr[rd];
+			regs->gpr[ra] = ival >> sh;
+			if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0)
+				regs->xer |= XER_CA;
+			else
+				regs->xer &= ~XER_CA;
+			goto logical_done;
+
+#ifdef __powerpc64__
+		case 27:	/* sld */
+			sh = regs->gpr[rb] & 0x7f;
+			if (sh < 64)
+				regs->gpr[ra] = regs->gpr[rd] << sh;
+			else
+				regs->gpr[ra] = 0;
+			goto logical_done;
+
+		case 539:	/* srd */
+			sh = regs->gpr[rb] & 0x7f;
+			if (sh < 64)
+				regs->gpr[ra] = regs->gpr[rd] >> sh;
+			else
+				regs->gpr[ra] = 0;
+			goto logical_done;
+
+		case 794:	/* srad */
+			sh = regs->gpr[rb] & 0x7f;
+			ival = (signed long int) regs->gpr[rd];
+			regs->gpr[ra] = ival >> (sh < 64 ? sh : 63);
+			if (ival < 0 && (sh >= 64 || (ival & ((1ul << sh) - 1)) != 0))
+				regs->xer |= XER_CA;
+			else
+				regs->xer &= ~XER_CA;
+			goto logical_done;
+
+		case 826:	/* sradi with sh_5 = 0 */
+		case 827:	/* sradi with sh_5 = 1 */
+			sh = rb | ((instr & 2) << 4);
+			ival = (signed long int) regs->gpr[rd];
+			regs->gpr[ra] = ival >> sh;
+			if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0)
+				regs->xer |= XER_CA;
+			else
+				regs->xer &= ~XER_CA;
+			goto logical_done;
+#endif /* __powerpc64__ */
+
+/*
+ * Cache instructions
+ */
+		case 54:	/* dcbst */
+			ea = xform_ea(instr, regs, 0);
+			if (!address_ok(regs, ea, 8))
+				return 0;
+			err = 0;
+			__cacheop_user_asmx(ea, err, "dcbst");
+			if (err)
+				return 0;
+			goto instr_done;
+
+		case 86:	/* dcbf */
+			ea = xform_ea(instr, regs, 0);
+			if (!address_ok(regs, ea, 8))
+				return 0;
+			err = 0;
+			__cacheop_user_asmx(ea, err, "dcbf");
+			if (err)
+				return 0;
+			goto instr_done;
+
+		case 246:	/* dcbtst */
+			if (rd == 0) {
+				ea = xform_ea(instr, regs, 0);
+				prefetchw((void *) ea);
+			}
+			goto instr_done;
+
+		case 278:	/* dcbt */
+			if (rd == 0) {
+				ea = xform_ea(instr, regs, 0);
+				prefetch((void *) ea);
 			}
+			goto instr_done;
+
 		}
+		break;
 	}
-	return 0;
+
+	/*
+	 * Following cases are for loads and stores, so bail out
+	 * if we're in little-endian mode.
+	 */
+	if (regs->msr & MSR_LE)
+		return 0;
+
+	/*
+	 * Save register RA in case it's an update form load or store
+	 * and the access faults.
+	 */
+	old_ra = regs->gpr[ra];
+
+	switch (opcode) {
+	case 31:
+		u = instr & 0x40;
+		switch ((instr >> 1) & 0x3ff) {
+		case 20:	/* lwarx */
+			ea = xform_ea(instr, regs, 0);
+			if (ea & 3)
+				break;		/* can't handle misaligned */
+			err = -EFAULT;
+			if (!address_ok(regs, ea, 4))
+				goto ldst_done;
+			err = 0;
+			__get_user_asmx(val, ea, err, "lwarx");
+			if (!err)
+				regs->gpr[rd] = val;
+			goto ldst_done;
+
+		case 150:	/* stwcx. */
+			ea = xform_ea(instr, regs, 0);
+			if (ea & 3)
+				break;		/* can't handle misaligned */
+			err = -EFAULT;
+			if (!address_ok(regs, ea, 4))
+				goto ldst_done;
+			err = 0;
+			__put_user_asmx(regs->gpr[rd], ea, err, "stwcx.", cr);
+			if (!err)
+				regs->ccr = (regs->ccr & 0x0fffffff) |
+					(cr & 0xe0000000) |
+					((regs->xer >> 3) & 0x10000000);
+			goto ldst_done;
+
+#ifdef __powerpc64__
+		case 84:	/* ldarx */
+			ea = xform_ea(instr, regs, 0);
+			if (ea & 7)
+				break;		/* can't handle misaligned */
+			err = -EFAULT;
+			if (!address_ok(regs, ea, 8))
+				goto ldst_done;
+			err = 0;
+			__get_user_asmx(val, ea, err, "ldarx");
+			if (!err)
+				regs->gpr[rd] = val;
+			goto ldst_done;
+
+		case 214:	/* stdcx. */
+			ea = xform_ea(instr, regs, 0);
+			if (ea & 7)
+				break;		/* can't handle misaligned */
+			err = -EFAULT;
+			if (!address_ok(regs, ea, 8))
+				goto ldst_done;
+			err = 0;
+			__put_user_asmx(regs->gpr[rd], ea, err, "stdcx.", cr);
+			if (!err)
+				regs->ccr = (regs->ccr & 0x0fffffff) |
+					(cr & 0xe0000000) |
+					((regs->xer >> 3) & 0x10000000);
+			goto ldst_done;
+
+		case 21:	/* ldx */
+		case 53:	/* ldux */
+			err = read_mem(&regs->gpr[rd], xform_ea(instr, regs, u),
+				       8, regs);
+			goto ldst_done;
+#endif
+
+		case 23:	/* lwzx */
+		case 55:	/* lwzux */
+			err = read_mem(&regs->gpr[rd], xform_ea(instr, regs, u),
+				       4, regs);
+			goto ldst_done;
+
+		case 87:	/* lbzx */
+		case 119:	/* lbzux */
+			err = read_mem(&regs->gpr[rd], xform_ea(instr, regs, u),
+				       1, regs);
+			goto ldst_done;
+
+#ifdef CONFIG_ALTIVEC
+		case 103:	/* lvx */
+		case 359:	/* lvxl */
+			if (!(regs->msr & MSR_VEC))
+				break;
+			ea = xform_ea(instr, regs, 0);
+			err = do_vec_load(rd, do_lvx, ea, regs);
+			goto ldst_done;
+
+		case 231:	/* stvx */
+		case 487:	/* stvxl */
+			if (!(regs->msr & MSR_VEC))
+				break;
+			ea = xform_ea(instr, regs, 0);
+			err = do_vec_store(rd, do_stvx, ea, regs);
+			goto ldst_done;
+#endif /* CONFIG_ALTIVEC */
+
+#ifdef __powerpc64__
+		case 149:	/* stdx */
+		case 181:	/* stdux */
+			val = regs->gpr[rd];
+			err = write_mem(val, xform_ea(instr, regs, u), 8, regs);
+			goto ldst_done;
+#endif
+
+		case 151:	/* stwx */
+		case 183:	/* stwux */
+			val = regs->gpr[rd];
+			err = write_mem(val, xform_ea(instr, regs, u), 4, regs);
+			goto ldst_done;
+
+		case 215:	/* stbx */
+		case 247:	/* stbux */
+			val = regs->gpr[rd];
+			err = write_mem(val, xform_ea(instr, regs, u), 1, regs);
+			goto ldst_done;
+
+		case 279:	/* lhzx */
+		case 311:	/* lhzux */
+			err = read_mem(&regs->gpr[rd], xform_ea(instr, regs, u),
+				       2, regs);
+			goto ldst_done;
+
+#ifdef __powerpc64__
+		case 341:	/* lwax */
+		case 373:	/* lwaux */
+			err = read_mem(&regs->gpr[rd], xform_ea(instr, regs, u),
+				       4, regs);
+			if (!err)
+				regs->gpr[rd] = (signed int) regs->gpr[rd];
+			goto ldst_done;
+#endif
+
+		case 343:	/* lhax */
+		case 375:	/* lhaux */
+			err = read_mem(&regs->gpr[rd], xform_ea(instr, regs, u),
+				       2, regs);
+			if (!err)
+				regs->gpr[rd] = (signed short) regs->gpr[rd];
+			goto ldst_done;
+
+		case 407:	/* sthx */
+		case 439:	/* sthux */
+			val = regs->gpr[rd];
+			err = write_mem(val, xform_ea(instr, regs, u), 2, regs);
+			goto ldst_done;
+
+#ifdef __powerpc64__
+		case 532:	/* ldbrx */
+			err = read_mem(&val, xform_ea(instr, regs, 0), 8, regs);
+			if (!err)
+				regs->gpr[rd] = byterev_8(val);
+			goto ldst_done;
+
+#endif
+
+		case 534:	/* lwbrx */
+			err = read_mem(&val, xform_ea(instr, regs, 0), 4, regs);
+			if (!err)
+				regs->gpr[rd] = byterev_4(val);
+			goto ldst_done;
+
+#ifdef CONFIG_PPC_FPU
+		case 535:	/* lfsx */
+		case 567:	/* lfsux */
+			if (!(regs->msr & MSR_FP))
+				break;
+			ea = xform_ea(instr, regs, u);
+			err = do_fp_load(rd, do_lfs, ea, 4, regs);
+			goto ldst_done;
+
+		case 599:	/* lfdx */
+		case 631:	/* lfdux */
+			if (!(regs->msr & MSR_FP))
+				break;
+			ea = xform_ea(instr, regs, u);
+			err = do_fp_load(rd, do_lfd, ea, 8, regs);
+			goto ldst_done;
+
+		case 663:	/* stfsx */
+		case 695:	/* stfsux */
+			if (!(regs->msr & MSR_FP))
+				break;
+			ea = xform_ea(instr, regs, u);
+			err = do_fp_store(rd, do_stfs, ea, 4, regs);
+			goto ldst_done;
+
+		case 727:	/* stfdx */
+		case 759:	/* stfdux */
+			if (!(regs->msr & MSR_FP))
+				break;
+			ea = xform_ea(instr, regs, u);
+			err = do_fp_store(rd, do_stfd, ea, 8, regs);
+			goto ldst_done;
+#endif
+
+#ifdef __powerpc64__
+		case 660:	/* stdbrx */
+			val = byterev_8(regs->gpr[rd]);
+			err = write_mem(val, xform_ea(instr, regs, 0), 8, regs);
+			goto ldst_done;
+
+#endif
+		case 662:	/* stwbrx */
+			val = byterev_4(regs->gpr[rd]);
+			err = write_mem(val, xform_ea(instr, regs, 0), 4, regs);
+			goto ldst_done;
+
+		case 790:	/* lhbrx */
+			err = read_mem(&val, xform_ea(instr, regs, 0), 2, regs);
+			if (!err)
+				regs->gpr[rd] = byterev_2(val);
+			goto ldst_done;
+
+		case 918:	/* sthbrx */
+			val = byterev_2(regs->gpr[rd]);
+			err = write_mem(val, xform_ea(instr, regs, 0), 2, regs);
+			goto ldst_done;
+
+#ifdef CONFIG_VSX
+		case 844:	/* lxvd2x */
+		case 876:	/* lxvd2ux */
+			if (!(regs->msr & MSR_VSX))
+				break;
+			rd |= (instr & 1) << 5;
+			ea = xform_ea(instr, regs, u);
+			err = do_vsx_load(rd, do_lxvd2x, ea, regs);
+			goto ldst_done;
+
+		case 972:	/* stxvd2x */
+		case 1004:	/* stxvd2ux */
+			if (!(regs->msr & MSR_VSX))
+				break;
+			rd |= (instr & 1) << 5;
+			ea = xform_ea(instr, regs, u);
+			err = do_vsx_store(rd, do_stxvd2x, ea, regs);
+			goto ldst_done;
+
+#endif /* CONFIG_VSX */
+		}
+		break;
+
+	case 32:	/* lwz */
+	case 33:	/* lwzu */
+		err = read_mem(&regs->gpr[rd], dform_ea(instr, regs), 4, regs);
+		goto ldst_done;
+
+	case 34:	/* lbz */
+	case 35:	/* lbzu */
+		err = read_mem(&regs->gpr[rd], dform_ea(instr, regs), 1, regs);
+		goto ldst_done;
+
+	case 36:	/* stw */
+		val = regs->gpr[rd];
+		err = write_mem(val, dform_ea(instr, regs), 4, regs);
+		goto ldst_done;
+
+	case 37:	/* stwu */
+		val = regs->gpr[rd];
+		val3 = dform_ea(instr, regs);
+		/*
+		 * For PPC32 we always use stwu to change stack point with r1. So
+		 * this emulated store may corrupt the exception frame, now we
+		 * have to provide the exception frame trampoline, which is pushed
+		 * below the kprobed function stack. So we only update gpr[1] but
+		 * don't emulate the real store operation. We will do real store
+		 * operation safely in exception return code by checking this flag.
+		 */
+		if ((ra == 1) && !(regs->msr & MSR_PR) \
+			&& (val3 >= (regs->gpr[1] - STACK_INT_FRAME_SIZE))) {
+#ifdef CONFIG_PPC32
+			/*
+			 * Check if we will touch kernel sack overflow
+			 */
+			if (val3 - STACK_INT_FRAME_SIZE <= current->thread.ksp_limit) {
+				printk(KERN_CRIT "Can't kprobe this since Kernel stack overflow.\n");
+				err = -EINVAL;
+				break;
+			}
+#endif /* CONFIG_PPC32 */
+			/*
+			 * Check if we already set since that means we'll
+			 * lose the previous value.
+			 */
+			WARN_ON(test_thread_flag(TIF_EMULATE_STACK_STORE));
+			set_thread_flag(TIF_EMULATE_STACK_STORE);
+			err = 0;
+		} else
+			err = write_mem(val, val3, 4, regs);
+		goto ldst_done;
+
+	case 38:	/* stb */
+	case 39:	/* stbu */
+		val = regs->gpr[rd];
+		err = write_mem(val, dform_ea(instr, regs), 1, regs);
+		goto ldst_done;
+
+	case 40:	/* lhz */
+	case 41:	/* lhzu */
+		err = read_mem(&regs->gpr[rd], dform_ea(instr, regs), 2, regs);
+		goto ldst_done;
+
+	case 42:	/* lha */
+	case 43:	/* lhau */
+		err = read_mem(&regs->gpr[rd], dform_ea(instr, regs), 2, regs);
+		if (!err)
+			regs->gpr[rd] = (signed short) regs->gpr[rd];
+		goto ldst_done;
+
+	case 44:	/* sth */
+	case 45:	/* sthu */
+		val = regs->gpr[rd];
+		err = write_mem(val, dform_ea(instr, regs), 2, regs);
+		goto ldst_done;
+
+	case 46:	/* lmw */
+		ra = (instr >> 16) & 0x1f;
+		if (ra >= rd)
+			break;		/* invalid form, ra in range to load */
+		ea = dform_ea(instr, regs);
+		do {
+			err = read_mem(&regs->gpr[rd], ea, 4, regs);
+			if (err)
+				return 0;
+			ea += 4;
+		} while (++rd < 32);
+		goto instr_done;
+
+	case 47:	/* stmw */
+		ea = dform_ea(instr, regs);
+		do {
+			err = write_mem(regs->gpr[rd], ea, 4, regs);
+			if (err)
+				return 0;
+			ea += 4;
+		} while (++rd < 32);
+		goto instr_done;
+
+#ifdef CONFIG_PPC_FPU
+	case 48:	/* lfs */
+	case 49:	/* lfsu */
+		if (!(regs->msr & MSR_FP))
+			break;
+		ea = dform_ea(instr, regs);
+		err = do_fp_load(rd, do_lfs, ea, 4, regs);
+		goto ldst_done;
+
+	case 50:	/* lfd */
+	case 51:	/* lfdu */
+		if (!(regs->msr & MSR_FP))
+			break;
+		ea = dform_ea(instr, regs);
+		err = do_fp_load(rd, do_lfd, ea, 8, regs);
+		goto ldst_done;
+
+	case 52:	/* stfs */
+	case 53:	/* stfsu */
+		if (!(regs->msr & MSR_FP))
+			break;
+		ea = dform_ea(instr, regs);
+		err = do_fp_store(rd, do_stfs, ea, 4, regs);
+		goto ldst_done;
+
+	case 54:	/* stfd */
+	case 55:	/* stfdu */
+		if (!(regs->msr & MSR_FP))
+			break;
+		ea = dform_ea(instr, regs);
+		err = do_fp_store(rd, do_stfd, ea, 8, regs);
+		goto ldst_done;
+#endif
+
+#ifdef __powerpc64__
+	case 58:	/* ld[u], lwa */
+		switch (instr & 3) {
+		case 0:		/* ld */
+			err = read_mem(&regs->gpr[rd], dsform_ea(instr, regs),
+				       8, regs);
+			goto ldst_done;
+		case 1:		/* ldu */
+			err = read_mem(&regs->gpr[rd], dsform_ea(instr, regs),
+				       8, regs);
+			goto ldst_done;
+		case 2:		/* lwa */
+			err = read_mem(&regs->gpr[rd], dsform_ea(instr, regs),
+				       4, regs);
+			if (!err)
+				regs->gpr[rd] = (signed int) regs->gpr[rd];
+			goto ldst_done;
+		}
+		break;
+
+	case 62:	/* std[u] */
+		val = regs->gpr[rd];
+		switch (instr & 3) {
+		case 0:		/* std */
+			err = write_mem(val, dsform_ea(instr, regs), 8, regs);
+			goto ldst_done;
+		case 1:		/* stdu */
+			err = write_mem(val, dsform_ea(instr, regs), 8, regs);
+			goto ldst_done;
+		}
+		break;
+#endif /* __powerpc64__ */
+
+	}
+	err = -EINVAL;
+
+ ldst_done:
+	if (err) {
+		regs->gpr[ra] = old_ra;
+		return 0;	/* invoke DSI if -EFAULT? */
+	}
+ instr_done:
+	regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
+	return 1;
+
+ logical_done:
+	if (instr & 1)
+		set_cr0(regs, ra);
+	goto instr_done;
+
+ arith_done:
+	if (instr & 1)
+		set_cr0(regs, rd);
+	goto instr_done;
 }
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index c4c622d8e6a..1b5a0a09d60 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -13,13 +13,7 @@
 #include <asm/ppc_asm.h>
 
 	.section __ex_table,"a"
-#ifdef CONFIG_PPC64
-	.align	3
-#define EXTBL	.llong
-#else
-	.align	2
-#define EXTBL	.long
-#endif
+	PPC_LONG_ALIGN
 	.text
 	
 _GLOBAL(strcpy)
@@ -34,7 +28,7 @@ _GLOBAL(strcpy)
 /* This clears out any unused part of the destination buffer,
    just as the libc version does.  -- paulus */
 _GLOBAL(strncpy)
-	cmpwi	0,r5,0
+	PPC_LCMPI 0,r5,0
 	beqlr
 	mtctr	r5
 	addi	r6,r3,-1
@@ -45,7 +39,7 @@ _GLOBAL(strncpy)
 	bdnzf	2,1b		/* dec ctr, branch if ctr != 0 && !cr0.eq */
 	bnelr			/* if we didn't hit a null char, we're done */
 	mfctr	r5
-	cmpwi	0,r5,0		/* any space left in destination buffer? */
+	PPC_LCMPI 0,r5,0	/* any space left in destination buffer? */
 	beqlr			/* we know r0 == 0 here */
 2:	stbu	r0,1(r6)	/* clear it out if so */
 	bdnz	2b
@@ -75,6 +69,22 @@ _GLOBAL(strcmp)
 	beq	1b
 	blr
 
+_GLOBAL(strncmp)
+	PPC_LCMPI 0,r5,0
+	beq-	2f
+	mtctr	r5
+	addi	r5,r3,-1
+	addi	r4,r4,-1
+1:	lbzu	r3,1(r5)
+	cmpwi	1,r3,0
+	lbzu	r0,1(r4)
+	subf.	r3,r0,r3
+	beqlr	1
+	bdnzt	eq,1b
+	blr
+2:	li	r3,0
+	blr
+
 _GLOBAL(strlen)
 	addi	r4,r3,-1
 1:	lbzu	r0,1(r4)
@@ -84,8 +94,8 @@ _GLOBAL(strlen)
 	blr
 
 _GLOBAL(memcmp)
-	cmpwi	0,r5,0
-	ble-	2f
+	PPC_LCMPI 0,r5,0
+	beq-	2f
 	mtctr	r5
 	addi	r6,r3,-1
 	addi	r4,r4,-1
@@ -98,8 +108,8 @@ _GLOBAL(memcmp)
 	blr
 
 _GLOBAL(memchr)
-	cmpwi	0,r5,0
-	ble-	2f
+	PPC_LCMPI 0,r5,0
+	beq-	2f
 	mtctr	r5
 	addi	r3,r3,-1
 1:	lbzu	r0,1(r3)
@@ -109,6 +119,7 @@ _GLOBAL(memchr)
 2:	li	r3,0
 	blr
 
+#ifdef CONFIG_PPC32
 _GLOBAL(__clear_user)
 	addi	r6,r3,-4
 	li	r3,0
@@ -146,52 +157,8 @@ _GLOBAL(__clear_user)
 	blr
 
 	.section __ex_table,"a"
-	EXTBL	11b,90b
-	EXTBL	1b,91b
-	EXTBL	8b,92b
+	PPC_LONG	11b,90b
+	PPC_LONG	1b,91b
+	PPC_LONG	8b,92b
 	.text
-
-_GLOBAL(__strncpy_from_user)
-	addi	r6,r3,-1
-	addi	r4,r4,-1
-	cmpwi	0,r5,0
-	beq	2f
-	mtctr	r5
-1:	lbzu	r0,1(r4)
-	cmpwi	0,r0,0
-	stbu	r0,1(r6)
-	bdnzf	2,1b		/* dec ctr, branch if ctr != 0 && !cr0.eq */
-	beq	3f
-2:	addi	r6,r6,1
-3:	subf	r3,r3,r6
-	blr
-99:	li	r3,-EFAULT
-	blr
-
-	.section __ex_table,"a"
-	EXTBL	1b,99b
-	.text
-
-/* r3 = str, r4 = len (> 0), r5 = top (highest addr) */
-_GLOBAL(__strnlen_user)
-	addi	r7,r3,-1
-	subf	r6,r7,r5	/* top+1 - str */
-	cmplw	0,r4,r6
-	bge	0f
-	mr	r6,r4
-0:	mtctr	r6		/* ctr = min(len, top - str) */
-1:	lbzu	r0,1(r7)	/* get next byte */
-	cmpwi	0,r0,0
-	bdnzf	2,1b		/* loop if --ctr != 0 && byte != 0 */
-	addi	r7,r7,1
-	subf	r3,r3,r7	/* number of bytes we have looked at */
-	beqlr			/* return if we found a 0 byte */
-	cmpw	0,r3,r4		/* did we look at all len bytes? */
-	blt	99f		/* if not, must have hit top */
-	addi	r3,r4,1		/* return len + 1 to indicate no null found */
-	blr
-99:	li	r3,0		/* bad address, return 0 */
-	blr
-
-	.section __ex_table,"a"
-	EXTBL	1b,99b
+#endif
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S
new file mode 100644
index 00000000000..7bd9549a90a
--- /dev/null
+++ b/arch/powerpc/lib/string_64.S
@@ -0,0 +1,202 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+	.section	".toc","aw"
+PPC64_CACHES:
+	.tc		ppc64_caches[TC],ppc64_caches
+	.section	".text"
+
+/**
+ * __clear_user: - Zero a block of memory in user space, with less checking.
+ * @to:   Destination address, in user space.
+ * @n:    Number of bytes to zero.
+ *
+ * Zero a block of memory in user space.  Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+
+	.macro err1
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Ldo_err1
+	.previous
+	.endm
+
+	.macro err2
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldo_err2
+	.previous
+	.endm
+
+	.macro err3
+300:
+	.section __ex_table,"a"
+	.align 3
+	.llong 300b,.Ldo_err3
+	.previous
+	.endm
+
+.Ldo_err1:
+	mr	r3,r8
+
+.Ldo_err2:
+	mtctr	r4
+1:
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+	addi	r4,r4,-1
+	bdnz	1b
+
+.Ldo_err3:
+	mr	r3,r4
+	blr
+
+_GLOBAL_TOC(__clear_user)
+	cmpdi	r4,32
+	neg	r6,r3
+	li	r0,0
+	blt	.Lshort_clear
+	mr	r8,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	/* Get the destination 8 byte aligned */
+	bf	cr7*4+3,1f
+err1;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r4,r4,r6
+
+	cmpdi	r4,32
+	cmpdi	cr1,r4,512
+	blt	.Lshort_clear
+	bgt	cr1,.Llong_clear
+
+.Lmedium_clear:
+	srdi	r6,r4,5
+	mtctr	r6
+
+	/* Do 32 byte chunks */
+4:
+err2;	std	r0,0(r3)
+err2;	std	r0,8(r3)
+err2;	std	r0,16(r3)
+err2;	std	r0,24(r3)
+	addi	r3,r3,32
+	addi	r4,r4,-32
+	bdnz	4b
+
+.Lshort_clear:
+	/* up to 31 bytes to go */
+	cmpdi	r4,16
+	blt	6f
+err2;	std	r0,0(r3)
+err2;	std	r0,8(r3)
+	addi	r3,r3,16
+	addi	r4,r4,-16
+
+	/* Up to 15 bytes to go */
+6:	mr	r8,r3
+	clrldi	r4,r4,(64-4)
+	mtocrf	0x01,r4
+	bf	cr7*4+0,7f
+err1;	std	r0,0(r3)
+	addi	r3,r3,8
+
+7:	bf	cr7*4+1,8f
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+8:	bf	cr7*4+2,9f
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+9:	bf	cr7*4+3,10f
+err1;	stb	r0,0(r3)
+
+10:	li	r3,0
+	blr
+
+.Llong_clear:
+	ld	r5,PPC64_CACHES@toc(r2)
+
+	bf	cr7*4+0,11f
+err2;	std	r0,0(r3)
+	addi	r3,r3,8
+	addi	r4,r4,-8
+
+	/* Destination is 16 byte aligned, need to get it cacheline aligned */
+11:	lwz	r7,DCACHEL1LOGLINESIZE(r5)
+	lwz	r9,DCACHEL1LINESIZE(r5)
+
+	/*
+	 * With worst case alignment the long clear loop takes a minimum
+	 * of 1 byte less than 2 cachelines.
+	 */
+	sldi	r10,r9,2
+	cmpd	r4,r10
+	blt	.Lmedium_clear
+
+	neg	r6,r3
+	addi	r10,r9,-1
+	and.	r5,r6,r10
+	beq	13f
+
+	srdi	r6,r5,4
+	mtctr	r6
+	mr	r8,r3
+12:
+err1;	std	r0,0(r3)
+err1;	std	r0,8(r3)
+	addi	r3,r3,16
+	bdnz	12b
+
+	sub	r4,r4,r5
+
+13:	srd	r6,r4,r7
+	mtctr	r6
+	mr	r8,r3
+14:
+err1;	dcbz	r0,r3
+	add	r3,r3,r9
+	bdnz	14b
+
+	and	r4,r4,r10
+
+	cmpdi	r4,32
+	blt	.Lshort_clear
+	b	.Lmedium_clear
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
new file mode 100644
index 00000000000..3cf529ceec5
--- /dev/null
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -0,0 +1,74 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2011
+ *
+ * Authors: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
+ *          Anton Blanchard <anton@au.ibm.com>
+ */
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <asm/switch_to.h>
+
+int enter_vmx_usercopy(void)
+{
+	if (in_interrupt())
+		return 0;
+
+	/* This acts as preempt_disable() as well and will make
+	 * enable_kernel_altivec(). We need to disable page faults
+	 * as they can call schedule and thus make us lose the VMX
+	 * context. So on page faults, we just fail which will cause
+	 * a fallback to the normal non-vmx copy.
+	 */
+	pagefault_disable();
+
+	enable_kernel_altivec();
+
+	return 1;
+}
+
+/*
+ * This function must return 0 because we tail call optimise when calling
+ * from __copy_tofrom_user_power7 which returns 0 on success.
+ */
+int exit_vmx_usercopy(void)
+{
+	pagefault_enable();
+	return 0;
+}
+
+int enter_vmx_copy(void)
+{
+	if (in_interrupt())
+		return 0;
+
+	preempt_disable();
+
+	enable_kernel_altivec();
+
+	return 1;
+}
+
+/*
+ * All calls to this function will be optimised into tail calls. We are
+ * passed a pointer to the destination which we return as required by a
+ * memcpy implementation.
+ */
+void *exit_vmx_copy(void *dest)
+{
+	preempt_enable();
+	return dest;
+}
diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c
new file mode 100644
index 00000000000..e905f7c2ea7
--- /dev/null
+++ b/arch/powerpc/lib/xor_vmx.c
@@ -0,0 +1,177 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <altivec.h>
+
+#include <linux/preempt.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <asm/switch_to.h>
+
+typedef vector signed char unative_t;
+
+#define DEFINE(V)				\
+	unative_t *V = (unative_t *)V##_in;	\
+	unative_t V##_0, V##_1, V##_2, V##_3
+
+#define LOAD(V)			\
+	do {			\
+		V##_0 = V[0];	\
+		V##_1 = V[1];	\
+		V##_2 = V[2];	\
+		V##_3 = V[3];	\
+	} while (0)
+
+#define STORE(V)		\
+	do {			\
+		V[0] = V##_0;	\
+		V[1] = V##_1;	\
+		V[2] = V##_2;	\
+		V[3] = V##_3;	\
+	} while (0)
+
+#define XOR(V1, V2)					\
+	do {						\
+		V1##_0 = vec_xor(V1##_0, V2##_0);	\
+		V1##_1 = vec_xor(V1##_1, V2##_1);	\
+		V1##_2 = vec_xor(V1##_2, V2##_2);	\
+		V1##_3 = vec_xor(V1##_3, V2##_3);	\
+	} while (0)
+
+void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		XOR(v1, v2);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_2);
+
+void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		XOR(v1, v2);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_3);
+
+void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in,
+		   unsigned long *v4_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	DEFINE(v4);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		LOAD(v4);
+		XOR(v1, v2);
+		XOR(v3, v4);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+		v4 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_4);
+
+void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
+		   unsigned long *v2_in, unsigned long *v3_in,
+		   unsigned long *v4_in, unsigned long *v5_in)
+{
+	DEFINE(v1);
+	DEFINE(v2);
+	DEFINE(v3);
+	DEFINE(v4);
+	DEFINE(v5);
+	unsigned long lines = bytes / (sizeof(unative_t)) / 4;
+
+	preempt_disable();
+	enable_kernel_altivec();
+
+	do {
+		LOAD(v1);
+		LOAD(v2);
+		LOAD(v3);
+		LOAD(v4);
+		LOAD(v5);
+		XOR(v1, v2);
+		XOR(v3, v4);
+		XOR(v1, v5);
+		XOR(v1, v3);
+		STORE(v1);
+
+		v1 += 4;
+		v2 += 4;
+		v3 += 4;
+		v4 += 4;
+		v5 += 4;
+	} while (--lines > 0);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(xor_altivec_5);