49 files changed, 1963 insertions, 135 deletions
diff --git a/test/CodeGen/ARM/call-noret-minsize.ll b/test/CodeGen/ARM/call-noret-minsize.ll
new file mode 100644
index 0000000000..35490ac69b
--- /dev/null
+++ b/test/CodeGen/ARM/call-noret-minsize.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8   | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift       | FileCheck %s -check-prefix=SWIFT
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=T2
+; rdar://12348580
+
+define void @t1() noreturn minsize nounwind ssp {
+entry:
+; ARM: t1:
+; ARM: bl _bar
+
+; SWIFT: t1:
+; SWIFT: bl _bar
+
+; T2: t1:
+; T2: blx _bar
+  tail call void @bar() noreturn nounwind
+  unreachable
+}
+
+define void @t2() noreturn minsize nounwind ssp {
+entry:
+; ARM: t2:
+; ARM: bl _t1
+
+; SWIFT: t2:
+; SWIFT: bl _t1
+
+; T2: t2:
+; T2: bl _t1
+  tail call void @t1() noreturn nounwind
+  unreachable
+}
+
+declare void @bar() noreturn
diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll
index 238ba24a79..14511ad5ce 100644
--- a/test/CodeGen/ARM/coalesce-subregs.ll
+++ b/test/CodeGen/ARM/coalesce-subregs.ll
@@ -289,3 +289,31 @@ bb:
   %tmp18 = insertvalue %struct.wombat.5 %tmp17, <4 x float> undef, 3, 0
   ret %struct.wombat.5 %tmp18
 }
+
+; CHECK: adjustCopiesBackFrom
+; The shuffle in if.else3 must be preserved even though adjustCopiesBackFrom
+; is tempted to remove it.
+; CHECK: %if.else3
+; CHECK: vorr d
+define internal void @adjustCopiesBackFrom(<2 x i64>* noalias nocapture sret %agg.result, <2 x i64> %in) {
+entry:
+  %0 = extractelement <2 x i64> %in, i32 0
+  %cmp = icmp slt i64 %0, 1
+  %.in = select i1 %cmp, <2 x i64> <i64 0, i64 undef>, <2 x i64> %in
+  %1 = extractelement <2 x i64> %in, i32 1
+  %cmp1 = icmp slt i64 %1, 1
+  br i1 %cmp1, label %if.then2, label %if.else3
+
+if.then2:                                         ; preds = %entry
+  %2 = insertelement <2 x i64> %.in, i64 0, i32 1
+  br label %if.end4
+
+if.else3:                                         ; preds = %entry
+  %3 = shufflevector <2 x i64> %.in, <2 x i64> %in, <2 x i32> <i32 0, i32 3>
+  br label %if.end4
+
+if.end4:                                          ; preds = %if.else3, %if.then2
+  %result.2 = phi <2 x i64> [ %2, %if.then2 ], [ %3, %if.else3 ]
+  store <2 x i64> %result.2, <2 x i64>* %agg.result, align 128
+  ret void
+}
diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll
index 4f2d7e3f73..1d72afefb5 100644
--- a/test/CodeGen/ARM/integer_insertelement.ll
+++ b/test/CodeGen/ARM/integer_insertelement.ll
@@ -6,7 +6,7 @@
 
 ; CHECK: @f
 ; CHECK-NOT: vorr d
-; CHECK: vmov s
+; CHECK: vmov.32 d
 ; CHECK-NOT: vorr d
 ; CHECK: mov pc, lr
 define <4 x i32> @f(<4 x i32> %in) {
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index e224bdfe25..f404eb8be5 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -74,6 +74,39 @@ define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 	ret <16 x i8> %tmp3
 }
 
+define <16 x i8> @test_vextq_undef_op2(<16 x i8> %a) nounwind {
+;CHECK: test_vextq_undef_op2:
+;CHECK: vext
+entry:
+  %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
+  ret <16 x i8> %tmp1
+}
+
+define <8 x i8> @test_vextd_undef_op2(<8 x i8> %a) nounwind {
+;CHECK: test_vextd_undef_op2:
+;CHECK: vext
+entry:
+  %tmp1 = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
+  ret <8 x i8> %tmp1
+}
+
+
+define <16 x i8> @test_vextq_undef_op2_undef(<16 x i8> %a) nounwind {
+;CHECK: test_vextq_undef_op2_undef:
+;CHECK: vext
+entry:
+  %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
+  ret <16 x i8> %tmp1
+}
+
+define <8 x i8> @test_vextd_undef_op2_undef(<8 x i8> %a) nounwind {
+;CHECK: test_vextd_undef_op2_undef:
+;CHECK: vext
+entry:
+  %tmp1 = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 1>
+  ret <8 x i8> %tmp1
+}
+
 ; Tests for ReconstructShuffle function. Indices have to be carefully
 ; chosen to reach lowering phase as a BUILD_VECTOR.
 
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
index 2ed65c9aee..c9ce3b7450 100644
--- a/test/CodeGen/ARM/vget_lane.ll
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -200,7 +200,7 @@ define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
 
 define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
 ;CHECK: vsetQ_lane32:
-;CHECK: vmov s
+;CHECK: vmov.32 d{{.*}}[1], r1
 	%tmp1 = load <4 x i32>* %A
 	%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
 	ret <4 x i32> %tmp2
diff --git a/test/CodeGen/Mips/alloca16.ll b/test/CodeGen/Mips/alloca16.ll
new file mode 100644
index 0000000000..731edae43c
--- /dev/null
+++ b/test/CodeGen/Mips/alloca16.ll
@@ -0,0 +1,75 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 25, align 4
+@jjjj = global i32 35, align 4
+@kkkk = global i32 100, align 4
+@t = global i32 25, align 4
+@riii = common global i32 0, align 4
+@rjjj = common global i32 0, align 4
+@rkkk = common global i32 0, align 4
+
+define void @temp(i32 %foo) nounwind {
+entry:
+  %foo.addr = alloca i32, align 4
+  store i32 %foo, i32* %foo.addr, align 4
+  %0 = load i32* %foo.addr, align 4
+  store i32 %0, i32* @t, align 4
+  ret void
+}
+
+define void @test() nounwind {
+entry:
+; 16: 	.frame	$16,24,$ra
+; 16: 	save 	$ra, $s0, $s1, 24
+; 16: 	move	$16, $sp
+; 16:	move	${{[0-9]+}}, $sp
+; 16:	subu	$[[REGISTER:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	$sp, $[[REGISTER]]
+  %sssi = alloca i32, align 4
+  %ip = alloca i32*, align 4
+  %sssj = alloca i32, align 4
+  %0 = load i32* @iiii, align 4
+  store i32 %0, i32* %sssi, align 4
+  %1 = load i32* @kkkk, align 4
+  %mul = mul nsw i32 %1, 100
+  %2 = alloca i8, i32 %mul
+  %3 = bitcast i8* %2 to i32*
+  store i32* %3, i32** %ip, align 4
+  %4 = load i32* @jjjj, align 4
+  store i32 %4, i32* %sssj, align 4
+  %5 = load i32* @jjjj, align 4
+  %6 = load i32* @iiii, align 4
+  %7 = load i32** %ip, align 4
+  %arrayidx = getelementptr inbounds i32* %7, i32 %6
+  store i32 %5, i32* %arrayidx, align 4
+  %8 = load i32* @kkkk, align 4
+  %9 = load i32* @jjjj, align 4
+  %10 = load i32** %ip, align 4
+  %arrayidx1 = getelementptr inbounds i32* %10, i32 %9
+  store i32 %8, i32* %arrayidx1, align 4
+  %11 = load i32* @iiii, align 4
+  %12 = load i32* @kkkk, align 4
+  %13 = load i32** %ip, align 4
+  %arrayidx2 = getelementptr inbounds i32* %13, i32 %12
+  store i32 %11, i32* %arrayidx2, align 4
+  %14 = load i32** %ip, align 4
+  %arrayidx3 = getelementptr inbounds i32* %14, i32 25
+  %15 = load i32* %arrayidx3, align 4
+  store i32 %15, i32* @riii, align 4
+  %16 = load i32** %ip, align 4
+  %arrayidx4 = getelementptr inbounds i32* %16, i32 35
+  %17 = load i32* %arrayidx4, align 4
+  store i32 %17, i32* @rjjj, align 4
+  %18 = load i32** %ip, align 4
+  %arrayidx5 = getelementptr inbounds i32* %18, i32 100
+  %19 = load i32* %arrayidx5, align 4
+  store i32 %19, i32* @rkkk, align 4
+  %20 = load i32* @t, align 4
+  %21 = load i32** %ip, align 4
+  %arrayidx6 = getelementptr inbounds i32* %21, i32 %20
+  %22 = load i32* %arrayidx6, align 4
+; 16: 	save	16
+  call void @temp(i32 %22)
+; 16: 	restore	16
+  ret void
+}
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 050689dcea..819f258c2a 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel --disable-machine-licm < %s | FileCheck %s
 
 @x = common global i32 0, align 4
 
@@ -181,8 +181,9 @@ entry:
 
 ; CHECK:   $[[BB0:[A-Z_0-9]+]]:
 ; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   and     $[[R18:[0-9]+]], $[[R9]], $[[R6]]
 ; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R9]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R18]]
 ; CHECK:   sc      $[[R14]], 0($[[R2]])
 ; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
 
diff --git a/test/CodeGen/Mips/atomicops.ll b/test/CodeGen/Mips/atomicops.ll
new file mode 100644
index 0000000000..b9c3804e0d
--- /dev/null
+++ b/test/CodeGen/Mips/atomicops.ll
@@ -0,0 +1,40 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@.str = private unnamed_addr constant [8 x i8] c"%d, %d\0A\00", align 1
+
+define i32 @foo(i32* %mem, i32 %val, i32 %c) nounwind {
+entry:
+  %0 = atomicrmw add i32* %mem, i32 %val seq_cst
+  %add = add nsw i32 %0, %c
+  ret i32 %add
+; 16: foo:
+; 16:	lw	${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}})
+; 16: 	lw	${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}})
+}
+
+define i32 @main() nounwind {
+entry:
+  %x = alloca i32, align 4
+  store volatile i32 0, i32* %x, align 4
+  %0 = atomicrmw add i32* %x, i32 1 seq_cst
+  %add.i = add nsw i32 %0, 2
+  %1 = load volatile i32* %x, align 4
+  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind
+  %2 = cmpxchg i32* %x, i32 1, i32 2 seq_cst
+  %3 = load volatile i32* %x, align 4
+  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind
+  %4 = atomicrmw xchg i32* %x, i32 1 seq_cst
+  %5 = load volatile i32* %x, align 4
+  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %4, i32 %5) nounwind
+; 16: main:
+; 16:	lw	${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}})
+; 16: 	lw	${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}})
+; 16:	lw	${{[0-9]+}}, %call16(__sync_val_compare_and_swap_4)(${{[0-9]+}})
+; 16:	lw	${{[0-9]+}}, %call16(__sync_lock_test_and_set_4)(${{[0-9]+}})
+
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+
diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll
index 1b2fbc8932..2fdb736dc8 100644
--- a/test/CodeGen/Mips/brdelayslot.ll
+++ b/test/CodeGen/Mips/brdelayslot.ll
@@ -19,3 +19,19 @@ entry:
 }
 
 declare void @foo2(i32)
+
+; Check that cvt.d.w goes into jalr's delay slot.
+;
+define void @foo3(i32 %a) nounwind {
+entry:
+; Default:     foo3:
+; Default:     jalr
+; Default:     cvt.d.w
+
+  %conv = sitofp i32 %a to double
+  tail call void @foo4(double %conv) nounwind
+  ret void
+}
+
+declare void @foo4(double)
+
diff --git a/test/CodeGen/Mips/brind.ll b/test/CodeGen/Mips/brind.ll
new file mode 100644
index 0000000000..4c591fa1bb
--- /dev/null
+++ b/test/CodeGen/Mips/brind.ll
@@ -0,0 +1,40 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@main.L = internal unnamed_addr constant [5 x i8*] [i8* blockaddress(@main, %L1), i8* blockaddress(@main, %L2), i8* blockaddress(@main, %L3), i8* blockaddress(@main, %L4), i8* null], align 4
+@str = private unnamed_addr constant [2 x i8] c"A\00"
+@str5 = private unnamed_addr constant [2 x i8] c"B\00"
+@str6 = private unnamed_addr constant [2 x i8] c"C\00"
+@str7 = private unnamed_addr constant [2 x i8] c"D\00"
+@str8 = private unnamed_addr constant [2 x i8] c"E\00"
+
+define i32 @main() nounwind {
+entry:
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str, i32 0, i32 0))
+  br label %L1
+
+L1:                                               ; preds = %entry, %L3
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %L3 ]
+  %puts5 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str5, i32 0, i32 0))
+  br label %L2
+
+L2:                                               ; preds = %L1, %L3
+  %i.1 = phi i32 [ %i.0, %L1 ], [ %inc, %L3 ]
+  %puts6 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str6, i32 0, i32 0))
+  br label %L3
+
+L3:                                               ; preds = %L2, %L3
+  %i.2 = phi i32 [ %i.1, %L2 ], [ %inc, %L3 ]
+  %puts7 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str7, i32 0, i32 0))
+  %inc = add i32 %i.2, 1
+  %arrayidx = getelementptr inbounds [5 x i8*]* @main.L, i32 0, i32 %i.2
+  %0 = load i8** %arrayidx, align 4
+  indirectbr i8* %0, [label %L1, label %L2, label %L3, label %L4]
+; 16: 	jrc	 ${{[0-9]+}}
+L4:                                               ; preds = %L3
+  %puts8 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str8, i32 0, i32 0))
+  ret i32 0
+}
+
+declare i32 @puts(i8* nocapture) nounwind
+
+
diff --git a/test/CodeGen/Mips/check-noat.ll b/test/CodeGen/Mips/check-noat.ll
new file mode 100644
index 0000000000..bfeff677b3
--- /dev/null
+++ b/test/CodeGen/Mips/check-noat.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s 
+
+define void @f() nounwind readnone {
+entry:
+; CHECK: f:
+; CHECK: .set  noat
+; CHECK: .set  at
+
+  ret void
+}
+
diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll
index bee93accd4..aee58b650e 100644
--- a/test/CodeGen/Mips/helloworld.ll
+++ b/test/CodeGen/Mips/helloworld.ll
@@ -24,10 +24,10 @@ entry:
 ; C1:	addiu	${{[0-9]+}}, %lo($.str)
 ; C2:	move	$25, ${{[0-9]+}}
 ; C1:	move 	$gp, ${{[0-9]+}}
-; C1:	jalr 	${{[0-9]+}}
+; C1:	jalrc 	${{[0-9]+}}
 ; SR:	restore 	$ra, [[FS]]
 ; PE:	li	$2, 0
-; PE:	jr 	$ra
+; PE:	jrc 	$ra
 
 }
 
diff --git a/test/CodeGen/Mips/i32k.ll b/test/CodeGen/Mips/i32k.ll
new file mode 100644
index 0000000000..c6da8b1ac9
--- /dev/null
+++ b/test/CodeGen/Mips/i32k.ll
@@ -0,0 +1,17 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16a
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16b
+
+@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 1075344593) nounwind
+; 16a:	li	${{[0-9]+}}, 29905
+; 16b:	li	${{[0-9]+}}, 16408
+  %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 -1075344593) nounwind
+; 16a:	li	${{[0-9]+}}, 49127
+; 16b:	li	${{[0-9]+}}, 35631
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/Mips/largeimm1.ll b/test/CodeGen/Mips/largeimm1.ll
index d65cc025d0..1c0f69c590 100644
--- a/test/CodeGen/Mips/largeimm1.ll
+++ b/test/CodeGen/Mips/largeimm1.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=mipsel < %s | FileCheck %s
 
-; CHECK: lui $at, 49152
-; CHECK: lui $at, 16384
+; CHECK: lui ${{[0-9]+}}, 49152
+; CHECK: lui ${{[0-9]+}}, 16384
 define void @f() nounwind {
 entry:
   %a1 = alloca [1073741824 x i8], align 1
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index 2e548790cd..1e96346d1d 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | \
+; RUN: FileCheck %s -check-prefix=64
 
 %struct.S1 = type { [65536 x i8] }
 
@@ -6,9 +8,21 @@
 
 define void @f() nounwind {
 entry:
-; CHECK:  lui $at, 65535
-; CHECK:  addiu $at, $at, -16
-; CHECK:  addu  $sp, $sp, $at
+; 32:  lui $[[R0:[0-9]+]], 65535
+; 32:  addiu $[[R0]], $[[R0]], -24
+; 32:  addu $sp, $sp, $[[R0]]
+; 32:  lui $[[R1:[0-9]+]], 1
+; 32:  addu $[[R1]], $sp, $[[R1]]
+; 32:  sw $ra, 20($[[R1]])
+; 64:  daddiu  $[[R0:[0-9]+]], $zero, 1
+; 64:  dsll  $[[R0]], $[[R0]], 48
+; 64:  daddiu  $[[R0]], $[[R0]], -1
+; 64:  dsll  $[[R0]], $[[R0]], 16
+; 64:  daddiu  $[[R0]], $[[R0]], -48
+; 64:  daddu $sp, $sp, $[[R0]]
+; 64:  lui $[[R1:[0-9]+]], 1
+; 64:  daddu $[[R1]], $sp, $[[R1]]
+; 64:  sd  $ra, 40($[[R1]])
 
   %agg.tmp = alloca %struct.S1, align 1
   %tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/Mips/llcarry.ll b/test/CodeGen/Mips/llcarry.ll
new file mode 100644
index 0000000000..7763daec3b
--- /dev/null
+++ b/test/CodeGen/Mips/llcarry.ll
@@ -0,0 +1,51 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i64 4294967295, align 8
+@j = global i64 15, align 8
+@ii = global i64 4294967295, align 8
+@k = common global i64 0, align 8
+@l = common global i64 0, align 8
+@m = common global i64 0, align 8
+
+define void @test1() nounwind {
+entry:
+  %0 = load i64* @i, align 8
+  %1 = load i64* @j, align 8
+  %add = add nsw i64 %1, %0
+  store i64 %add, i64* @k, align 8
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	${{[0-9]+}}, $t8
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+  ret void
+}
+
+define void @test2() nounwind {
+entry:
+  %0 = load i64* @i, align 8
+  %1 = load i64* @j, align 8
+  %sub = sub nsw i64 %0, %1
+; 16:	subu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	${{[0-9]+}}, $t8
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	subu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+  store i64 %sub, i64* @l, align 8
+  ret void
+}
+
+define void @test3() nounwind {
+entry:
+  %0 = load i64* @ii, align 8
+  %add = add nsw i64 %0, 15
+; 16:	addiu	${{[0-9]+}}, 15
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	move	${{[0-9]+}}, $t8
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16:	addu	${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+  store i64 %add, i64* @m, align 8
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 873b9f1410..1a4f79c191 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -6,15 +6,15 @@
 define void @foo1(i32 %s) nounwind {
 entry:
 ; O32: bal
-; O32: lui $at, 0
-; O32: addiu $at, $at, {{[0-9]+}} 
-; N64: lui $at, 0
-; N64: daddiu $at, $at, 0
-; N64: dsll $at, $at, 16
-; N64: daddiu $at, $at, 0
+; O32: lui $1, 0
+; O32: addiu $1, $1, {{[0-9]+}} 
+; N64: lui $1, 0
+; N64: daddiu $1, $1, 0
+; N64: dsll $1, $1, 16
+; N64: daddiu $1, $1, 0
 ; N64: bal
-; N64: dsll $at, $at, 16
-; N64: daddiu $at, $at, {{[0-9]+}}  
+; N64: dsll $1, $1, 16
+; N64: daddiu $1, $1, {{[0-9]+}}  
 
   %tobool = icmp eq i32 %s, 0
   br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/Mips/misha.ll b/test/CodeGen/Mips/misha.ll
new file mode 100644
index 0000000000..80637edb16
--- /dev/null
+++ b/test/CodeGen/Mips/misha.ll
@@ -0,0 +1,69 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+define i32 @sumc(i8* nocapture %to, i8* nocapture %from, i32) nounwind {
+entry:
+  %sext = shl i32 %0, 16
+  %conv = ashr exact i32 %sext, 16
+  %cmp8 = icmp eq i32 %conv, 0
+  br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %.pre = load i8* %to, align 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %1 = phi i8 [ %.pre, %for.body.lr.ph ], [ %conv4, %for.body ]
+  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %from.addr.09 = phi i8* [ %from, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i8* %from.addr.09, i32 1
+  %2 = load i8* %from.addr.09, align 1
+  %conv27 = zext i8 %2 to i32
+  %conv36 = zext i8 %1 to i32
+  %add = add nsw i32 %conv36, %conv27
+  %conv4 = trunc i32 %add to i8
+  store i8 %conv4, i8* %to, align 1
+  %inc = add nsw i32 %i.010, 1
+  %cmp = icmp eq i32 %inc, %conv
+  br i1 %cmp, label %for.end, label %for.body
+; 16: sumc:
+; 16: 	lbu	${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: 	lbu	${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: sum:
+; 16: 	lhu	${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: 	lhu	${{[0-9]+}}, 0(${{[0-9]+}})
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 undef
+}
+
+define i32 @sum(i16* nocapture %to, i16* nocapture %from, i32) nounwind {
+entry:
+  %sext = shl i32 %0, 16
+  %conv = ashr exact i32 %sext, 16
+  %cmp8 = icmp eq i32 %conv, 0
+  br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %.pre = load i16* %to, align 2
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %1 = phi i16 [ %.pre, %for.body.lr.ph ], [ %conv4, %for.body ]
+  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %from.addr.09 = phi i16* [ %from, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i16* %from.addr.09, i32 1
+  %2 = load i16* %from.addr.09, align 2
+  %conv27 = zext i16 %2 to i32
+  %conv36 = zext i16 %1 to i32
+  %add = add nsw i32 %conv36, %conv27
+  %conv4 = trunc i32 %add to i16
+  store i16 %conv4, i16* %to, align 2
+  %inc = add nsw i32 %i.010, 1
+  %cmp = icmp eq i32 %inc, %conv
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 undef
+}
+
+
diff --git a/test/CodeGen/Mips/null.ll b/test/CodeGen/Mips/null.ll
index 7beae99c45..00c66a9928 100644
--- a/test/CodeGen/Mips/null.ll
+++ b/test/CodeGen/Mips/null.ll
@@ -8,6 +8,6 @@ entry:
 ; 16: 	.set	mips16                  # @main
 
 
-; 16:	jr	$ra
+; 16:	jrc	$ra
 
 }
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index eac0d80c1c..5558ba6e10 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -119,6 +119,16 @@ entry:
   ret void
 }
 
+%struct.S4 = type { [4 x i32] }
+
+define void @f5(i64 %a0, %struct.S4* nocapture byval %a1) nounwind {
+entry:
+  tail call void @f6(%struct.S4* byval %a1, i64 %a0) nounwind
+  ret void
+}
+
+declare void @f6(%struct.S4* nocapture byval, i64)
+
 !0 = metadata !{metadata !"int", metadata !1}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Mips/remat-immed-load.ll b/test/CodeGen/Mips/remat-immed-load.ll
new file mode 100644
index 0000000000..d93964bcae
--- /dev/null
+++ b/test/CodeGen/Mips/remat-immed-load.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=64
+
+define void @f0() nounwind {
+entry:
+; 32:  addiu $4, $zero, 1
+; 32:  addiu $4, $zero, 1
+
+  tail call void @foo1(i32 1) nounwind
+  tail call void @foo1(i32 1) nounwind
+  ret void
+}
+
+declare void @foo1(i32)
+
+define void @f3() nounwind {
+entry:
+; 64:  daddiu $4, $zero, 1
+; 64:  daddiu $4, $zero, 1
+
+  tail call void @foo2(i64 1) nounwind
+  tail call void @foo2(i64 1) nounwind
+  ret void
+}
+
+declare void @foo2(i64)
+
+define void @f5() nounwind {
+entry:
+; 32:  lui $4, 1
+; 32:  lui $4, 1
+
+  tail call void @f6(i32 65536) nounwind
+  tail call void @f6(i32 65536) nounwind
+  ret void
+}
+
+declare void @f6(i32)
+
+define void @f7() nounwind {
+entry:
+; 64:  lui $4, 1
+; 64:  lui $4, 1
+
+  tail call void @f8(i64 65536) nounwind
+  tail call void @f8(i64 65536) nounwind
+  ret void
+}
+
+declare void @f8(i64)
+
diff --git a/test/CodeGen/Mips/selpat.ll b/test/CodeGen/Mips/selpat.ll
new file mode 100644
index 0000000000..cda0c96ef4
--- /dev/null
+++ b/test/CodeGen/Mips/selpat.ll
@@ -0,0 +1,350 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@t = global i32 10, align 4
+@f = global i32 199, align 4
+@a = global i32 1, align 4
+@b = global i32 10, align 4
+@c = global i32 1, align 4
+@z1 = common global i32 0, align 4
+@z2 = common global i32 0, align 4
+@z3 = common global i32 0, align 4
+@z4 = common global i32 0, align 4
+
+define void @calc_seleq() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp eq i32 %0, %1
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	cmp	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  store i32 %cond, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp eq i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %3, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond10, i32* @z4, align 4
+  ret void
+}
+
+
+define void @calc_seleqk() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp eq i32 %0, 1
+  %1 = load i32* @t, align 4
+  %2 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	cmpi	${{[0-9]+}}, 1
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp eq i32 %0, 10
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %3 = load i32* @b, align 4
+  %cmp6 = icmp eq i32 %3, 3
+  %cond10 = select i1 %cmp6, i32 %2, i32 %1
+  store i32 %cond10, i32* @z3, align 4
+; 16:	cmpi	${{[0-9]+}}, 10
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp11 = icmp eq i32 %3, 10
+  %cond15 = select i1 %cmp11, i32 %1, i32 %2
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_seleqz() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp eq i32 %0, 0
+  %1 = load i32* @t, align 4
+  %2 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	beqz	${{[0-9]+}}, .+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %3 = load i32* @b, align 4
+  %cmp1 = icmp eq i32 %3, 0
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp eq i32 %4, 0
+  %cond10 = select i1 %cmp6, i32 %1, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selge() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp sge i32 %0, %1
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp sge i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp sge i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %3, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp sge i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %3, i32 %2
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define i32 @calc_selgt() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp sgt i32 %0, %1
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+  %cmp1 = icmp sgt i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp sgt i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %2, i32 %3
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp sgt i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %2, i32 %3
+  store i32 %cond15, i32* @z4, align 4
+  ret i32 undef
+}
+
+define void @calc_selle() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp sle i32 %0, %1
+  %2 = load i32* @t, align 4
+  %3 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp sle i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp sle i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %2, i32 %3
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp sle i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %2, i32 %3
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selltk() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp slt i32 %0, 10
+  %1 = load i32* @t, align 4
+  %2 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	slti	${{[0-9]+}}, {{[0-9]+}}
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %3 = load i32* @b, align 4
+  %cmp1 = icmp slt i32 %3, 2
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp sgt i32 %4, 2
+  %cond10 = select i1 %cmp6, i32 %2, i32 %1
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp sgt i32 %0, 2
+  %cond15 = select i1 %cmp11, i32 %2, i32 %1
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+
+define void @calc_selne() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp ne i32 %0, %1
+  %2 = load i32* @t, align 4
+  %3 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	cmp	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  store i32 %cond, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp ne i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %3, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond10, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selnek() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp ne i32 %0, 1
+  %1 = load i32* @f, align 4
+  %2 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	cmpi	${{[0-9]+}}, 1
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp ne i32 %0, 10
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %3 = load i32* @b, align 4
+  %cmp6 = icmp ne i32 %3, 3
+  %cond10 = select i1 %cmp6, i32 %2, i32 %1
+  store i32 %cond10, i32* @z3, align 4
+; 16:	cmpi	${{[0-9]+}}, 10
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp11 = icmp ne i32 %3, 10
+  %cond15 = select i1 %cmp11, i32 %1, i32 %2
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selnez() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp ne i32 %0, 0
+  %1 = load i32* @f, align 4
+  %2 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	bnez	${{[0-9]+}}, .+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %3 = load i32* @b, align 4
+  %cmp1 = icmp ne i32 %3, 0
+  %cond5 = select i1 %cmp1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp ne i32 %4, 0
+  %cond10 = select i1 %cmp6, i32 %1, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selnez2() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %tobool = icmp ne i32 %0, 0
+  %1 = load i32* @f, align 4
+  %2 = load i32* @t, align 4
+  %cond = select i1 %tobool, i32 %1, i32 %2
+  store i32 %cond, i32* @z1, align 4
+; 16:	bnez	${{[0-9]+}}, .+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %3 = load i32* @b, align 4
+  %tobool1 = icmp ne i32 %3, 0
+  %cond5 = select i1 %tobool1, i32 %2, i32 %1
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %tobool6 = icmp ne i32 %4, 0
+  %cond10 = select i1 %tobool6, i32 %1, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  store i32 %cond, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_seluge() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp uge i32 %0, %1
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp uge i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp uge i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %3, i32 %2
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp uge i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %3, i32 %2
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selugt() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp ugt i32 %0, %1
+  %2 = load i32* @f, align 4
+  %3 = load i32* @t, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	btnez	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp ugt i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp ugt i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %2, i32 %3
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp ugt i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %2, i32 %3
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
+
+define void @calc_selule() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %1 = load i32* @b, align 4
+  %cmp = icmp ule i32 %0, %1
+  %2 = load i32* @t, align 4
+  %3 = load i32* @f, align 4
+  %cond = select i1 %cmp, i32 %2, i32 %3
+  store i32 %cond, i32* @z1, align 4
+; 16:	sltu	${{[0-9]+}}, ${{[0-9]+}}
+; 16:	bteqz	.+4
+; 16: 	move    ${{[0-9]+}}, ${{[0-9]+}}
+  %cmp1 = icmp ule i32 %1, %0
+  %cond5 = select i1 %cmp1, i32 %3, i32 %2
+  store i32 %cond5, i32* @z2, align 4
+  %4 = load i32* @c, align 4
+  %cmp6 = icmp ule i32 %4, %0
+  %cond10 = select i1 %cmp6, i32 %2, i32 %3
+  store i32 %cond10, i32* @z3, align 4
+  %cmp11 = icmp ule i32 %0, %4
+  %cond15 = select i1 %cmp11, i32 %2, i32 %3
+  store i32 %cond15, i32* @z4, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/setgek.ll b/test/CodeGen/Mips/setgek.ll
index 40aaa7c030..b6bae09bcb 100644
--- a/test/CodeGen/Mips/setgek.ll
+++ b/test/CodeGen/Mips/setgek.ll
@@ -12,7 +12,7 @@ entry:
   %conv = zext i1 %cmp to i32
   store i32 %conv, i32* @r1, align 4
 ; 16:	slti	${{[0-9]+}}, -32768
-; 16:	move	$[[REGISTER:[0-9]+]], $t8
-; 16:	xor	${{[0-9]+}}, $[[REGISTER]]
+; 16:	move	${{[0-9]+}}, $t8
+; 16:	xor	${{[0-9]+}}, ${{[0-9]+}}
   ret void
 }
diff --git a/test/CodeGen/Mips/stchar.ll b/test/CodeGen/Mips/stchar.ll
new file mode 100644
index 0000000000..c00c9fd9d2
--- /dev/null
+++ b/test/CodeGen/Mips/stchar.ll
@@ -0,0 +1,90 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16_h
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16_b
+
+@.str = private unnamed_addr constant [9 x i8] c"%hd %c \0A\00", align 1
+@sp = common global i16* null, align 4
+@cp = common global i8* null, align 4
+
+define void @p1(i16 signext %s, i8 signext %c) nounwind {
+entry:
+  %conv = sext i16 %s to i32
+  %conv1 = sext i8 %c to i32
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv, i32 %conv1) nounwind
+  ret void
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+define void @p2() nounwind {
+entry:
+  %0 = load i16** @sp, align 4
+  %1 = load i16* %0, align 2
+  %2 = load i8** @cp, align 4
+  %3 = load i8* %2, align 1
+  %conv.i = sext i16 %1 to i32
+  %conv1.i = sext i8 %3 to i32
+  %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
+  %4 = load i16** @sp, align 4
+  store i16 32, i16* %4, align 2
+  %5 = load i8** @cp, align 4
+  store i8 97, i8* %5, align 1
+  ret void
+}
+
+define void @test() nounwind {
+entry:
+  %s = alloca i16, align 4
+  %c = alloca i8, align 4
+  store i16 16, i16* %s, align 4
+  store i8 99, i8* %c, align 4
+  store i16* %s, i16** @sp, align 4
+  store i8* %c, i8** @cp, align 4
+  %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
+  %0 = load i16** @sp, align 4
+  store i16 32, i16* %0, align 2
+  %1 = load i8** @cp, align 4
+  store i8 97, i8* %1, align 1
+  %2 = load i16* %s, align 4
+  %3 = load i8* %c, align 4
+  %conv.i = sext i16 %2 to i32
+  %conv1.i = sext i8 %3 to i32
+  %call.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
+  ret void
+; 16_b: test:
+; 16_h: test:
+; 16_b:	sb	${{[0-9]+}}, [[offset1:[0-9]+]](${{[0-9]+}})
+; 16_b: lb      ${{[0-9]+}}, [[offset1]](${{[0-9]+}})
+; 16_h:	sh	${{[0-9]+}}, [[offset2:[0-9]+]](${{[0-9]+}})
+; 16_h: lh      ${{[0-9]+}}, [[offset2]](${{[0-9]+}})
+}
+
+define i32 @main() nounwind {
+entry:
+  %s.i = alloca i16, align 4
+  %c.i = alloca i8, align 4
+  %0 = bitcast i16* %s.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.start(i64 -1, i8* %c.i) nounwind
+  store i16 16, i16* %s.i, align 4
+  store i8 99, i8* %c.i, align 4
+  store i16* %s.i, i16** @sp, align 4
+  store i8* %c.i, i8** @cp, align 4
+  %call.i.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
+  %1 = load i16** @sp, align 4
+  store i16 32, i16* %1, align 2
+  %2 = load i8** @cp, align 4
+  store i8 97, i8* %2, align 1
+  %3 = load i16* %s.i, align 4
+  %4 = load i8* %c.i, align 4
+  %conv.i.i = sext i16 %3 to i32
+  %conv1.i.i = sext i8 %4 to i32
+  %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i.i, i32 %conv1.i.i) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %c.i) nounwind
+  ret i32 0
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll
index 4989636a20..bcd33fca70 100644
--- a/test/CodeGen/Mips/tailcall.ll
+++ b/test/CodeGen/Mips/tailcall.ll
@@ -4,6 +4,8 @@
 ; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=STATIC32
 ; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -enable-mips-tail-calls \
 ; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic \
+; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=PIC16
 
 @g0 = common global i32 0, align 4
 @g1 = common global i32 0, align 4
@@ -21,6 +23,7 @@ entry:
 ; PIC32-NOT: jalr
 ; STATIC32-NOT: jal
 ; N64-NOT: jalr
+; PIC16: jalrc
 
   %call = tail call i32 @callee1(i32 1, i32 1, i32 1, i32 %a0) nounwind
   ret i32 %call
@@ -33,6 +36,7 @@ entry:
 ; PIC32: jalr
 ; STATIC32: jal
 ; N64-NOT: jalr
+; PIC16: jalrc
 
   %call = tail call i32 @callee2(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind
   ret i32 %call
@@ -45,6 +49,7 @@ entry:
 ; PIC32: jalr
 ; STATIC32: jal
 ; N64-NOT: jalr
+; PIC16: jalrc
 
   %call = tail call i32 @callee3(i32 1, i32 1, i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind
   ret i32 %call
@@ -57,6 +62,7 @@ entry:
 ; PIC32: jalr
 ; STATIC32: jal
 ; N64: jalr
+; PIC16: jalrc
 
   %call = tail call i32 @callee4(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind
   ret i32 %call
@@ -66,9 +72,18 @@ declare i32 @callee4(i32, i32, i32, i32, i32, i32, i32, i32, i32)
 
 define i32 @caller5() nounwind readonly {
 entry:
+; PIC32: .ent caller5
 ; PIC32-NOT: jalr
+; PIC32: .end caller5
+; STATIC32: .ent caller5
 ; STATIC32-NOT: jal
+; STATIC32: .end caller5
+; N64: .ent caller5
 ; N64-NOT: jalr
+; N64: .end caller5
+; PIC16: .ent caller5
+; PIC16: jalrc
+; PIC16: .end caller5
 
   %0 = load i32* @g0, align 4
   %1 = load i32* @g1, align 4
@@ -98,3 +113,133 @@ entry:
   ret i32 %add8
 }
 
+declare i32 @callee8(i32, ...)
+
+define i32 @caller8_0() nounwind {
+entry:
+  %call = tail call fastcc i32 @caller8_1()
+  ret i32 %call
+}
+
+define internal fastcc i32 @caller8_1() nounwind noinline {
+entry:
+; PIC32: .ent caller8_1
+; PIC32: jalr
+; PIC32: .end caller8_1
+; STATIC32: .ent caller8_1
+; STATIC32: jal
+; STATIC32: .end caller8_1
+; N64: .ent caller8_1
+; N64-NOT: jalr
+; N64: .end caller8_1
+; PIC16: .ent caller8_1
+; PIC16: jalrc
+; PIC16: .end caller8_1
+
+  %call = tail call i32 (i32, ...)* @callee8(i32 2, i32 1) nounwind
+  ret i32 %call
+}
+
+%struct.S = type { [2 x i32] }
+
+@gs1 = external global %struct.S
+
+declare i32 @callee9(%struct.S* byval)
+
+define i32 @caller9_0() nounwind {
+entry:
+  %call = tail call fastcc i32 @caller9_1()
+  ret i32 %call
+}
+
+define internal fastcc i32 @caller9_1() nounwind noinline {
+entry:
+; PIC32: .ent caller9_1
+; PIC32: jalr
+; PIC32: .end caller9_1
+; STATIC32: .ent caller9_1
+; STATIC32: jal
+; STATIC32: .end caller9_1
+; N64: .ent caller9_1
+; N64: jalr
+; N64: .end caller9_1
+; PIC16: .ent caller9_1
+; PIC16: jalrc
+; PIC16: .end caller9_1
+
+  %call = tail call i32 @callee9(%struct.S* byval @gs1) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee10(i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+define i32 @caller10(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) nounwind {
+entry:
+; PIC32: .ent caller10
+; PIC32-NOT: jalr
+; STATIC32: .ent caller10
+; STATIC32-NOT: jal
+; N64: .ent caller10
+; N64-NOT: jalr
+; PIC16: .ent caller10
+; PIC16: jalrc
+
+  %call = tail call i32 @callee10(i32 %a8, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee11(%struct.S* byval)
+
+define i32 @caller11() nounwind noinline {
+entry:
+; PIC32: .ent caller11
+; PIC32: jalr
+; STATIC32: .ent caller11
+; STATIC32: jal
+; N64: .ent caller11
+; N64: jalr
+; PIC16: .ent caller11
+; PIC16: jalrc
+
+  %call = tail call i32 @callee11(%struct.S* byval @gs1) nounwind
+  ret i32 %call
+}
+
+declare i32 @callee12()
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define i32 @caller12(%struct.S* nocapture byval %a0) nounwind {
+entry:
+; PIC32: .ent caller12
+; PIC32: jalr
+; STATIC32: .ent caller12
+; STATIC32: jal
+; N64: .ent caller12
+; N64: jalr
+; PIC16: .ent caller12
+; PIC16: jalrc
+
+  %0 = bitcast %struct.S* %a0 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct.S* @gs1 to i8*), i8* %0, i32 8, i32 4, i1 false)
+  %call = tail call i32 @callee12() nounwind
+  ret i32 %call
+}
+
+declare i32 @callee13(i32, ...)
+
+define i32 @caller13() nounwind {
+entry:
+; PIC32: .ent caller13
+; PIC32-NOT: jalr
+; STATIC32: .ent caller13
+; STATIC32-NOT: jal
+; N64: .ent caller13
+; N64-NOT: jalr
+; PIC16: .ent caller13
+; PIC16: jalrc
+
+  %call = tail call i32 (i32, ...)* @callee13(i32 1, i32 2) nounwind
+  ret i32 %call
+}
+
diff --git a/test/CodeGen/Mips/tls16.ll b/test/CodeGen/Mips/tls16.ll
new file mode 100644
index 0000000000..861864bcfe
--- /dev/null
+++ b/test/CodeGen/Mips/tls16.ll
@@ -0,0 +1,13 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PIC16
+
+@a = thread_local global i32 4, align 4
+
+define i32 @foo() nounwind readonly {
+entry:
+  %0 = load i32* @a, align 4
+; PIC16:	lw	${{[0-9]+}}, %call16(__tls_get_addr)(${{[0-9]+}})
+; PIC16:	addiu	${{[0-9]+}}, %tlsgd(a)
+  ret i32 %0
+}
+
+
diff --git a/test/CodeGen/Mips/tls16_2.ll b/test/CodeGen/Mips/tls16_2.ll
new file mode 100644
index 0000000000..b33e3c3766
--- /dev/null
+++ b/test/CodeGen/Mips/tls16_2.ll
@@ -0,0 +1,15 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PIC16
+
+@f.i = internal thread_local unnamed_addr global i32 1, align 4
+
+define i8* @f(i8* nocapture %a) nounwind {
+entry:
+  %0 = load i32* @f.i, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @f.i, align 4
+  %1 = inttoptr i32 %inc to i8*
+; PIC16: addiu	${{[0-9]+}}, %tlsldm(f.i)
+  ret i8* %1
+}
+
+
diff --git a/test/CodeGen/PowerPC/emptystruct.ll b/test/CodeGen/PowerPC/emptystruct.ll
new file mode 100644
index 0000000000..36b4abd2bf
--- /dev/null
+++ b/test/CodeGen/PowerPC/emptystruct.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+; This tests correct handling of empty aggregate parameters and return values.
+; An empty parameter passed by value does not consume a protocol register or
+; a parameter save area doubleword.  An empty parameter passed by reference
+; is treated as any other pointer parameter.  An empty aggregate return value 
+; is treated as any other aggregate return value, passed via address as a 
+; hidden parameter in GPR3.  In this example, GPR3 contains the return value
+; address, GPR4 contains the address of e2, and e1 and e3 are not passed or
+; received.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.empty = type {}
+
+define void @callee(%struct.empty* noalias sret %agg.result, %struct.empty* byval %a1, %struct.empty* %a2, %struct.empty* byval %a3) nounwind {
+entry:
+  %a2.addr = alloca %struct.empty*, align 8
+  store %struct.empty* %a2, %struct.empty** %a2.addr, align 8
+  %0 = load %struct.empty** %a2.addr, align 8
+  %1 = bitcast %struct.empty* %agg.result to i8*
+  %2 = bitcast %struct.empty* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 0, i32 1, i1 false)
+  ret void
+}
+
+; CHECK: callee:
+; CHECK: std 4,
+; CHECK: std 3,
+; CHECK-NOT: std 5,
+; CHECK-NOT: std 6,
+; CHECK: blr
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define void @caller(%struct.empty* noalias sret %agg.result) nounwind {
+entry:
+  %e1 = alloca %struct.empty, align 1
+  %e2 = alloca %struct.empty, align 1
+  %e3 = alloca %struct.empty, align 1
+  call void @callee(%struct.empty* sret %agg.result, %struct.empty* byval %e1, %struct.empty* %e2, %struct.empty* byval %e3)
+  ret void
+}
+
+; CHECK: caller:
+; CHECK: addi 4,
+; CHECK: std 3,
+; CHECK-NOT: std 5,
+; CHECK-NOT: std 6,
+; CHECK: bl callee
diff --git a/test/CodeGen/PowerPC/int-fp-conv-1.ll b/test/CodeGen/PowerPC/int-fp-conv-1.ll
index 6c82723519..d2887b9b94 100644
--- a/test/CodeGen/PowerPC/int-fp-conv-1.ll
+++ b/test/CodeGen/PowerPC/int-fp-conv-1.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=ppc64 | grep __floatditf
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+; CHECK-NOT: __floatditf
 
 define i64 @__fixunstfdi(ppc_fp128 %a) nounwind  {
 entry:
diff --git a/test/CodeGen/PowerPC/ppc64-align-long-double.ll b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
new file mode 100644
index 0000000000..10b70d02e5
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+; Verify internal alignment of long double in a struct.  The double
+; argument comes in in GPR3; GPR4 is skipped; GPRs 5 and 6 contain
+; the long double.  Check that these are stored to proper locations
+; in the parameter save area and loaded from there for return in FPR1/2.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S = type { double, ppc_fp128 }
+
+define ppc_fp128 @test(%struct.S* byval %x) nounwind {
+entry:
+  %b = getelementptr inbounds %struct.S* %x, i32 0, i32 1
+  %0 = load ppc_fp128* %b, align 16
+  ret ppc_fp128 %0
+}
+
+; CHECK: std 6, 72(1)
+; CHECK: std 5, 64(1)
+; CHECK: std 4, 56(1)
+; CHECK: std 3, 48(1)
+; CHECK: lfd 1, 64(1)
+; CHECK: lfd 2, 72(1)
+
diff --git a/test/CodeGen/PowerPC/pr12757.ll b/test/CodeGen/PowerPC/pr12757.ll
new file mode 100644
index 0000000000..c344656d29
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr12757.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @__flt_rounds() nounwind {
+entry:
+  %0 = tail call i64 asm sideeffect "mffs $0", "=f"() nounwind
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+; CHECK: @__flt_rounds
+; CHECK: mffs
+
diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll
new file mode 100644
index 0000000000..fb1835f580
--- /dev/null
+++ b/test/CodeGen/PowerPC/varargs-struct-float.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.Sf1 = type { float }
+
+define void @foo(float inreg %s.coerce) nounwind {
+entry:
+  %s = alloca %struct.Sf1, align 4
+  %coerce.dive = getelementptr %struct.Sf1* %s, i32 0, i32 0
+  store float %s.coerce, float* %coerce.dive, align 1
+  %coerce.dive1 = getelementptr %struct.Sf1* %s, i32 0, i32 0
+  %0 = load float* %coerce.dive1, align 1
+  call void (i32, ...)* @testvaSf1(i32 1, float inreg %0)
+  ret void
+}
+
+; CHECK: stfs {{[0-9]+}}, 60(1)
+; CHECK: ld 4, 56(1)
+; CHECK: bl
+
+declare void @testvaSf1(i32, ...)
diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
index b2b59db8f1..3180f464d1 100644
--- a/test/CodeGen/PowerPC/vec_cmp.ll
+++ b/test/CodeGen/PowerPC/vec_cmp.ll
@@ -1,6 +1,9 @@
-; RUN: llc -mattr=+altivec < %s | FileCheck %s
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
 
-; Check vector comparisons using altivec.
+; Check vector comparisons using altivec. For non native types, just basic
+; comparison instruction check is done. For altivec supported type (16i8,
+; 8i16, 4i32, and 4f32) all the comparisons operators (==, !=, >, >=, <, <=)
+; are checked.
 
 
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
@@ -33,13 +36,105 @@ define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone {
 ; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
 
 
-define <16 x i8> @v16si8_cmp(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+; Adicional tests for v16i8 since it is a altivec native type
+
+define <16 x i8> @v16si8_cmp_eq(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
   %cmp = icmp eq <16 x i8> %x, %y
   %sext = sext <16 x i1> %cmp to <16 x i8>
   ret <16 x i8> %sext
 }
-; CHECK: v16si8_cmp:
-; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: v16si8_cmp_eq:
+; CHECK: vcmpequb 2, 2, 3
+
+define <16 x i8> @v16si8_cmp_ne(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp ne <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:     v16si8_cmp_ne:
+; CHECK:     vcmpequb [[RET:[0-9]+]], 2, 3
+; CHECK-NOR: vnor     2, [[RET]], [[RET]]
+
+define <16 x i8> @v16si8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp sle <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:      v16si8_cmp_le:
+; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsb [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <16 x i8> @v16ui8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp ule <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:      v16ui8_cmp_le:
+; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtub [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <16 x i8> @v16si8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp slt <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_lt:
+; CHECK: vcmpgtsb 2, 3, 2
+
+define <16 x i8> @v16ui8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp ult <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16ui8_cmp_lt:
+; CHECK: vcmpgtub 2, 3, 2
+
+define <16 x i8> @v16si8_cmp_gt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp sgt <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_gt:
+; CHECK: vcmpgtsb 2, 2, 3
+
+define <16 x i8> @v16ui8_cmp_gt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp ugt <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK: v16ui8_cmp_gt:
+; CHECK: vcmpgtub 2, 2, 3
+
+define <16 x i8> @v16si8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp sge <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:      v16si8_cmp_ge:
+; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsb [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+define <16 x i8> @v16ui8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+  %cmp = icmp uge <16 x i8> %x, %y
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %sext
+}
+; CHECK:      v16ui8_cmp_ge:
+; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtub [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
 
 
 define <32 x i8> @v32si8_cmp(<32 x i8> %x, <32 x i8> %y) nounwind readnone {
@@ -70,13 +165,106 @@ define <4 x i16> @v4si16_cmp(<4 x i16> %x, <4 x i16> %y) nounwind readnone {
 ; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
 
 
-define <8 x i16> @v8si16_cmp(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+; Adicional tests for v8i16 since it is an altivec native type
+
+define <8 x i16> @v8si16_cmp_eq(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
   %cmp = icmp eq <8 x i16> %x, %y
   %sext = sext <8 x i1> %cmp to <8 x i16>
   ret <8 x i16> %sext
 }
-; CHECK: v8si16_cmp:
-; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: v8si16_cmp_eq:
+; CHECK: vcmpequh 2, 2, 3
+
+define <8 x i16> @v8si16_cmp_ne(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp ne <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8si16_cmp_ne:
+; CHECK:      vcmpequh [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <8 x i16> @v8si16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp sle <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8si16_cmp_le:
+; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsh [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <8 x i16> @v8ui16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp ule <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8ui16_cmp_le:
+; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuh [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <8 x i16> @v8si16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp slt <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_lt:
+; CHECK: vcmpgtsh 2, 3, 2
+
+define <8 x i16> @v8ui16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp ult <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8ui16_cmp_lt:
+; CHECK: vcmpgtuh 2, 3, 2
+
+define <8 x i16> @v8si16_cmp_gt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp sgt <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_gt:
+; CHECK: vcmpgtsh 2, 2, 3
+
+define <8 x i16> @v8ui16_cmp_gt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp ugt <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK: v8ui16_cmp_gt:
+; CHECK: vcmpgtuh 2, 2, 3
+
+define <8 x i16> @v8si16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp sge <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8si16_cmp_ge:
+; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsh [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+define <8 x i16> @v8ui16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+  %cmp = icmp uge <8 x i16> %x, %y
+  %sext = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %sext
+}
+; CHECK:      v8ui16_cmp_ge:
+; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuh [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
 
 
 define <16 x i16> @v16si16_cmp(<16 x i16> %x, <16 x i16> %y) nounwind readnone {
@@ -110,13 +298,106 @@ define <2 x i32> @v2si32_cmp(<2 x i32> %x, <2 x i32> %y) nounwind readnone {
 ; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
 
 
-define <4 x i32> @v4si32_cmp(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+; Adicional tests for v4si32 since it is an altivec native type
+
+define <4 x i32> @v4si32_cmp_eq(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
   %cmp = icmp eq <4 x i32> %x, %y
   %sext = sext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %sext
 }
-; CHECK: v4si32_cmp:
-; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: v4si32_cmp_eq:
+; CHECK: vcmpequw 2, 2, 3
+
+define <4 x i32> @v4si32_cmp_ne(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp ne <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4si32_cmp_ne:
+; CHECK:      vcmpequw [[RCMP:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RCMP]], [[RCMP]]
+
+define <4 x i32> @v4si32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp sle <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4si32_cmp_le:
+; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsw [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x i32> @v4ui32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp ule <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4ui32_cmp_le:
+; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuw [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x i32> @v4si32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp slt <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_lt:
+; CHECK: vcmpgtsw 2, 3, 2
+
+define <4 x i32> @v4ui32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp ult <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4ui32_cmp_lt:
+; CHECK: vcmpgtuw 2, 3, 2
+
+define <4 x i32> @v4si32_cmp_gt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp sgt <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_gt:
+; CHECK: vcmpgtsw 2, 2, 3
+
+define <4 x i32> @v4ui32_cmp_gt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp ugt <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK: v4ui32_cmp_gt:
+; CHECK: vcmpgtuw 2, 2, 3
+
+define <4 x i32> @v4si32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp sge <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4si32_cmp_ge:
+; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsw [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+
+define <4 x i32> @v4ui32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+  %cmp = icmp uge <4 x i32> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %sext
+}
+; CHECK:      v4ui32_cmp_ge:
+; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuw [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
 
 
 define <8 x i32> @v8si32_cmp(<8 x i32> %x, <8 x i32> %y) nounwind readnone {
@@ -168,15 +449,70 @@ entry:
 ; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
 
 
-define <4 x float> @v4f32_cmp(<4 x float> %x, <4 x float> %y) nounwind readnone {
+; Adicional tests for v4f32 since it is a altivec native type
+
+define <4 x float> @v4f32_cmp_eq(<4 x float> %x, <4 x float> %y) nounwind readnone {
 entry:
   %cmp = fcmp oeq <4 x float> %x, %y
   %sext = sext <4 x i1> %cmp to <4 x i32>
   %0 = bitcast <4 x i32> %sext to <4 x float>
   ret <4 x float> %0
 }
-; CHECK: v4f32_cmp:
-; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: v4f32_cmp_eq:
+; CHECK: vcmpeqfp 2, 2, 3
+
+define <4 x float> @v4f32_cmp_ne(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp une <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK:      v4f32_cmp_ne:
+; CHECK:      vcmpeqfp [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_le(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ole <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK:      v4f32_cmp_le:
+; CHECK:      vcmpeqfp [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtfp [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x float> @v4f32_cmp_lt(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp olt <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_lt:
+; CHECK: vcmpgtfp 2, 3, 2
+
+define <4 x float> @v4f32_cmp_ge(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp oge <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_ge:
+; CHECK: vcmpgefp 2, 2, 3
+
+define <4 x float> @v4f32_cmp_gt(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ogt <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_gt:
+; CHECK: vcmpgtfp 2, 2, 3
 
 
 define <8 x float> @v8f32_cmp(<8 x float> %x, <8 x float> %y) nounwind readnone {
diff --git a/test/CodeGen/PowerPC/vec_extload.ll b/test/CodeGen/PowerPC/vec_extload.ll
new file mode 100644
index 0000000000..201c15b9c7
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_extload.ll
@@ -0,0 +1,155 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+
+; Check vector extend load expansion with altivec enabled.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Altivec does not provides an sext intruction, so it expands
+; a set of vector stores (stvx), bytes load/sign expand/store
+; (lbz/stb), and a final vector load (lvx) to load the result
+; extended vector.
+define <16 x i8> @v16si8_sext_in_reg(<16 x i8> %a) {
+  %b = trunc <16 x i8> %a to <16 x i4>
+  %c = sext <16 x i4> %b to <16 x i8>
+  ret <16 x i8> %c
+}
+; CHECK: v16si8_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; The zero extend uses a more clever logic: a vector splat
+; and a logic and to set higher bits to 0.
+define <16 x i8> @v16si8_zext_in_reg(<16 x i8> %a) {
+  %b = trunc <16 x i8> %a to <16 x i4>
+  %c = zext <16 x i4> %b to <16 x i8>
+  ret <16 x i8> %c
+}
+; CHECK:      v16si8_zext_in_reg:
+; CHECK:      vspltisb [[VMASK:[0-9]+]], 15
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
+
+; Same as v16si8_sext_in_reg, expands to load/store halfwords (lhz/sth).
+define <8 x i16> @v8si16_sext_in_reg(<8 x i16> %a) {
+  %b = trunc <8 x i16> %a to <8 x i8>
+  %c = sext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %c
+}
+; CHECK: v8si16_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; Same as v8si16_sext_in_reg, but instead of creating the mask
+; with a splat, loads it from memory.
+define <8 x i16> @v8si16_zext_in_reg(<8 x i16> %a) {
+  %b = trunc <8 x i16> %a to <8 x i8>
+  %c = zext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %c
+}
+; CHECK:      v8si16_zext_in_reg:
+; CHECK:      ld [[RMASKTOC:[0-9]+]], .LC{{[0-9]+}}@toc(2)
+; CHECK-NEXT: lvx [[VMASK:[0-9]+]], {{[0-9]+}}, [[RMASKTOC]]
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
+
+; Same as v16si8_sext_in_reg, expands to load halfword (lha) and
+; store words (stw).
+define <4 x i32> @v4si32_sext_in_reg(<4 x i32> %a) {
+  %b = trunc <4 x i32> %a to <4 x i16>
+  %c = sext <4 x i16> %b to <4 x i32>
+  ret <4 x i32> %c
+}
+; CHECK: v4si32_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; Same as v8si16_sext_in_reg.
+define <4 x i32> @v4si32_zext_in_reg(<4 x i32> %a) {
+  %b = trunc <4 x i32> %a to <4 x i16>
+  %c = zext <4 x i16> %b to <4 x i32>
+  ret <4 x i32> %c
+}
+; CHECK:      v4si32_zext_in_reg:
+; CHECK:      vspltisw [[VMASK:[0-9]+]], -16
+; CHECK-NEXT: vsrw [[VMASK]], [[VMASK]], [[VMASK]]
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
diff --git a/test/CodeGen/PowerPC/vec_sqrt.ll b/test/CodeGen/PowerPC/vec_sqrt.ll
new file mode 100644
index 0000000000..055da1a229
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_sqrt.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec,+fsqrt < %s | FileCheck %s
+
+; Check for vector sqrt expansion using floating-point types, since altivec
+; does not provide an fsqrt instruction for vector.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %val)
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float> %val)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %val)
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double> %val)
+
+define <2 x float> @v2f32_sqrt(<2 x float> %x) nounwind readnone {
+entry:
+  %sqrt = call <2 x float> @llvm.sqrt.v2f32 (<2 x float> %x)
+  ret <2 x float> %sqrt
+}
+; sqrt (<2 x float>) is promoted to sqrt (<4 x float>)
+; CHECK: v2f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x float> @v4f32_sqrt(<4 x float> %x) nounwind readnone {
+entry:
+  %sqrt = call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %x)
+  ret <4 x float> %sqrt
+}
+; CHECK: v4f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <8 x float> @v8f32_sqrt(<8 x float> %x) nounwind readnone {
+entry:
+  %sqrt = call <8 x float> @llvm.sqrt.v8f32 (<8 x float> %x)
+  ret <8 x float> %sqrt
+}
+; CHECK: v8f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <2 x double> @v2f64_sqrt(<2 x double> %x) nounwind readnone {
+entry:
+  %sqrt = call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %x)
+  ret <2 x double> %sqrt
+}
+; CHECK: v2f64_sqrt:
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x double> @v4f64_sqrt(<4 x double> %x) nounwind readnone {
+entry:
+  %sqrt = call <4 x double> @llvm.sqrt.v4f64 (<4 x double> %x)
+  ret <4 x double> %sqrt
+}
+; CHECK: v4f64_sqrt:
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/SPARC/load_to_switch.ll b/test/CodeGen/SPARC/load_to_switch.ll
deleted file mode 100644
index 8d62de527e..0000000000
--- a/test/CodeGen/SPARC/load_to_switch.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; RUN: llc -march=sparc < %s | FileCheck %s
-
-; Check that all the switches turned into lookup tables by SimplifyCFG are
-; turned back into switches for targets that don't like lookup tables.
-
-@.str = private unnamed_addr constant [4 x i8] c"foo\00", align 1
-@.str1 = private unnamed_addr constant [4 x i8] c"bar\00", align 1
-@.str2 = private unnamed_addr constant [4 x i8] c"baz\00", align 1
-@.str3 = private unnamed_addr constant [4 x i8] c"qux\00", align 1
-@.str4 = private unnamed_addr constant [6 x i8] c"error\00", align 1
-@switch.table = private unnamed_addr constant [7 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 27, i32 62, i32 1]
-@switch.table1 = private unnamed_addr constant [4 x i8] c"*\09X\05"
-@switch.table2 = private unnamed_addr constant [4 x float] [float 0x40091EB860000000, float 0x3FF3BE76C0000000, float 0x4012449BA0000000, float 0x4001AE1480000000]
-@switch.table3 = private unnamed_addr constant [4 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0)]
-
-define i32 @f(i32 %c)  {
-entry:
-  %switch.tableidx = sub i32 %c, 42
-  %0 = icmp ult i32 %switch.tableidx, 7
-  br i1 %0, label %switch.lookup, label %return
-
-switch.lookup:
-  %switch.gep = getelementptr inbounds [7 x i32]* @switch.table, i32 0, i32 %switch.tableidx
-  %switch.load = load i32* %switch.gep
-  ret i32 %switch.load
-
-return:
-  ret i32 15
-
-; CHECK: f:
-; CHECK: %switch.lookup
-; CHECK-NOT: sethi %hi(.Lswitch.table)
-}
-
-declare void @dummy(i8 signext, float)
-
-define void @h(i32 %x) {
-entry:
-  %switch.tableidx = sub i32 %x, 0
-  %0 = icmp ult i32 %switch.tableidx, 4
-  br i1 %0, label %switch.lookup, label %sw.epilog
-
-switch.lookup:
-  %switch.gep = getelementptr inbounds [4 x i8]* @switch.table1, i32 0, i32 %switch.tableidx
-  %switch.load = load i8* %switch.gep
-  %switch.gep1 = getelementptr inbounds [4 x float]* @switch.table2, i32 0, i32 %switch.tableidx
-  %switch.load2 = load float* %switch.gep1
-  br label %sw.epilog
-
-sw.epilog:
-  %a.0 = phi i8 [ %switch.load, %switch.lookup ], [ 7, %entry ]
-  %b.0 = phi float [ %switch.load2, %switch.lookup ], [ 0x4023FAE140000000, %entry ]
-  call void @dummy(i8 signext %a.0, float %b.0)
-  ret void
-
-; CHECK: h:
-; CHECK: %switch.lookup
-; CHECK-NOT: sethi %hi(.Lswitch.table{{[0-9]}})
-; CHECK-NOT: sethi %hi(.Lswitch.table{{[0-9]}})
-}
-
-define i8* @foostring(i32 %x) {
-entry:
-  %switch.tableidx = sub i32 %x, 0
-  %0 = icmp ult i32 %switch.tableidx, 4
-  br i1 %0, label %switch.lookup, label %return
-
-switch.lookup:
-  %switch.gep = getelementptr inbounds [4 x i8*]* @switch.table3, i32 0, i32 %switch.tableidx
-  %switch.load = load i8** %switch.gep
-  ret i8* %switch.load
-
-return:
-  ret i8* getelementptr inbounds ([6 x i8]* @.str4, i64 0, i64 0)
-
-; CHECK: foostring:
-; CHECK: %switch.lookup
-; CHECK-NOT: sethi %hi(.Lswitch.table3)
-}
-
-; CHECK-NOT: .Lswitch.table
-; CHECK-NOT: .Lswitch.table1
-; CHECK-NOT: .Lswitch.table2
-; CHECK-NOT: .Lswitch.table3
diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll
index 35914b1679..2074f98cb6 100644
--- a/test/CodeGen/Thumb2/thumb2-uxtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll
@@ -128,9 +128,9 @@ define i32 @test10(i32 %p0) {
 
 ; ARMv7M: test10
 ; ARMv7M: mov.w r1, #16253176
-; ARMv7M: mov.w r2, #458759
 ; ARMv7M: and.w r0, r1, r0, lsr #7
-; ARMv7M: and.w r1, r2, r0, lsr #5
+; ARMv7M: mov.w r1, #458759
+; ARMv7M: and.w r1, r1, r0, lsr #5
 ; ARMv7M: orrs r0, r1
 	%tmp1 = lshr i32 %p0, 7		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 16253176		; <i32> [#uses=2]
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index a4abccba7e..4e30f2b05a 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -30,4 +30,17 @@ entry:
   ret i32 %z.0
 }
 
+; <rdar://problem/12579915>
+define i32 @test3(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
+entry:
+  %cmp = icmp ugt i32 %x, %y
+  %dec = sext i1 %cmp to i32
+  %dec.res = add nsw i32 %dec, %res
+  ret i32 %dec.res
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK: sbbl
+; CHECK: ret
+}
+
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/atom-shuf.ll b/test/CodeGen/X86/atom-shuf.ll
new file mode 100644
index 0000000000..4c3f2f67c5
--- /dev/null
+++ b/test/CodeGen/X86/atom-shuf.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=atom | FileCheck %s
+
+define <16 x i8> @foo(<16 x i8> %in) {
+  %r = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> < i32 7, i32 3, i32 2, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %r
+; CHECK: foo
+; CHECK: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 3eb7b37ee6..276d0db9a4 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -580,3 +580,12 @@ bb28:                                             ; preds = %bb21
 bb29:                                             ; preds = %bb28, %bb26, %bb25, %bb21
   unreachable
 }
+
+define void @pr14194() nounwind uwtable {
+  %tmp = load i64* undef, align 16
+  %tmp1 = trunc i64 %tmp to i32
+  %tmp2 = lshr i64 %tmp, 32
+  %tmp3 = trunc i64 %tmp2 to i32
+  %tmp4 = call { i32, i32 } asm sideeffect "", "=&r,=&r,r,r,0,1,~{dirflag},~{fpsr},~{flags}"(i32 %tmp3, i32 undef, i32 %tmp3, i32 %tmp1) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
index 091f0de930..d70aa7d79f 100644
--- a/test/CodeGen/X86/fp-fast.ll
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mattr=-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mattr=+avx,-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
 
 ; CHECK: test1
 define float @test1(float %a) {
@@ -35,3 +35,23 @@ define float @test3(float %a) {
   ret float %r
 }
 
+; CHECK: test4
+define float @test4(float %a) {
+; CHECK-NOT: fma
+; CHECK-NOT mul
+; CHECK-NOT: add
+; CHECK: ret
+  %t1 = fmul float %a, 0.0
+  %t2 = fadd float %a, %t1
+  ret float %t2
+}
+
+; CHECK: test5
+define float @test5(float %a) {
+; CHECK-NOT: add
+; CHECK: vxorps
+; CHECK: ret
+  %t1 = fsub float -0.0, %a
+  %t2 = fadd float %a, %t1
+  ret float %t2
+}
diff --git a/test/CodeGen/X86/inlineasm-sched-bug.ll b/test/CodeGen/X86/inlineasm-sched-bug.ll
new file mode 100644
index 0000000000..08de0c02d2
--- /dev/null
+++ b/test/CodeGen/X86/inlineasm-sched-bug.ll
@@ -0,0 +1,13 @@
+; PR13504
+; RUN: llc -march=x86 -mcpu=atom <%s | FileCheck %s
+; CHECK: bsfl
+; CHECK-NOT: movl
+
+define i32 @foo(i32 %treemap) nounwind uwtable {
+entry:
+  %sub = sub i32 0, %treemap
+  %and = and i32 %treemap, %sub
+  %0 = tail call i32 asm "bsfl $1,$0\0A\09", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %and) nounwind
+  ret i32 %0
+}
+
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index 78d9e06f59..0e34222b94 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -219,7 +219,6 @@ entry:
 ; by sbb, we should not optimize cmp away.
 define i32 @q(i32 %j.4, i32 %w, i32 %el) {
 ; CHECK: q:
-; CHECK: sub
 ; CHECK: cmp
 ; CHECK-NEXT: sbb
   %tmp532 = add i32 %j.4, %w
diff --git a/test/CodeGen/X86/mmx-builtins.ll b/test/CodeGen/X86/mmx-builtins.ll
index 8b7200d2f7..a8d33f43da 100644
--- a/test/CodeGen/X86/mmx-builtins.ll
+++ b/test/CodeGen/X86/mmx-builtins.ll
@@ -1043,6 +1043,20 @@ entry:
   ret i64 %5
 }
 
+define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: test21_2
+; CHECK: pshufw
+; CHECK: movd
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <2 x i32>
+  %5 = extractelement <2 x i32> %4, i32 0
+  ret i32 %5
+}
+
 declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
 
 define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
diff --git a/test/CodeGen/X86/pr14204.ll b/test/CodeGen/X86/pr14204.ll
new file mode 100644
index 0000000000..42e362bf3b
--- /dev/null
+++ b/test/CodeGen/X86/pr14204.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=core-avx2 | FileCheck %s
+
+; FIXME: vpmovsxwd should be generated instead of vpmovzxwd followed by
+; SLL/SRA.
+
+define <8 x i32> @foo(<8 x i1> %bar) nounwind readnone {
+entry:
+  %s = sext <8 x i1> %bar to <8 x i32>
+  ret <8 x i32> %s
+; CHECK: foo
+; CHECK: vpmovzxwd
+; CHECK: vpslld
+; CHECK: vpsrad
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
new file mode 100644
index 0000000000..655f75800c
--- /dev/null
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
+
+; rdar: 12558838
+; PR14221
+; There is a mismatch between the intrinsic and the actual instruction.
+; The actual instruction has a partial update of dest, while the intrinsic
+; passes through the upper FP values. Here, we make sure the source and
+; destination of rsqrtss are the same.
+define void @t1(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t1:
+; CHECK: rsqrtss %xmm0, %xmm0
+  %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
+  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+  %conv = fpext float %a.addr.0.extract to double
+  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+  %conv3 = fpext float %a.addr.4.extract to double
+  tail call void @callee(double %conv, double %conv3) nounwind
+  ret void
+}
+declare void @callee(double, double)
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define void @t2(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t2:
+; CHECK: rcpss %xmm0, %xmm0
+  %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
+  %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+  %conv = fpext float %a.addr.0.extract to double
+  %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+  %conv3 = fpext float %a.addr.4.extract to double
+  tail call void @callee(double %conv, double %conv3) nounwind
+  ret void
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 8dfc2eab41..4c56f848de 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom -mattr=+sse41 | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
 
 ; Transpose example using the more generic vector shuffle. Return float8
 ; instead of float16
@@ -47,8 +47,8 @@ entry:
 ; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
 ; ATOM: lo_hi_shift
 ; ATOM: movhps ([[BASEREG:%[a-z]+]]),
-; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
   %v.i = bitcast float* %y to <4 x float>*
   %0 = load <4 x float>* %v.i, align 1
   %1 = bitcast float* %x to <1 x i64>*
diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll
index 1651c4cdac..f5f8842605 100644
--- a/test/CodeGen/X86/vec_shuffle-30.ll
+++ b/test/CodeGen/X86/vec_shuffle-30.ll
@@ -1,21 +1,25 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep pshufhw %t | grep -- -95 | count 1
-; RUN: grep shufps %t | count 1
-; RUN: not grep pslldq %t
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
 
+; CHECK: test
 ; Test case when creating pshufhw, we incorrectly set the higher order bit
 ; for an undef,
 define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind {
 entry:
+; CHECK-NOT: vmovaps
+; CHECK: vmovlpd
+; CHECK: vpshufhw        $-95
   %0 = load <8 x i16>* %dest
   %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14>
   store <8 x i16> %1, <8 x i16>* %dest
   ret void
-}                              
+}
 
+; CHECK: test2
 ; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq
 define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind {
 entry:
+; CHECK-NOT: pslldq
+; CHECK: shufps
   %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2>
   store <4 x i32> %0, <4 x i32>* %dest
   ret void
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index ebdfea9a37..56c63644e0 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=atom -mattr=+sse42 < %s | FileCheck -check-prefix=ATOM %s
+; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK: paddd
 ; CHECK: movl