446 files changed, 16943 insertions, 797 deletions
diff --git a/test/Analysis/CostModel/X86/cmp.ll b/test/Analysis/CostModel/X86/cmp.ll
index 90b09c1154..713b3742e9 100644
--- a/test/Analysis/CostModel/X86/cmp.ll
+++ b/test/Analysis/CostModel/X86/cmp.ll
@@ -1,38 +1,52 @@
-; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefix=AVX1 %s
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck --check-prefix=AVX2 %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
 define i32 @cmp(i32 %arg) {
   ;  -- floats --
-  ;CHECK: cost of 1 {{.*}} fcmp
+  ;AVX1: cost of 1 {{.*}} fcmp
+  ;AVX2: cost of 1 {{.*}} fcmp
   %A = fcmp olt <2 x float> undef, undef
-  ;CHECK: cost of 1 {{.*}} fcmp
+  ;AVX1: cost of 1 {{.*}} fcmp
+  ;AVX2: cost of 1 {{.*}} fcmp
   %B = fcmp olt <4 x float> undef, undef
-  ;CHECK: cost of 1 {{.*}} fcmp
+  ;AVX1: cost of 1 {{.*}} fcmp
+  ;AVX2: cost of 1 {{.*}} fcmp
   %C = fcmp olt <8 x float> undef, undef
-  ;CHECK: cost of 1 {{.*}} fcmp
+  ;AVX1: cost of 1 {{.*}} fcmp
+  ;AVX2: cost of 1 {{.*}} fcmp
   %D = fcmp olt <2 x double> undef, undef
-  ;CHECK: cost of 1 {{.*}} fcmp
+  ;AVX1: cost of 1 {{.*}} fcmp
+  ;AVX2: cost of 1 {{.*}} fcmp
   %E = fcmp olt <4 x double> undef, undef
 
   ;  -- integers --
 
-  ;CHECK: cost of 1 {{.*}} icmp
+  ;AVX1: cost of 1 {{.*}} icmp
+  ;AVX2: cost of 1 {{.*}} icmp
   %F = icmp eq <16 x i8> undef, undef
-  ;CHECK: cost of 1 {{.*}} icmp
+  ;AVX1: cost of 1 {{.*}} icmp
+  ;AVX2: cost of 1 {{.*}} icmp
   %G = icmp eq <8 x i16> undef, undef
-  ;CHECK: cost of 1 {{.*}} icmp
+  ;AVX1: cost of 1 {{.*}} icmp
+  ;AVX2: cost of 1 {{.*}} icmp
   %H = icmp eq <4 x i32> undef, undef
-  ;CHECK: cost of 1 {{.*}} icmp
+  ;AVX1: cost of 1 {{.*}} icmp
+  ;AVX2: cost of 1 {{.*}} icmp
   %I = icmp eq <2 x i64> undef, undef
-  ;CHECK: cost of 4 {{.*}} icmp
+  ;AVX1: cost of 4 {{.*}} icmp
+  ;AVX2: cost of 1 {{.*}} icmp
   %J = icmp eq <4 x i64> undef, undef
-  ;CHECK: cost of 4 {{.*}} icmp
+  ;AVX1: cost of 4 {{.*}} icmp
+  ;AVX2: cost of 1 {{.*}} icmp
   %K = icmp eq <8 x i32> undef, undef
-  ;CHECK: cost of 4 {{.*}} icmp
+  ;AVX1: cost of 4 {{.*}} icmp
+  ;AVX2: cost of 1 {{.*}} icmp
   %L = icmp eq <16 x i16> undef, undef
-  ;CHECK: cost of 4 {{.*}} icmp
+  ;AVX1: cost of 4 {{.*}} icmp
+  ;AVX2: cost of 1 {{.*}} icmp
   %M = icmp eq <32 x i8> undef, undef
 
   ;CHECK: cost of 0 {{.*}} ret
diff --git a/test/Analysis/CostModel/X86/i32.ll b/test/Analysis/CostModel/X86/i32.ll
index 52c295934c..c2dce762a0 100644
--- a/test/Analysis/CostModel/X86/i32.ll
+++ b/test/Analysis/CostModel/X86/i32.ll
@@ -1,7 +1,5 @@
 ; RUN: opt < %s  -cost-model -analyze -mtriple=i386 -mcpu=corei7-avx | FileCheck %s
 
-
-;CHECK: cost of 2 {{.*}} add
 ;CHECK: cost of 0 {{.*}} ret
 define i32 @no_info(i32 %arg) {
   %e = add i64 undef, undef
diff --git a/test/Analysis/CostModel/X86/load_store.ll b/test/Analysis/CostModel/X86/load_store.ll
new file mode 100644
index 0000000000..4195b1d879
--- /dev/null
+++ b/test/Analysis/CostModel/X86/load_store.ll
@@ -0,0 +1,64 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @stores(i32 %arg) {
+
+  ;CHECK: cost of 1 {{.*}} store
+  store i8 undef, i8* undef, align 4
+  ;CHECK: cost of 1 {{.*}} store
+  store i16 undef, i16* undef, align 4
+  ;CHECK: cost of 1 {{.*}} store
+  store i32 undef, i32* undef, align 4
+  ;CHECK: cost of 1 {{.*}} store
+  store i64 undef, i64* undef, align 4
+  ;CHECK: cost of 2 {{.*}} store
+  store i128 undef, i128* undef, align 4
+
+  ;CHECK: cost of 1 {{.*}} store
+  store <4 x i16> undef, <4 x i16>* undef, align 4
+  ;CHECK: cost of 1 {{.*}} store
+  store <4 x i32> undef, <4 x i32>* undef, align 4
+  ;CHECK: cost of 2 {{.*}} store
+  store <4 x i64> undef, <4 x i64>* undef, align 4
+
+  ;CHECK: cost of 1 {{.*}} store
+  store <8 x i16> undef, <8 x i16>* undef, align 4
+  ;CHECK: cost of 2 {{.*}} store
+  store <8 x i32> undef, <8 x i32>* undef, align 4
+  ;CHECK: cost of 4 {{.*}} store
+  store <8 x i64> undef, <8 x i64>* undef, align 4
+
+  ret i32 undef
+}
+define i32 @loads(i32 %arg) {
+  ;CHECK: cost of 1 {{.*}} load
+  load i8* undef, align 4
+  ;CHECK: cost of 1 {{.*}} load
+  load i16* undef, align 4
+  ;CHECK: cost of 1 {{.*}} load
+  load i32* undef, align 4
+  ;CHECK: cost of 1 {{.*}} load
+  load i64* undef, align 4
+  ;CHECK: cost of 2 {{.*}} load
+  load i128* undef, align 4
+
+  ;CHECK: cost of 1 {{.*}} load
+  load <2 x i32>* undef, align 4
+  ;CHECK: cost of 1 {{.*}} load
+  load <4 x i32>* undef, align 4
+  ;CHECK: cost of 2 {{.*}} load
+  load <8 x i32>* undef, align 4
+
+
+  ;CHECK: cost of 1 {{.*}} load
+  load <2 x i64>* undef, align 4
+  ;CHECK: cost of 2 {{.*}} load
+  load <4 x i64>* undef, align 4
+  ;CHECK: cost of 4 {{.*}} load
+  load <8 x i64>* undef, align 4
+
+  ret i32 undef
+}
+
diff --git a/test/Analysis/CostModel/X86/vectorized-loop.ll b/test/Analysis/CostModel/X86/vectorized-loop.ll
index 6c9e111bb1..25b11145c6 100644
--- a/test/Analysis/CostModel/X86/vectorized-loop.ll
+++ b/test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -28,16 +28,17 @@ vector.body:                                      ; preds = %for.body.lr.ph, %ve
   %4 = getelementptr inbounds i32* %B, i64 %3
   ;CHECK: cost of 0 {{.*}} bitcast
   %5 = bitcast i32* %4 to <8 x i32>*
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK: cost of 2 {{.*}} load
   %6 = load <8 x i32>* %5, align 4
   ;CHECK: cost of 4 {{.*}} mul
   %7 = mul nsw <8 x i32> %6, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
   %8 = getelementptr inbounds i32* %A, i64 %index
   %9 = bitcast i32* %8 to <8 x i32>*
+  ;CHECK: cost of 2 {{.*}} load
   %10 = load <8 x i32>* %9, align 4
   ;CHECK: cost of 4 {{.*}} add
   %11 = add nsw <8 x i32> %10, %7
-  ;CHECK: cost of 1 {{.*}} store
+  ;CHECK: cost of 2 {{.*}} store
   store <8 x i32> %11, <8 x i32>* %9, align 4
   %index.next = add i64 %index, 8
   %12 = icmp eq i64 %index.next, %end.idx.rnd.down
diff --git a/test/Analysis/CostModel/no_info.ll b/test/Analysis/CostModel/no_info.ll
index d20d56b79a..f3f165b1b5 100644
--- a/test/Analysis/CostModel/no_info.ll
+++ b/test/Analysis/CostModel/no_info.ll
@@ -1,11 +1,8 @@
 ; RUN: opt < %s -cost-model -analyze | FileCheck %s
 
 ; The cost model does not have any target information so it can't make a decision.
-; Notice that OPT does not read the triple information from the module itself, only through the command line.
 
-; This info ignored:
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.8.0"
+; -- No triple in this module --
 
 ;CHECK: Unknown cost {{.*}} add
 ;CHECK: Unknown cost {{.*}} ret
diff --git a/test/Analysis/Dominators/invoke.ll b/test/Analysis/Dominators/invoke.ll
index f935750c98..da0b246165 100644
--- a/test/Analysis/Dominators/invoke.ll
+++ b/test/Analysis/Dominators/invoke.ll
@@ -1,4 +1,4 @@
-; RUN: opt -verify -disable-output %s
+; RUN: opt -verify -disable-output < %s
 ; This tests that we handle unreachable blocks correctly
 
 define void @f() {
diff --git a/test/Analysis/RegionInfo/20100809_bb_not_in_domtree.ll b/test/Analysis/RegionInfo/20100809_bb_not_in_domtree.ll
index 218b4375f7..0dfa0bf9cd 100644
--- a/test/Analysis/RegionInfo/20100809_bb_not_in_domtree.ll
+++ b/test/Analysis/RegionInfo/20100809_bb_not_in_domtree.ll
@@ -1,4 +1,4 @@
-; RUN: opt -regions %s
+; RUN: opt -regions < %s
 define i32 @main() nounwind {
 entry:
   br label %for.cond
diff --git a/test/Analysis/ScalarEvolution/2010-09-03-RequiredTransitive.ll b/test/Analysis/ScalarEvolution/2010-09-03-RequiredTransitive.ll
index aba0ce7467..5a02398104 100644
--- a/test/Analysis/ScalarEvolution/2010-09-03-RequiredTransitive.ll
+++ b/test/Analysis/ScalarEvolution/2010-09-03-RequiredTransitive.ll
@@ -1,8 +1,10 @@
-; RUN: opt -indvars -scalar-evolution -analyze %s
+; RUN: opt -indvars -scalar-evolution -analyze < %s | FileCheck %s
 ; This test checks if the SCEV analysis is printed out at all.
 ; It failed once as the RequiredTransitive option was not implemented
 ; correctly.
 
+; CHECK: Classifying expressions for: @main
+
 define i32 @main() nounwind {
 entry:
   br label %for.cond
diff --git a/test/Analysis/ScalarEvolution/2011-03-09-ExactNoMaxBECount.ll b/test/Analysis/ScalarEvolution/2011-03-09-ExactNoMaxBECount.ll
index 9f17e27577..49e944dcd2 100644
--- a/test/Analysis/ScalarEvolution/2011-03-09-ExactNoMaxBECount.ll
+++ b/test/Analysis/ScalarEvolution/2011-03-09-ExactNoMaxBECount.ll
@@ -1,4 +1,4 @@
-; RUN: opt -indvars  %s
+; RUN: opt -indvars < %s
 ; PR9424: Attempt to use a SCEVCouldNotCompute object!
 ; The inner loop computes the Step and Start of the outer loop.
 ; Call that Vexit. The outer End value is max(2,Vexit), because
diff --git a/test/Analysis/ScalarEvolution/fold.ll b/test/Analysis/ScalarEvolution/fold.ll
index 4e2adf187e..57006dd9bb 100644
--- a/test/Analysis/ScalarEvolution/fold.ll
+++ b/test/Analysis/ScalarEvolution/fold.ll
@@ -1,4 +1,4 @@
-; RUN: opt -analyze -scalar-evolution %s -S | FileCheck %s
+; RUN: opt -analyze -scalar-evolution -S < %s | FileCheck %s
 
 define i16 @test1(i8 %x) {
   %A = zext i8 %x to i12
diff --git a/test/CodeGen/ARM/2010-11-30-reloc-movt.ll b/test/CodeGen/ARM/2010-11-30-reloc-movt.ll
index 8b164c5d91..94a05412f5 100644
--- a/test/CodeGen/ARM/2010-11-30-reloc-movt.ll
+++ b/test/CodeGen/ARM/2010-11-30-reloc-movt.ll
@@ -23,7 +23,7 @@ entry:
 
 ; OBJ:            Relocation 0
 ; OBJ-NEXT:       'r_offset', 0x00000004
-; OBJ-NEXT:       'r_sym', 0x000007
+; OBJ-NEXT:       'r_sym', 0x000009
 ; OBJ-NEXT:        'r_type', 0x2b
 
 ; OBJ:          Relocation 1
@@ -33,7 +33,7 @@ entry:
 
 ; OBJ:          # Relocation 2
 ; OBJ-NEXT:       'r_offset', 0x0000000c
-; OBJ-NEXT:       'r_sym', 0x000008
+; OBJ-NEXT:       'r_sym', 0x00000a
 ; OBJ-NEXT:       'r_type', 0x1c
 
 }
diff --git a/test/CodeGen/ARM/alloc-no-stack-realign-error.ll b/test/CodeGen/ARM/alloc-no-stack-realign-error.ll
new file mode 100644
index 0000000000..96c00174db
--- /dev/null
+++ b/test/CodeGen/ARM/alloc-no-stack-realign-error.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -O0 -realign-stack=0 2>&1 | FileCheck %s
+
+; rdar://12713765
+@T3_retval = common global <16 x float> zeroinitializer, align 16
+
+; If alignment for alloca is smaller than or equal to stack alignment, but the
+; preferred type alignment is bigger, the alignment will be clamped.
+; If alignment for alloca is bigger than stack alignment, the compiler
+; will emit an error.
+define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp {
+entry:
+; CHECK: Requested Minimal Alignment exceeds the Stack Alignment!
+ %retval = alloca <16 x float>, align 16
+ %0 = load <16 x float>* @T3_retval, align 16
+ store <16 x float> %0, <16 x float>* %retval
+ %1 = load <16 x float>* %retval
+ store <16 x float> %1, <16 x float>* %agg.result, align 16
+ ret void
+}
diff --git a/test/CodeGen/ARM/alloc-no-stack-realign.ll b/test/CodeGen/ARM/alloc-no-stack-realign.ll
index 273041dee3..94adc9c67d 100644
--- a/test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ b/test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -39,7 +39,7 @@ entry:
 ; NO-REALIGN: add [[R2:r[0-9]+]], [[R1:r[0-9]+]], #16
 ; NO-REALIGN: vst1.64
 ; NO-REALIGN: vst1.64
- %retval = alloca <16 x float>, align 16
+ %retval = alloca <16 x float>, align 4
  %0 = load <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
  %1 = load <16 x float>* %retval
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 96e83dd88e..d98925ef8f 100644
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -49,3 +49,37 @@ while.body:
 while.end:
   ret void
 }
+
+; Allow partial CPSR dependency when code size is the priority.
+; rdar://12878928
+define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
+entry:
+; CHECK: t3:
+  %tobool7 = icmp eq i32* %ptr2, null
+  br i1 %tobool7, label %while.end, label %while.body
+
+while.body:
+; CHECK: while.body
+; CHECK: mul r{{[0-9]+}}
+; CHECK: muls
+  %ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ]
+  %ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ]
+  %0 = load i32* %ptr1.addr.09, align 4
+  %arrayidx1 = getelementptr inbounds i32* %ptr1.addr.09, i32 1
+  %1 = load i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds i32* %ptr1.addr.09, i32 2
+  %2 = load i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32* %ptr1.addr.09, i32 3
+  %3 = load i32* %arrayidx4, align 4
+  %add.ptr = getelementptr inbounds i32* %ptr1.addr.09, i32 4
+  %mul = mul i32 %1, %0
+  %mul5 = mul i32 %mul, %2
+  %mul6 = mul i32 %mul5, %3
+  store i32 %mul6, i32* %ptr2.addr.08, align 4
+  %incdec.ptr = getelementptr inbounds i32* %ptr2.addr.08, i32 -1
+  %tobool = icmp eq i32* %incdec.ptr, null
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
diff --git a/test/CodeGen/ARM/bfx.ll b/test/CodeGen/ARM/bfx.ll
index 519c1353a3..394da9e157 100644
--- a/test/CodeGen/ARM/bfx.ll
+++ b/test/CodeGen/ARM/bfx.ll
@@ -26,3 +26,28 @@ define i32 @ubfx2(i32 %a) {
 	ret i32 %t2
 }
 
+; rdar://12870177
+define i32 @ubfx_opt(i32* nocapture %ctx, i32 %x) nounwind readonly ssp {
+entry:
+; CHECK: ubfx_opt
+; CHECK: lsr [[REG1:(lr|r[0-9]+)]], r1, #24
+; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG1]], lsl #2]
+; CHECK: ubfx [[REG2:(lr|r[0-9]+)]], r1, #16, #8
+; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG2]], lsl #2]
+; CHECK: ubfx [[REG3:(lr|r[0-9]+)]], r1, #8, #8
+; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG3]], lsl #2]
+  %and = lshr i32 %x, 8
+  %shr = and i32 %and, 255
+  %and1 = lshr i32 %x, 16
+  %shr2 = and i32 %and1, 255
+  %shr4 = lshr i32 %x, 24
+  %arrayidx = getelementptr inbounds i32* %ctx, i32 %shr4
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx5 = getelementptr inbounds i32* %ctx, i32 %shr2
+  %1 = load i32* %arrayidx5, align 4
+  %add = add i32 %1, %0
+  %arrayidx6 = getelementptr inbounds i32* %ctx, i32 %shr
+  %2 = load i32* %arrayidx6, align 4
+  %add7 = add i32 %add, %2
+  ret i32 %add7
+}
diff --git a/test/CodeGen/ARM/global-merge-addrspace.ll b/test/CodeGen/ARM/global-merge-addrspace.ll
new file mode 100644
index 0000000000..0efa690bde
--- /dev/null
+++ b/test/CodeGen/ARM/global-merge-addrspace.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s
+; Test the GlobalMerge pass. Check that the pass does not crash when using
+; multiple address spaces.
+
+; CHECK: _MergedGlobals:
+@g1 = internal addrspace(1) global i32 1
+@g2 = internal addrspace(1) global i32 2
+
+
+; CHECK: _MergedGlobals1:
+@g3 = internal addrspace(2) global i32 3
+@g4 = internal addrspace(2) global i32 4
diff --git a/test/CodeGen/Generic/dag-combine-crash.ll b/test/CodeGen/Generic/dag-combine-crash.ll
new file mode 100644
index 0000000000..a7810b5c05
--- /dev/null
+++ b/test/CodeGen/Generic/dag-combine-crash.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s
+
+define void @main()  {
+if.end:
+  br label %block.i.i
+
+block.i.i:
+  %tmpbb = load i8* undef
+  %tmp54 = zext i8 %tmpbb to i64
+  %tmp59 = and i64 %tmp54, 8
+  %tmp60 = add i64 %tmp59, 3691045929300498764
+  %tmp62 = sub i64 %tmp60, 3456506383779105993
+  %tmp63 = xor i64 1050774804270620004, %tmp62
+  %tmp65 = xor i64 %tmp62, 234539545521392771
+  %tmp67 = or i64 %tmp65, %tmp63
+  %tmp71 = xor i64 %tmp67, 6781485823212740913
+  %tmp72 = trunc i64 %tmp71 to i32
+  %tmp74 = lshr i32 2, %tmp72
+  store i32 %tmp74, i32* undef
+  br label %block.i.i
+}
diff --git a/test/CodeGen/Generic/inline-asm-mem-clobber.ll b/test/CodeGen/Generic/inline-asm-mem-clobber.ll
new file mode 100644
index 0000000000..e523d031dc
--- /dev/null
+++ b/test/CodeGen/Generic/inline-asm-mem-clobber.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O2 < %s | FileCheck %s
+
+@G = common global i32 0, align 4
+
+define i32 @foo(i8* %p) nounwind uwtable {
+entry:
+  %p.addr = alloca i8*, align 8
+  %rv = alloca i32, align 4
+  store i8* %p, i8** %p.addr, align 8
+  store i32 0, i32* @G, align 4
+  %0 = load i8** %p.addr, align 8
+; CHECK: blah
+  %1 = call i32 asm "blah", "=r,r,~{memory}"(i8* %0) nounwind
+; CHECK: @G
+  store i32 %1, i32* %rv, align 4
+  %2 = load i32* %rv, align 4
+  %3 = load i32* @G, align 4
+  %add = add nsw i32 %2, %3
+  ret i32 %add
+}
+
diff --git a/test/CodeGen/Mips/2012-12-12-ExpandMemcpy.ll b/test/CodeGen/Mips/2012-12-12-ExpandMemcpy.ll
new file mode 100644
index 0000000000..9d4daee696
--- /dev/null
+++ b/test/CodeGen/Mips/2012-12-12-ExpandMemcpy.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=mips64el -mcpu=mips64r2 < %s
+
+@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1
+
+define void @t(i8* %ptr) {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %ptr, i8* getelementptr inbounds ([7 x i8]* @.str, i64 0, i64 0), i64 7, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/Mips/alloca.ll b/test/CodeGen/Mips/alloca.ll
index 29f43c8afa..220f33bd45 100644
--- a/test/CodeGen/Mips/alloca.ll
+++ b/test/CodeGen/Mips/alloca.ll
@@ -3,11 +3,11 @@
 define i32 @twoalloca(i32 %size) nounwind {
 entry:
 ; CHECK: subu  $[[T0:[0-9]+]], $sp, $[[SZ:[0-9]+]]
-; CHECK: addu  $sp, $zero, $[[T0]]
+; CHECK: or    $sp, $[[T0]], $zero
 ; CHECK: subu  $[[T2:[0-9]+]], $sp, $[[SZ]]
-; CHECK: addu  $sp, $zero, $[[T2]]
-; CHECK: addu  $4, $zero, $[[T0]]
-; CHECK: addu  $4, $zero, $[[T2]]
+; CHECK: or    $sp, $[[T2]], $zero
+; CHECK: or    $4, $[[T0]], $zero
+; CHECK: or    $4, $[[T2]], $zero
   %tmp1 = alloca i8, i32 %size, align 4
   %add.ptr = getelementptr inbounds i8* %tmp1, i32 5
   store i8 97, i8* %add.ptr, align 1
@@ -29,7 +29,7 @@ define i32 @alloca2(i32 %size) nounwind {
 entry:
 ; CHECK: alloca2
 ; CHECK: subu  $[[T0:[0-9]+]], $sp
-; CHECK: addu  $sp, $zero, $[[T0]]
+; CHECK: or    $sp, $[[T0]], $zero
 
   %tmp1 = alloca i8, i32 %size, align 4
   %0 = bitcast i8* %tmp1 to i32*
diff --git a/test/CodeGen/Mips/alloca16.ll b/test/CodeGen/Mips/alloca16.ll
index 731edae43c..5ae9a84791 100644
--- a/test/CodeGen/Mips/alloca16.ll
+++ b/test/CodeGen/Mips/alloca16.ll
@@ -68,8 +68,8 @@ entry:
   %21 = load i32** %ip, align 4
   %arrayidx6 = getelementptr inbounds i32* %21, i32 %20
   %22 = load i32* %arrayidx6, align 4
-; 16: 	save	16
+; 16: 	addiu $sp, -16
   call void @temp(i32 %22)
-; 16: 	restore	16
+; 16: 	addiu $sp, 16
   ret void
 }
diff --git a/test/CodeGen/Mips/ex2.ll b/test/CodeGen/Mips/ex2.ll
new file mode 100644
index 0000000000..67d19e4b84
--- /dev/null
+++ b/test/CodeGen/Mips/ex2.ll
@@ -0,0 +1,29 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@.str = private unnamed_addr constant [6 x i8] c"hello\00", align 1
+@_ZTIPKc = external constant i8*
+
+define i32 @main() {
+; 16: main:
+; 16: 	.cfi_startproc
+; 16: 	save	$ra, $s0, $s1, 32
+; 16:   .cfi_offset 17, -8
+; 16: 	.cfi_offset 16, -12
+; 16: 	.cfi_offset 31, -4
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %exception = call i8* @__cxa_allocate_exception(i32 4) nounwind
+  %0 = bitcast i8* %exception to i8**
+  store i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i8** %0
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIPKc to i8*), i8* null) noreturn
+  unreachable
+
+return:                                           ; No predecessors!
+  %1 = load i32* %retval
+  ret i32 %1
+}
+
+declare i8* @__cxa_allocate_exception(i32)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
diff --git a/test/CodeGen/Mips/frame-address.ll b/test/CodeGen/Mips/frame-address.ll
index 9df1808fde..e64e6d8cfe 100644
--- a/test/CodeGen/Mips/frame-address.ll
+++ b/test/CodeGen/Mips/frame-address.ll
@@ -8,5 +8,5 @@ entry:
   ret i8* %0
 
 ; CHECK:   addu    $fp, $sp, $zero
-; CHECK:   addu    $2, $zero, $fp
+; CHECK:   or      $2, $fp, $zero
 }
diff --git a/test/CodeGen/Mips/gpreg-lazy-binding.ll b/test/CodeGen/Mips/gpreg-lazy-binding.ll
new file mode 100644
index 0000000000..bb3ad4264e
--- /dev/null
+++ b/test/CodeGen/Mips/gpreg-lazy-binding.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | FileCheck %s 
+
+@g = external global i32
+
+; CHECK:     or    $gp
+; CHECK:     jalr  $25
+; CHECK:     nop
+; CHECK-NOT: or    $gp
+; CHECK:     jalr  $25
+
+define void @f0() nounwind {
+entry:
+  tail call void @externalFunc() nounwind
+  tail call fastcc void @internalFunc()
+  ret void
+}
+
+declare void @externalFunc()
+
+define internal fastcc void @internalFunc() nounwind noinline {
+entry:
+  %0 = load i32* @g, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @g, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/Mips/i64arg.ll b/test/CodeGen/Mips/i64arg.ll
index 8b1f71b69f..e16e126af4 100644
--- a/test/CodeGen/Mips/i64arg.ll
+++ b/test/CodeGen/Mips/i64arg.ll
@@ -2,8 +2,8 @@
 
 define void @f1(i64 %ll1, float %f, i64 %ll, i32 %i, float %f2) nounwind {
 entry:
-; CHECK: addu $[[R1:[0-9]+]], $zero, $5
-; CHECK: addu $[[R0:[0-9]+]], $zero, $4
+; CHECK: or  $[[R1:[0-9]+]], $5, $zero
+; CHECK: or  $[[R0:[0-9]+]], $4, $zero
 ; CHECK: ori $6, ${{[0-9]+}}, 3855
 ; CHECK: ori $7, ${{[0-9]+}}, 22136
 ; CHECK: lw  $25, %call16(ff1)
@@ -12,16 +12,16 @@ entry:
 ; CHECK: lw $25, %call16(ff2)
 ; CHECK: lw $[[R2:[0-9]+]], 80($sp)
 ; CHECK: lw $[[R3:[0-9]+]], 84($sp)
-; CHECK: addu $4, $zero, $[[R2]]
-; CHECK: addu $5, $zero, $[[R3]]
+; CHECK: or $4, $[[R2]], $zero
+; CHECK: or $5, $[[R3]], $zero
 ; CHECK: jalr $25
   tail call void @ff2(i64 %ll, double 3.000000e+00) nounwind
   %sub = add nsw i32 %i, -1
 ; CHECK: sw $[[R1]], 28($sp)
 ; CHECK: sw $[[R0]], 24($sp)
 ; CHECK: lw $25, %call16(ff3)
-; CHECK: addu $6, $zero, $[[R2]]
-; CHECK: addu $7, $zero, $[[R3]]
+; CHECK: or $6, $[[R2]], $zero
+; CHECK: or $7, $[[R3]], $zero
 ; CHECK: jalr $25
   tail call void @ff3(i32 %i, i64 %ll, i32 %sub, i64 %ll1) nounwind
   ret void
diff --git a/test/CodeGen/Mips/mips16ex.ll b/test/CodeGen/Mips/mips16ex.ll
new file mode 100644
index 0000000000..ecb30b5c63
--- /dev/null
+++ b/test/CodeGen/Mips/mips16ex.ll
@@ -0,0 +1,87 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+;16: $eh_func_begin0=.
+@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1
+@_ZTIi = external constant i8*
+@.str1 = private unnamed_addr constant [15 x i8] c"exception %i \0A\00", align 1
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  %e = alloca i32, align 4
+  store i32 0, i32* %retval
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0))
+  %exception = call i8* @__cxa_allocate_exception(i32 4) nounwind
+  %0 = bitcast i8* %exception to i32*
+  store i32 20, i32* %0
+  invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+          to label %unreachable unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %2 = extractvalue { i8*, i32 } %1, 0
+  store i8* %2, i8** %exn.slot
+  %3 = extractvalue { i8*, i32 } %1, 1
+  store i32 %3, i32* %ehselector.slot
+  br label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %lpad
+  %sel = load i32* %ehselector.slot
+  %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %matches = icmp eq i32 %sel, %4
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %catch.dispatch
+  %exn = load i8** %exn.slot
+  %5 = call i8* @__cxa_begin_catch(i8* %exn) nounwind
+  %6 = bitcast i8* %5 to i32*
+  %exn.scalar = load i32* %6
+  store i32 %exn.scalar, i32* %e, align 4
+  %7 = load i32* %e, align 4
+  %call2 = invoke i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 %7)
+          to label %invoke.cont unwind label %lpad1
+
+invoke.cont:                                      ; preds = %catch
+  call void @__cxa_end_catch() nounwind
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont
+  ret i32 0
+
+lpad1:                                            ; preds = %catch
+  %8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  %9 = extractvalue { i8*, i32 } %8, 0
+  store i8* %9, i8** %exn.slot
+  %10 = extractvalue { i8*, i32 } %8, 1
+  store i32 %10, i32* %ehselector.slot
+  call void @__cxa_end_catch() nounwind
+  br label %eh.resume
+
+eh.resume:                                        ; preds = %lpad1, %catch.dispatch
+  %exn3 = load i8** %exn.slot
+  %sel4 = load i32* %ehselector.slot
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn3, 0
+  %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %sel4, 1
+  resume { i8*, i32 } %lpad.val5
+
+unreachable:                                      ; preds = %entry
+  unreachable
+}
+
+declare i32 @printf(i8*, ...)
+
+declare i8* @__cxa_allocate_exception(i32)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
diff --git a/test/CodeGen/Mips/mips16fpe.ll b/test/CodeGen/Mips/mips16fpe.ll
new file mode 100644
index 0000000000..4335436079
--- /dev/null
+++ b/test/CodeGen/Mips/mips16fpe.ll
@@ -0,0 +1,381 @@
+; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 -soft-float -mips16-hard-float < %s | FileCheck %s -check-prefix=16hf
+
+@x = global float 5.000000e+00, align 4  ; single-precision operand
+@y = global float 1.500000e+01, align 4  ; single-precision operand
+@xd = global double 6.000000e+00, align 8  ; double-precision operand
+@yd = global double 1.800000e+01, align 8  ; double-precision operand
+@two = global i32 2, align 4  ; not referenced by any function in this file
+@addsf3_result = common global float 0.000000e+00, align 4  ; each *_result global receives the output of the test of the same name
+@adddf3_result = common global double 0.000000e+00, align 8
+@subsf3_result = common global float 0.000000e+00, align 4
+@subdf3_result = common global double 0.000000e+00, align 8
+@mulsf3_result = common global float 0.000000e+00, align 4
+@muldf3_result = common global double 0.000000e+00, align 8
+@divsf3_result = common global float 0.000000e+00, align 4
+@divdf3_result = common global double 0.000000e+00, align 8
+@extendsfdf2_result = common global double 0.000000e+00, align 8
+@xd2 = global double 0x40147E6B74B4CF6A, align 8  ; hex-encoded double, roughly 5.1235; input of test_truncdfsf2
+@truncdfsf2_result = common global float 0.000000e+00, align 4
+@fix_truncsfsi_result = common global i32 0, align 4
+@fix_truncdfsi_result = common global i32 0, align 4
+@si = global i32 -9, align 4  ; input of the sitofp tests
+@ui = global i32 9, align 4  ; input of the uitofp tests
+@floatsisf_result = common global float 0.000000e+00, align 4
+@floatsidf_result = common global double 0.000000e+00, align 8
+@floatunsisf_result = common global float 0.000000e+00, align 4
+@floatunsidf_result = common global double 0.000000e+00, align 8
+@xx = global float 5.000000e+00, align 4  ; same value as @x; second operand in several float compare tests
+@eqsf2_result = common global i32 0, align 4  ; compare results are stored as i32 0/1 (zext of the i1)
+@xxd = global double 6.000000e+00, align 8  ; same value as @xd; second operand in several double compare tests
+@eqdf2_result = common global i32 0, align 4
+@nesf2_result = common global i32 0, align 4
+@nedf2_result = common global i32 0, align 4
+@gesf2_result = common global i32 0, align 4
+@gedf2_result = common global i32 0, align 4
+@ltsf2_result = common global i32 0, align 4
+@ltdf2_result = common global i32 0, align 4
+@lesf2_result = common global i32 0, align 4
+@ledf2_result = common global i32 0, align 4
+@gtsf2_result = common global i32 0, align 4
+@gtdf2_result = common global i32 0, align 4
+
+define void @test_addsf3() nounwind {  ; float add: @x + @y, stored to @addsf3_result
+entry:
+;16hf: test_addsf3:
+  %0 = load float* @x, align 4
+  %1 = load float* @y, align 4
+  %add = fadd float %0, %1  ; the check line below looks for the %call16 load of __mips16_addsf3
+  store float %add, float* @addsf3_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_addsf3)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_adddf3() nounwind {  ; double add: @xd + @yd, stored to @adddf3_result
+entry:
+;16hf: test_adddf3:
+  %0 = load double* @xd, align 8
+  %1 = load double* @yd, align 8
+  %add = fadd double %0, %1  ; the check line below looks for the %call16 load of __mips16_adddf3
+  store double %add, double* @adddf3_result, align 8
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_adddf3)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_subsf3() nounwind {  ; float subtract: @x - @y, stored to @subsf3_result
+entry:
+;16hf: test_subsf3:
+  %0 = load float* @x, align 4
+  %1 = load float* @y, align 4
+  %sub = fsub float %0, %1  ; the check line below looks for the %call16 load of __mips16_subsf3
+  store float %sub, float* @subsf3_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_subsf3)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_subdf3() nounwind {  ; double subtract: @xd - @yd, stored to @subdf3_result
+entry:
+;16hf: test_subdf3:
+  %0 = load double* @xd, align 8
+  %1 = load double* @yd, align 8
+  %sub = fsub double %0, %1  ; the check line below looks for the %call16 load of __mips16_subdf3
+  store double %sub, double* @subdf3_result, align 8
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_subdf3)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_mulsf3() nounwind {  ; float multiply: @x * @y, stored to @mulsf3_result
+entry:
+;16hf: test_mulsf3:
+  %0 = load float* @x, align 4
+  %1 = load float* @y, align 4
+  %mul = fmul float %0, %1  ; the check line below looks for the %call16 load of __mips16_mulsf3
+  store float %mul, float* @mulsf3_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_mulsf3)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_muldf3() nounwind {  ; double multiply: @xd * @yd, stored to @muldf3_result
+entry:
+;16hf: test_muldf3:
+  %0 = load double* @xd, align 8
+  %1 = load double* @yd, align 8
+  %mul = fmul double %0, %1  ; the check line below looks for the %call16 load of __mips16_muldf3
+  store double %mul, double* @muldf3_result, align 8
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_muldf3)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_divsf3() nounwind {  ; float divide: @y / @x (note the operand order), stored to @divsf3_result
+entry:
+;16hf: test_divsf3:
+  %0 = load float* @y, align 4  ; dividend
+  %1 = load float* @x, align 4  ; divisor
+  %div = fdiv float %0, %1  ; the check line below looks for the %call16 load of __mips16_divsf3
+  store float %div, float* @divsf3_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_divsf3)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_divdf3() nounwind {  ; double divide: (@yd * 2.0) / @xd, stored to @divdf3_result
+entry:
+;16hf: test_divdf3:
+  %0 = load double* @yd, align 8
+  %mul = fmul double %0, 2.000000e+00  ; scales the dividend; no check line in this function matches this multiply
+  %1 = load double* @xd, align 8
+  %div = fdiv double %mul, %1  ; the check line below looks for the %call16 load of __mips16_divdf3
+  store double %div, double* @divdf3_result, align 8
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_divdf3)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_extendsfdf2() nounwind {  ; widens float @x to double, stored to @extendsfdf2_result
+entry:
+;16hf: test_extendsfdf2:
+  %0 = load float* @x, align 4
+  %conv = fpext float %0 to double  ; the check line below looks for the %call16 load of __mips16_extendsfdf2
+  store double %conv, double* @extendsfdf2_result, align 8
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_extendsfdf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_truncdfsf2() nounwind {  ; narrows double @xd2 to float, stored to @truncdfsf2_result
+entry:
+;16hf: test_truncdfsf2:
+  %0 = load double* @xd2, align 8
+  %conv = fptrunc double %0 to float  ; the check line below looks for the %call16 load of __mips16_truncdfsf2
+  store float %conv, float* @truncdfsf2_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_truncdfsf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_fix_truncsfsi() nounwind {  ; converts float @x to signed i32, stored to @fix_truncsfsi_result
+entry:
+;16hf: test_fix_truncsfsi:
+  %0 = load float* @x, align 4
+  %conv = fptosi float %0 to i32  ; the check line below looks for the %call16 load of __mips16_fix_truncsfsi
+  store i32 %conv, i32* @fix_truncsfsi_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_fix_truncsfsi)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_fix_truncdfsi() nounwind {  ; converts double @xd to signed i32, stored to @fix_truncdfsi_result
+entry:
+;16hf: test_fix_truncdfsi:
+  %0 = load double* @xd, align 8
+  %conv = fptosi double %0 to i32  ; the check line below looks for the %call16 load of __mips16_fix_truncdfsi
+  store i32 %conv, i32* @fix_truncdfsi_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_fix_truncdfsi)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_floatsisf() nounwind {  ; converts signed i32 @si to float, stored to @floatsisf_result
+entry:
+;16hf: test_floatsisf:
+  %0 = load i32* @si, align 4
+  %conv = sitofp i32 %0 to float  ; the check line below looks for the %call16 load of __mips16_floatsisf
+  store float %conv, float* @floatsisf_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_floatsisf)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_floatsidf() nounwind {  ; converts signed i32 @si to double, stored to @floatsidf_result
+entry:
+;16hf: test_floatsidf:
+  %0 = load i32* @si, align 4
+  %conv = sitofp i32 %0 to double  ; the check line below looks for the %call16 load of __mips16_floatsidf
+  store double %conv, double* @floatsidf_result, align 8
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_floatsidf)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_floatunsisf() nounwind {  ; converts i32 @ui, treated as unsigned, to float, stored to @floatunsisf_result
+entry:
+;16hf: test_floatunsisf:
+  %0 = load i32* @ui, align 4
+  %conv = uitofp i32 %0 to float  ; the check line below looks for the %call16 load of __mips16_floatunsisf
+  store float %conv, float* @floatunsisf_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_floatunsisf)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_floatunsidf() nounwind {  ; converts i32 @ui, treated as unsigned, to double, stored to @floatunsidf_result
+entry:
+;16hf: test_floatunsidf:
+  %0 = load i32* @ui, align 4
+  %conv = uitofp i32 %0 to double  ; the check line below looks for the %call16 load of __mips16_floatunsidf
+  store double %conv, double* @floatunsidf_result, align 8
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_floatunsidf)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_eqsf2() nounwind {  ; float equality: @x == @xx, stored as 0/1 in @eqsf2_result
+entry:
+;16hf: test_eqsf2:
+  %0 = load float* @x, align 4
+  %1 = load float* @xx, align 4
+  %cmp = fcmp oeq float %0, %1  ; ordered equal; the check line below looks for the %call16 load of __mips16_eqsf2
+  %conv = zext i1 %cmp to i32  ; widen the i1 result to 0/1
+  store i32 %conv, i32* @eqsf2_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_eqsf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_eqdf2() nounwind {  ; double equality: @xd == @xxd, stored as 0/1 in @eqdf2_result
+entry:
+;16hf: test_eqdf2:
+  %0 = load double* @xd, align 8
+  %1 = load double* @xxd, align 8
+  %cmp = fcmp oeq double %0, %1  ; ordered equal; the check line below looks for the %call16 load of __mips16_eqdf2
+  %conv = zext i1 %cmp to i32  ; widen the i1 result to 0/1
+  store i32 %conv, i32* @eqdf2_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_eqdf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_nesf2() nounwind {  ; float inequality: @x != @y, stored as 0/1 in @nesf2_result
+entry:
+;16hf: test_nesf2:
+  %0 = load float* @x, align 4
+  %1 = load float* @y, align 4
+  %cmp = fcmp une float %0, %1  ; unordered-or-not-equal; the check line below looks for the %call16 load of __mips16_nesf2
+  %conv = zext i1 %cmp to i32  ; widen the i1 result to 0/1
+  store i32 %conv, i32* @nesf2_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_nesf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_nedf2() nounwind {  ; double inequality: @xd != @yd, stored as 0/1 in @nedf2_result
+entry:
+;16hf: test_nedf2:
+  %0 = load double* @xd, align 8
+  %1 = load double* @yd, align 8
+  %cmp = fcmp une double %0, %1  ; unordered-or-not-equal; the check line below looks for the %call16 load of __mips16_nedf2
+  %conv = zext i1 %cmp to i32  ; widen the i1 result to 0/1
+  store i32 %conv, i32* @nedf2_result, align 4
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_nedf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_gesf2() nounwind {  ; (@x >= @xx) and (@y >= @x) on floats, stored as 0/1 in @gesf2_result
+entry:
+;16hf: test_gesf2:
+  %0 = load float* @x, align 4
+  %1 = load float* @xx, align 4
+  %cmp = fcmp oge float %0, %1  ; ordered >=
+  %2 = load float* @y, align 4
+  %cmp1 = fcmp oge float %2, %0  ; second ordered >=, with @y on the left
+  %and3 = and i1 %cmp, %cmp1
+  %and = zext i1 %and3 to i32
+  store i32 %and, i32* @gesf2_result, align 4  ; the check line below looks for the %call16 load of __mips16_gesf2
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_gesf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_gedf2() nounwind {  ; (@xd >= @xxd) and (@yd >= @xd) on doubles, stored as 0/1 in @gedf2_result
+entry:
+;16hf: test_gedf2:
+  %0 = load double* @xd, align 8
+  %1 = load double* @xxd, align 8
+  %cmp = fcmp oge double %0, %1  ; ordered >=
+  %2 = load double* @yd, align 8
+  %cmp1 = fcmp oge double %2, %0  ; second ordered >=, with @yd on the left
+  %and3 = and i1 %cmp, %cmp1
+  %and = zext i1 %and3 to i32
+  store i32 %and, i32* @gedf2_result, align 4  ; the check line below looks for the %call16 load of __mips16_gedf2
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_gedf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_ltsf2() nounwind {  ; not(@x < @xx) and (@x < @y) on floats, stored as 0/1 in @ltsf2_result
+entry:
+;16hf: test_ltsf2:
+  %0 = load float* @x, align 4
+  %1 = load float* @xx, align 4
+  %lnot = fcmp uge float %0, %1  ; uge = unordered or >=, i.e. the negation of an ordered less-than
+  %2 = load float* @y, align 4
+  %cmp1 = fcmp olt float %0, %2  ; ordered less-than
+  %and2 = and i1 %lnot, %cmp1
+  %and = zext i1 %and2 to i32
+  store i32 %and, i32* @ltsf2_result, align 4  ; two helpers are expected below, in order: __mips16_unordsf2 then __mips16_ltsf2
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_unordsf2)(${{[0-9]+}})
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_ltsf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_ltdf2() nounwind {  ; not(@xd < @xxd) and (@xd < @yd) on doubles, stored as 0/1 in @ltdf2_result
+entry:
+;16hf: test_ltdf2:
+  %0 = load double* @xd, align 8
+  %1 = load double* @xxd, align 8
+  %lnot = fcmp uge double %0, %1  ; uge = unordered or >=, i.e. the negation of an ordered less-than
+  %2 = load double* @yd, align 8
+  %cmp1 = fcmp olt double %0, %2  ; ordered less-than
+  %and2 = and i1 %lnot, %cmp1
+  %and = zext i1 %and2 to i32
+  store i32 %and, i32* @ltdf2_result, align 4  ; two helpers are expected below, in order: __mips16_unorddf2 then __mips16_ltdf2
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_unorddf2)(${{[0-9]+}})
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_ltdf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_lesf2() nounwind {  ; (@x <= @xx) and (@x <= @y) on floats, stored as 0/1 in @lesf2_result
+entry:
+;16hf: test_lesf2:
+  %0 = load float* @x, align 4
+  %1 = load float* @xx, align 4
+  %cmp = fcmp ole float %0, %1  ; ordered <=
+  %2 = load float* @y, align 4
+  %cmp1 = fcmp ole float %0, %2  ; second ordered <=, again with @x on the left
+  %and3 = and i1 %cmp, %cmp1
+  %and = zext i1 %and3 to i32
+  store i32 %and, i32* @lesf2_result, align 4  ; the check line below looks for the %call16 load of __mips16_lesf2
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_lesf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_ledf2() nounwind {  ; (@xd <= @xxd) and (@xd <= @yd) on doubles, stored as 0/1 in @ledf2_result
+entry:
+;16hf: test_ledf2:
+  %0 = load double* @xd, align 8
+  %1 = load double* @xxd, align 8
+  %cmp = fcmp ole double %0, %1  ; ordered <=
+  %2 = load double* @yd, align 8
+  %cmp1 = fcmp ole double %0, %2  ; second ordered <=, again with @xd on the left
+  %and3 = and i1 %cmp, %cmp1
+  %and = zext i1 %and3 to i32
+  store i32 %and, i32* @ledf2_result, align 4  ; the check line below looks for the %call16 load of __mips16_ledf2
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_ledf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_gtsf2() nounwind {  ; not(@x > @xx) and (@y > @x) on floats, stored as 0/1 in @gtsf2_result
+entry:
+;16hf: test_gtsf2:
+  %0 = load float* @x, align 4
+  %1 = load float* @xx, align 4
+  %lnot = fcmp ule float %0, %1  ; ule = unordered or <=, i.e. the negation of an ordered greater-than
+  %2 = load float* @y, align 4
+  %cmp1 = fcmp ogt float %2, %0  ; ordered greater-than, with @y on the left
+  %and2 = and i1 %lnot, %cmp1
+  %and = zext i1 %and2 to i32
+  store i32 %and, i32* @gtsf2_result, align 4  ; only the __mips16_gtsf2 helper is matched below
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_gtsf2)(${{[0-9]+}})
+  ret void
+}
+
+define void @test_gtdf2() nounwind {  ; not(@xd > @xxd) and (@yd > @xd) on doubles, stored as 0/1 in @gtdf2_result
+entry:
+;16hf: test_gtdf2:
+  %0 = load double* @xd, align 8
+  %1 = load double* @xxd, align 8
+  %lnot = fcmp ule double %0, %1  ; ule = unordered or <=, i.e. the negation of an ordered greater-than
+  %2 = load double* @yd, align 8
+  %cmp1 = fcmp ogt double %2, %0  ; ordered greater-than, with @yd on the left
+  %and2 = and i1 %lnot, %cmp1
+  %and = zext i1 %and2 to i32
+  store i32 %and, i32* @gtdf2_result, align 4  ; only the __mips16_gtdf2 helper is matched below
+;16hf:  lw	${{[0-9]+}}, %call16(__mips16_gtdf2)(${{[0-9]+}})
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll
index e26b0223b4..eb08e700bc 100644
--- a/test/CodeGen/Mips/mips64-sret.ll
+++ b/test/CodeGen/Mips/mips64-sret.ll
@@ -6,7 +6,7 @@
 
 define void @f(%struct.S* noalias sret %agg.result) nounwind {
 entry:
-; CHECK: daddu $2, $zero, $4
+; CHECK: or $2, $4, $zero
 
   %0 = bitcast %struct.S* %agg.result to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.S* @g to i8*), i64 32, i32 4, i1 false)
diff --git a/test/CodeGen/Mips/return_address.ll b/test/CodeGen/Mips/return_address.ll
index e1c9241984..3bcd5601ee 100644
--- a/test/CodeGen/Mips/return_address.ll
+++ b/test/CodeGen/Mips/return_address.ll
@@ -5,7 +5,7 @@ entry:
   %0 = call i8* @llvm.returnaddress(i32 0)
   ret i8* %0
 
-; CHECK:    addu    $2, $zero, $ra
+; CHECK:    or    $2, $ra, $zero
 }
 
 define i8* @f2() nounwind {
@@ -14,9 +14,9 @@ entry:
   %0 = call i8* @llvm.returnaddress(i32 0)
   ret i8* %0
 
-; CHECK:    addu    $[[R0:[0-9]+]], $zero, $ra
+; CHECK:    or    $[[R0:[0-9]+]], $ra, $zero
 ; CHECK:    jal
-; CHECK:    addu    $2,  $zero, $[[R0]]
+; CHECK:    or    $2, $[[R0]], $zero
 }
 
 declare i8* @llvm.returnaddress(i32) nounwind readnone
diff --git a/test/CodeGen/Mips/vector-setcc.ll b/test/CodeGen/Mips/vector-setcc.ll
new file mode 100644
index 0000000000..aeff4918c8
--- /dev/null
+++ b/test/CodeGen/Mips/vector-setcc.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=mipsel < %s
+
+@a = common global <4 x i32> zeroinitializer, align 16
+@b = common global <4 x i32> zeroinitializer, align 16
+@g0 = common global <4 x i32> zeroinitializer, align 16
+
+define void @foo0() nounwind {  ; vector signed less-than of @a and @b, sign-extended and stored to @g0; this file has no output checks, llc only has to succeed
+entry:
+  %0 = load <4 x i32>* @a, align 16
+  %1 = load <4 x i32>* @b, align 16
+  %cmp = icmp slt <4 x i32> %0, %1  ; per-lane compare producing <4 x i1>
+  %sext = sext <4 x i1> %cmp to <4 x i32>  ; true lanes become all-ones (-1), false lanes 0
+  store <4 x i32> %sext, <4 x i32>* @g0, align 16
+  ret void
+}
+
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index a427379a8b..40b4a2eea9 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -24,3 +24,23 @@ define i64 @exchange(i64* %mem, i64 %val) nounwind {
 ; CHECK: stdcx.
   ret i64 %tmp
 }
+
+define void @atomic_store(i64* %mem, i64 %val) nounwind {  ; 64-bit atomic store of %val to %mem with release ordering
+entry:
+; CHECK: @atomic_store
+  store atomic i64 %val, i64* %mem release, align 64  ; the check lines below expect a ldarx followed by a stdcx. for this store
+; CHECK: ldarx
+; CHECK: stdcx.
+  ret void
+}
+
+define i64 @atomic_load(i64* %mem) nounwind {  ; 64-bit atomic load from %mem with acquire ordering; returns the loaded value
+entry:
+; CHECK: @atomic_load
+  %tmp = load atomic i64* %mem acquire, align 64  ; NOTE(review): the check lines below expect a ldarx followed by two stdcx. matches -- confirm the second stdcx. is intentional
+; CHECK: ldarx
+; CHECK: stdcx.
+; CHECK: stdcx.
+  ret i64 %tmp
+}
+
diff --git a/test/CodeGen/PowerPC/dcbt-sched.ll b/test/CodeGen/PowerPC/dcbt-sched.ll
new file mode 100644
index 0000000000..dfa1b75bd7
--- /dev/null
+++ b/test/CodeGen/PowerPC/dcbt-sched.ll
@@ -0,0 +1,22 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc -mcpu=a2 -enable-misched -enable-aa-sched-mi < %s | FileCheck %s
+
+define i8 @test1(i8* noalias %a, i8* noalias %b, i8* noalias %c) nounwind {  ; returns *%b + *%c; all three pointers are noalias
+entry:
+  %q = load i8* %b
+  call void @llvm.prefetch(i8* %a, i32 0, i32 3, i32 1)  ; in source order the prefetch sits between the two loads; the checks after the function expect both lbz before the dcbt
+  %r = load i8* %c
+  %s = add i8 %q, %r
+  ret i8 %s
+}
+
+declare void @llvm.prefetch(i8*, i32, i32, i32)
+
+; Test that we've moved the second load to before the dcbt to better
+; hide its latency.
+; CHECK: @test1
+; CHECK: lbz
+; CHECK: lbz
+; CHECK: dcbt
+
diff --git a/test/CodeGen/PowerPC/float-asmprint.ll b/test/CodeGen/PowerPC/float-asmprint.ll
new file mode 100644
index 0000000000..c9dc02862a
--- /dev/null
+++ b/test/CodeGen/PowerPC/float-asmprint.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=powerpc64-none-linux < %s | FileCheck %s
+
+; Check that all current floating-point types are correctly emitted to assembly
+; on a big-endian target. x86_fp80 can't actually print for unrelated reasons,
+; but that's not really a problem.
+
+@var128 = global fp128 0xL00000000000000008000000000000000, align 16  ; negative zero, per the "fp128 -0" asm comment expected below
+@varppc128 = global ppc_fp128 0xM80000000000000000000000000000000, align 16  ; negative zero, per the "ppc_fp128 -0" asm comment expected below
+@var64 = global double -0.0, align 8  ; expected below as a single .quad with only the top bit set
+@var32 = global float -0.0, align 4  ; expected below as .long 2147483648
+@var16 = global half -0.0, align 2  ; expected below as .short 32768
+
+; CHECK: var128:
+; CHECK-NEXT: .quad -9223372036854775808      # fp128 -0
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .size
+
+; CHECK: varppc128:
+; CHECK-NEXT: .quad -9223372036854775808      # ppc_fp128 -0
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .size
+
+; CHECK: var64:
+; CHECK-NEXT: .quad -9223372036854775808      # double -0
+; CHECK-NEXT: .size
+
+; CHECK: var32:
+; CHECK-NEXT: .long 2147483648                # float -0
+; CHECK-NEXT: .size
+
+; CHECK: var16:
+; CHECK-NEXT: .short 32768                    # half -0
+; CHECK-NEXT: .size
+
diff --git a/test/CodeGen/PowerPC/in-asm-f64-reg.ll b/test/CodeGen/PowerPC/in-asm-f64-reg.ll
new file mode 100644
index 0000000000..1321dfce20
--- /dev/null
+++ b/test/CodeGen/PowerPC/in-asm-f64-reg.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
+
+define void @f() {  ; inline-asm test: operands pinned to r7 and f11 must be printed with those register numbers
+; CHECK: @f
+
+entry:
+  %0 = tail call double* asm sideeffect "qvstfdux $2,$0,$1", "=b,{r7},{f11},0,~{memory}"(i32 64, double undef, double* undef)  ; template order is $2,$0,$1: $2 is the {f11} input, $0 the =b output (tied to the pointer argument by the "0" constraint), $1 the {r7} input
+  ret void
+
+; CHECK: qvstfdux 11,{{[0-9]+}},7
+}
diff --git a/test/CodeGen/PowerPC/mcm-8.ll b/test/CodeGen/PowerPC/mcm-8.ll
new file mode 100644
index 0000000000..9381a976a4
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-8.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium < %s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading a variable with available-externally linkage.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@x = available_externally constant [13 x i8] c"St9bad_alloc\00"
+
+define signext i8 @test_avext() nounwind {  ; returns the first byte of the available_externally array @x
+entry:
+  %0 = getelementptr inbounds [13 x i8]* @x, i32 0, i32 0  ; address of element 0 of @x
+  %1 = load i8* %0, align 1  ; the checks after the function expect this as an lbz through the TOC-loaded address
+  ret i8 %1
+}
+
+; CHECK: test_avext:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
+; CHECK: ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
+; CHECK: lbz {{[0-9]+}}, 0([[REG2]])
+; CHECK: .section .toc
+; CHECK: .LC[[TOCNUM]]:
+; CHECK: .tc {{[a-z0-9A-Z_.]+}}[TC],{{[a-z0-9A-Z_.]+}}
diff --git a/test/CodeGen/PowerPC/mcm-9.ll b/test/CodeGen/PowerPC/mcm-9.ll
new file mode 100644
index 0000000000..422607c5bc
--- /dev/null
+++ b/test/CodeGen/PowerPC/mcm-9.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s
+
+; Test correct code generation for medium code model (32-bit TOC offsets)
+; for loading and storing an aliased external variable.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@ei = external global i32
+@a = alias i32* @ei
+
+define signext i32 @test_external() nounwind {  ; increments the i32 reached through alias @a (of external @ei) and returns its previous value
+entry:
+  %0 = load i32* @a, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @a, align 4  ; writes old value + 1 back through the same alias
+  ret i32 %0  ; the value loaded before the increment
+}
+
+; CHECK: test_external:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
+; CHECK: ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
+; CHECK: lwz {{[0-9]+}}, 0([[REG2]])
+; CHECK: stw {{[0-9]+}}, 0([[REG2]])
+; CHECK: .section .toc
+; CHECK: .LC[[TOCNUM]]:
+; CHECK: .tc {{[a-z0-9A-Z_.]+}}[TC],{{[a-z0-9A-Z_.]+}}
diff --git a/test/CodeGen/PowerPC/misched-inorder-latency.ll b/test/CodeGen/PowerPC/misched-inorder-latency.ll
new file mode 100644
index 0000000000..8fae7ad4d1
--- /dev/null
+++ b/test/CodeGen/PowerPC/misched-inorder-latency.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -enable-misched -pre-RA-sched=source -scheditins=false \
+; RUN:          -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s
+;
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; %val1 is a load live out of %entry. It should be hoisted
+; above the add.
+; CHECK: testload:
+; CHECK: %entry
+; CHECK: lwz
+; CHECK: addi
+; CHECK: bne
+; CHECK: %true
+define i32 @testload(i32 *%ptr, i32 %sumin) {  ; %sum1 and %val1 are computed in %entry and both feed the phis in %end
+entry:
+  %sum1 = add i32 %sumin, 1
+  %val1 = load i32* %ptr  ; follows the add in source order; the checks above the function expect lwz before addi
+  %p = icmp eq i32 %sumin, 0
+  br i1 %p, label %true, label %end
+true:  ; reached only when %sumin == 0
+  %sum2 = add i32 %sum1, 1
+  %ptr2 = getelementptr i32* %ptr, i32 1  ; next i32 after %ptr
+  %val = load i32* %ptr2
+  %val2 = add i32 %val1, %val
+  br label %end
+end:
+  %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
+  %summerge = phi i32 [ %sum1, %entry], [ %sum2, %true ]
+  %sumout = add i32 %valmerge, %summerge
+  ret i32 %sumout
+}
+
+; The prefetch gets a default latency of 3 cycles and should be hoisted
+; above the add.
+;
+; CHECK: testprefetch:
+; CHECK: %entry
+; CHECK: dcbt
+; CHECK: addi
+; CHECK: blr
+define i32 @testprefetch(i8 *%ptr, i32 %i) {  ; returns %i + 1, or %i + 2 when %i == 0; also prefetches %ptr
+entry:
+  %val1 = add i32 %i, 1
+  tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )  ; follows the add in source order; the checks above the function expect dcbt before addi
+  %p = icmp eq i32 %i, 0
+  br i1 %p, label %true, label %end
+true:  ; reached only when %i == 0
+  %val2 = add i32 %val1, 1
+  br label %end
+end:
+  %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
+  ret i32 %valmerge
+}
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
diff --git a/test/CodeGen/PowerPC/sdag-ppcf128.ll b/test/CodeGen/PowerPC/sdag-ppcf128.ll
new file mode 100644
index 0000000000..535ece6d3d
--- /dev/null
+++ b/test/CodeGen/PowerPC/sdag-ppcf128.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+;
+; PR14751: Unsupported type in SelectionDAG::getConstantFP()
+
+define fastcc void @_D3std4math4sqrtFNaNbNfcZc() {  ; branches on undef; the %if path stores a zero-initialized pair of ppc_fp128 values
+entry:
+  br i1 undef, label %if, label %else
+; CHECK: cmplwi 0, 3, 0
+if:                                               ; preds = %entry
+  store { ppc_fp128, ppc_fp128 } zeroinitializer, { ppc_fp128, ppc_fp128 }* undef  ; presumably the ppc_fp128 constants that exercised PR14751 (see the file header) -- confirm
+  ret void
+
+else:                                             ; preds = %entry
+  unreachable
+}
diff --git a/test/CodeGen/PowerPC/tls-gd-obj.ll b/test/CodeGen/PowerPC/tls-gd-obj.ll
new file mode 100644
index 0000000000..00b537d532
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-gd-obj.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mcpu=pwr7 -O0 -filetype=obj -relocation-model=pic %s -o - | \
+; RUN: elf-dump --dump-section-data | FileCheck %s
+
+; Test correct relocation generation for thread-local storage using
+; the general dynamic model and integrated assembly.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@a = thread_local global i32 0, align 4
+
+define signext i32 @main() nounwind {  ; returns the current value of thread-local @a
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval  ; %retval is written but never read in this function
+  %0 = load i32* @a, align 4  ; the TLS access whose relocations the checks below verify
+  ret i32 %0
+}
+
+; Verify generation of R_PPC64_GOT_TLSGD16_HA, R_PPC64_GOT_TLSGD16_LO,
+; and R_PPC64_TLSGD for accessing external variable a, and R_PPC64_REL24
+; for the call to __tls_get_addr.
+;
+; CHECK:       '.rela.text'
+; CHECK:       Relocation 0
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1:[0-9a-f]+]]
+; CHECK-NEXT:  'r_type', 0x00000052
+; CHECK:       Relocation 1
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x00000050
+; CHECK:       Relocation 2
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x0000006b
+; CHECK:       Relocation 3
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x{{[0-9a-f]+}}
+; CHECK-NEXT:  'r_type', 0x0000000a
+
diff --git a/test/CodeGen/PowerPC/tls-gd.ll b/test/CodeGen/PowerPC/tls-gd.ll
new file mode 100644
index 0000000000..fb8dfaf04a
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-gd.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck %s
+
+; Test correct assembly code generation for thread-local storage using
+; the general dynamic model.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@a = thread_local global i32 0, align 4
+
+define signext i32 @main() nounwind {  ; returns the current value of thread-local @a
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval  ; %retval is written but never read in this function
+  %0 = load i32* @a, align 4  ; the TLS access; the checks below expect the @got@tlsgd address setup and a __tls_get_addr call
+  ret i32 %0
+}
+
+; CHECK: addis [[REG:[0-9]+]], 2, a@got@tlsgd@ha
+; CHECK-NEXT: addi 3, [[REG]], a@got@tlsgd@l
+; CHECK-NEXT: bl __tls_get_addr(a@tlsgd)
+; CHECK-NEXT: nop
+
diff --git a/test/CodeGen/PowerPC/tls-ie-obj.ll b/test/CodeGen/PowerPC/tls-ie-obj.ll
index 5cc0b187f6..3600cc52ba 100644
--- a/test/CodeGen/PowerPC/tls-ie-obj.ll
+++ b/test/CodeGen/PowerPC/tls-ie-obj.ll
@@ -24,9 +24,13 @@ entry:
 ; CHECK:       Relocation 0
 ; CHECK-NEXT:  'r_offset'
 ; CHECK-NEXT:  'r_sym', 0x[[SYM1:[0-9a-f]+]]
-; CHECK-NEXT:  'r_type', 0x00000057
+; CHECK-NEXT:  'r_type', 0x0000005a
 ; CHECK:       Relocation 1
 ; CHECK-NEXT:  'r_offset'
 ; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x00000058
+; CHECK:       Relocation 2
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
 ; CHECK-NEXT:  'r_type', 0x00000043
 
diff --git a/test/CodeGen/PowerPC/tls-ie.ll b/test/CodeGen/PowerPC/tls-ie.ll
index cc6f084efb..c5cfba7b3f 100644
--- a/test/CodeGen/PowerPC/tls-ie.ll
+++ b/test/CodeGen/PowerPC/tls-ie.ll
@@ -16,6 +16,7 @@ entry:
   ret i32 %0
 }
 
-; CHECK: ld [[REG:[0-9]+]], a@got@tprel(2)
-; CHECK: add {{[0-9]+}}, [[REG]], a@tls
+; CHECK: addis [[REG1:[0-9]+]], 2, a@got@tprel@ha
+; CHECK: ld [[REG2:[0-9]+]], a@got@tprel@l([[REG1]])
+; CHECK: add {{[0-9]+}}, [[REG2]], a@tls
 
diff --git a/test/CodeGen/PowerPC/tls-ld-obj.ll b/test/CodeGen/PowerPC/tls-ld-obj.ll
new file mode 100644
index 0000000000..c521ae405f
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-ld-obj.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mcpu=pwr7 -O0 -filetype=obj -relocation-model=pic %s -o - | \
+; RUN: elf-dump --dump-section-data | FileCheck %s
+
+; Test correct relocation generation for thread-local storage using
+; the local dynamic model.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@a = hidden thread_local global i32 0, align 4
+
+define signext i32 @main() nounwind {  ; returns the current value of the hidden thread-local @a
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval  ; %retval is written but never read in this function
+  %0 = load i32* @a, align 4  ; the TLS access whose relocations the checks below verify
+  ret i32 %0
+}
+
+; Verify generation of R_PPC64_GOT_TLSLD16_HA, R_PPC64_GOT_TLSLD16_LO,
+; R_PPC64_TLSLD, R_PPC64_DTPREL16_HA, and R_PPC64_DTPREL16_LO for
+; accessing external variable a, and R_PPC64_REL24 for the call to
+; __tls_get_addr.
+;
+; CHECK:       '.rela.text'
+; CHECK:       Relocation 0
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1:[0-9a-f]+]]
+; CHECK-NEXT:  'r_type', 0x00000056
+; CHECK:       Relocation 1
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x00000054
+; CHECK:       Relocation 2
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x0000006c
+; CHECK:       Relocation 3
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x{{[0-9a-f]+}}
+; CHECK-NEXT:  'r_type', 0x0000000a
+; CHECK:       Relocation 4
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x0000004d
+; CHECK:       Relocation 5
+; CHECK-NEXT:  'r_offset'
+; CHECK-NEXT:  'r_sym', 0x[[SYM1]]
+; CHECK-NEXT:  'r_type', 0x0000004b
+
diff --git a/test/CodeGen/PowerPC/tls-ld.ll b/test/CodeGen/PowerPC/tls-ld.ll
new file mode 100644
index 0000000000..1ebc6129e2
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-ld.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck %s
+
+; Test correct assembly code generation for thread-local storage using
+; the local dynamic model.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@a = hidden thread_local global i32 0, align 4
+
+define signext i32 @main() nounwind {  ; returns the current value of the hidden thread-local @a
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval  ; %retval is written but never read in this function
+  %0 = load i32* @a, align 4  ; the TLS access; the checks below expect the @got@tlsld setup, a __tls_get_addr call, then the @dtprel offset
+  ret i32 %0
+}
+
+; CHECK:      addis [[REG:[0-9]+]], 2, a@got@tlsld@ha
+; CHECK-NEXT: addi 3, [[REG]], a@got@tlsld@l
+; CHECK-NEXT: bl __tls_get_addr(a@tlsld)
+; CHECK-NEXT: nop
+; CHECK-NEXT: addis [[REG2:[0-9]+]], 3, a@dtprel@ha
+; CHECK-NEXT: addi {{[0-9]+}}, [[REG2]], a@dtprel@l
diff --git a/test/CodeGen/PowerPC/vec_extload.ll b/test/CodeGen/PowerPC/vec_extload.ll
index 15a3f9f295..998645d90d 100644
--- a/test/CodeGen/PowerPC/vec_extload.ll
+++ b/test/CodeGen/PowerPC/vec_extload.ll
@@ -15,55 +15,9 @@ define <16 x i8> @v16si8_sext_in_reg(<16 x i8> %a) {
   ret <16 x i8> %c
 }
 ; CHECK: v16si8_sext_in_reg:
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lbz
-; CHECK: stb
-; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vslb
+; CHECK: vsrab
+; CHECK: blr 
 
 ; The zero extend uses a more clever logic: a vector splat
 ; and a logic and to set higher bits to 0.
@@ -83,31 +37,9 @@ define <8 x i16> @v8si16_sext_in_reg(<8 x i16> %a) {
   ret <8 x i16> %c
 }
 ; CHECK: v8si16_sext_in_reg:
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lhz
-; CHECK: sth
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lhz
-; CHECK: sth
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lhz
-; CHECK: sth
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lhz
-; CHECK: sth
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lhz
-; CHECK: sth
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lhz
-; CHECK: sth
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lhz
-; CHECK: sth
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lhz
-; CHECK: sth
-; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vslh
+; CHECK: vsrah
+; CHECK: blr 
 
 ; Same as v8si16_sext_in_reg, but instead of creating the mask
 ; with a splat, loads it from memory.
@@ -129,19 +61,9 @@ define <4 x i32> @v4si32_sext_in_reg(<4 x i32> %a) {
   ret <4 x i32> %c
 }
 ; CHECK: v4si32_sext_in_reg:
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lha
-; CHECK: stw
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lha
-; CHECK: stw
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lha
-; CHECK: stw
-; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
-; CHECK: lha
-; CHECK: stw
-; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vslw
+; CHECK: vsraw
+; CHECK: blr 
 
 ; Same as v8si16_sext_in_reg.
 define <4 x i32> @v4si32_zext_in_reg(<4 x i32> %a) {
diff --git a/test/CodeGen/PowerPC/vec_select.ll b/test/CodeGen/PowerPC/vec_select.ll
new file mode 100644
index 0000000000..4ad0acca00
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_select.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -mtriple=powerpc64-linux-gnu -mattr=+altivec | FileCheck %s
+
+; CHECK: vsel_float
+define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {  ; vector select with a constant mask
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2  ; lane 0 comes from %v1, lanes 1-3 from %v2
+  ret <4 x float> %vsel
+}
diff --git a/test/CodeGen/R600/add.v4i32.ll b/test/CodeGen/R600/add.v4i32.ll
new file mode 100644
index 0000000000..ac4a87417b
--- /dev/null
+++ b/test/CodeGen/R600/add.v4i32.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %result = add <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/and.v4i32.ll b/test/CodeGen/R600/and.v4i32.ll
new file mode 100644
index 0000000000..662085e2d6
--- /dev/null
+++ b/test/CodeGen/R600/and.v4i32.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %result = and <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
new file mode 100644
index 0000000000..1acf905955
--- /dev/null
+++ b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
@@ -0,0 +1,33 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; This test is for a bug in
+; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where
+; the wrong type was being passed to
+; TargetLowering::getOperationAction() when checking the legality of
+; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes.
+
+define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %sint = load i32 addrspace(1) * %in
+  %conv = sitofp i32 %sint to float
+  %0 = insertelement <4 x float> undef, float %conv, i32 0
+  %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+  store <4 x float> %splat, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %uint = load i32 addrspace(1) * %in
+  %conv = uitofp i32 %uint to float
+  %0 = insertelement <4 x float> undef, float %conv, i32 0
+  %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+  store <4 x float> %splat, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
new file mode 100644
index 0000000000..0407533eaa
--- /dev/null
+++ b/test/CodeGen/R600/fabs.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MOV T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @fabs( float %r0)
+   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
+declare float @fabs(float ) readnone
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
new file mode 100644
index 0000000000..d7d1b6572c
--- /dev/null
+++ b/test/CodeGen/R600/fadd.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.R600.load.input(i32 1)
+   %r2 = fadd float %r0, %r1
+   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
diff --git a/test/CodeGen/R600/fadd.v4f32.ll b/test/CodeGen/R600/fadd.v4f32.ll
new file mode 100644
index 0000000000..85dbfd52cb
--- /dev/null
+++ b/test/CodeGen/R600/fadd.v4f32.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float> addrspace(1) * %in
+  %b = load <4 x float> addrspace(1) * %b_ptr
+  %result = fadd <4 x float> %a, %b
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
new file mode 100644
index 0000000000..a94cfb5cf2
--- /dev/null
+++ b/test/CodeGen/R600/fcmp-cnd.ll
@@ -0,0 +1,14 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;Not checking arguments 2 and 3 to CNDE, because they may change between
+;registers and literal.x depending on what the optimizer does.
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+  %0 = load float addrspace(1)* %in
+  %cmp = fcmp oeq float %0, 0.000000e+00
+  %value = select i1 %cmp, i32 2, i32 3 
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
new file mode 100644
index 0000000000..5c981efa9d
--- /dev/null
+++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test checks for a bug in R600TargetLowering::LowerSELECT_CC where the
+; chance to optimize the fcmp + select instructions to CNDE was missed
+; due to the fact that the operands to fcmp and select had different types.
+
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
+
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+  %0 = load float addrspace(1)* %in
+  %cmp = fcmp oeq float %0, 0.000000e+00
+  %value = select i1 %cmp, i32 -1, i32 0
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
new file mode 100644
index 0000000000..1dcd07c0b3
--- /dev/null
+++ b/test/CodeGen/R600/fcmp.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: SETE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+;CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+  %0 = load float addrspace(1)* %in
+  %arrayidx1 = getelementptr inbounds float addrspace(1)* %in, i32 1
+  %1 = load float addrspace(1)* %arrayidx1
+  %cmp = fcmp oeq float %0, %1
+  %sext = sext i1 %cmp to i32
+  store i32 %sext, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fdiv.v4f32.ll b/test/CodeGen/R600/fdiv.v4f32.ll
new file mode 100644
index 0000000000..b013fd647c
--- /dev/null
+++ b/test/CodeGen/R600/fdiv.v4f32.ll
@@ -0,0 +1,19 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float> addrspace(1) * %in
+  %b = load <4 x float> addrspace(1) * %b_ptr
+  %result = fdiv <4 x float> %a, %b
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
new file mode 100644
index 0000000000..845330f284
--- /dev/null
+++ b/test/CodeGen/R600/floor.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: FLOOR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @floor(float %r0)
+   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
+declare float @floor(float) readonly
diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll
new file mode 100644
index 0000000000..3708f0b9ee
--- /dev/null
+++ b/test/CodeGen/R600/fmax.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MAX T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.R600.load.input(i32 1)
+   %r2 = fcmp uge float %r0, %r1
+   %r3 = select i1 %r2, float %r0, float %r1
+   call void @llvm.AMDGPU.store.output(float %r3, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll
new file mode 100644
index 0000000000..19d59ab306
--- /dev/null
+++ b/test/CodeGen/R600/fmin.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.R600.load.input(i32 1)
+   %r2 = fcmp uge float %r0, %r1
+   %r3 = select i1 %r2, float %r1, float %r0
+   call void @llvm.AMDGPU.store.output(float %r3, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
new file mode 100644
index 0000000000..eb1d523c0b
--- /dev/null
+++ b/test/CodeGen/R600/fmul.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.R600.load.input(i32 1)
+   %r2 = fmul float %r0, %r1
+   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
diff --git a/test/CodeGen/R600/fmul.v4f32.ll b/test/CodeGen/R600/fmul.v4f32.ll
new file mode 100644
index 0000000000..6d44a0c5c7
--- /dev/null
+++ b/test/CodeGen/R600/fmul.v4f32.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float> addrspace(1) * %in
+  %b = load <4 x float> addrspace(1) * %b_ptr
+  %result = fmul <4 x float> %a, %b
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
new file mode 100644
index 0000000000..0ec1c376df
--- /dev/null
+++ b/test/CodeGen/R600/fsub.ll
@@ -0,0 +1,17 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.R600.load.input(i32 1)
+   %r2 = fsub float %r0, %r1
+   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
diff --git a/test/CodeGen/R600/fsub.v4f32.ll b/test/CodeGen/R600/fsub.v4f32.ll
new file mode 100644
index 0000000000..612a57e4b6
--- /dev/null
+++ b/test/CodeGen/R600/fsub.v4f32.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float> addrspace(1) * %in
+  %b = load <4 x float> addrspace(1) * %b_ptr
+  %result = fsub <4 x float> %a, %b
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/i8_to_double_to_float.ll b/test/CodeGen/R600/i8_to_double_to_float.ll
new file mode 100644
index 0000000000..39f33227fa
--- /dev/null
+++ b/test/CodeGen/R600/i8_to_double_to_float.ll
@@ -0,0 +1,11 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %1 = load i8 addrspace(1)* %in
+  %2 = uitofp i8 %1 to double
+  %3 = fptrunc double %2 to float
+  store float %3, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
new file mode 100644
index 0000000000..aad44d9edf
--- /dev/null
+++ b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
@@ -0,0 +1,18 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;Test that a select with reversed True/False values is correctly lowered
+;to a SETNE_INT.  There should only be one SETNE_INT instruction.
+
+;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK-NOT: SETNE_INT
+
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %0 = load i32 addrspace(1)* %in
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
+  %1 = load i32 addrspace(1)* %arrayidx1
+  %cmp = icmp eq i32 %0, %1
+  %value = select i1 %cmp, i32 0, i32 -1
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg
new file mode 100644
index 0000000000..36ee493e59
--- /dev/null
+++ b/test/CodeGen/R600/lit.local.cfg
@@ -0,0 +1,13 @@
+config.suffixes = ['.ll', '.c', '.cpp']  # file suffixes assigned for this test directory
+
+def getRoot(config):  # follow .parent links until a config with no parent is reached
+    if not config.parent:
+        return config
+    return getRoot(config.parent)
+
+root = getRoot(config)
+
+targets = set(root.targets_to_build.split())  # whitespace-separated names from the root config
+if 'R600' not in targets:  # mark this directory unsupported when R600 is not listed
+    config.unsupported = True
+
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
new file mode 100644
index 0000000000..4c731b25ec
--- /dev/null
+++ b/test/CodeGen/R600/literals.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Test using an integer literal constant.
+; Generated ASM should be:
+; ADD_INT REG literal.x, 5
+; or
+; ADD_INT literal.x REG, 5
+
+; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
+define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 5, %in
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; Test using a float literal constant.
+; Generated ASM should be:
+; ADD REG literal.x, 5.0
+; or
+; ADD literal.x REG, 5.0
+
+; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0
+define void @float_literal(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = fadd float 5.0, %in
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
new file mode 100644
index 0000000000..693eb27457
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
@@ -0,0 +1,17 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.R600.load.input(i32 1)
+   %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1)
+   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
+declare float @llvm.AMDGPU.mul(float ,float ) readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
new file mode 100644
index 0000000000..fac957f7ee
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: TRUNC T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.AMDGPU.trunc( float %r0)
+   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
+declare float @llvm.AMDGPU.trunc(float ) readnone
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
new file mode 100644
index 0000000000..dc120bfb00
--- /dev/null
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: COS T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.cos.f32(float %r0)
+   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   ret void
+}
+
+declare float @llvm.cos.f32(float) readnone
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
new file mode 100644
index 0000000000..0ae9172579
--- /dev/null
+++ b/test/CodeGen/R600/llvm.pow.ll
@@ -0,0 +1,19 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: LOG_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK-NEXT: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK-NEXT: EXP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.R600.load.input(i32 1)
+   %r2 = call float @llvm.pow.f32( float %r0, float %r1)
+   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
+declare float @llvm.pow.f32(float ,float ) readonly
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
new file mode 100644
index 0000000000..5cd6998c93
--- /dev/null
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: SIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = call float @llvm.sin.f32( float %r0)
+   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   ret void
+}
+
+declare float @llvm.sin.f32(float) readnone
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
diff --git a/test/CodeGen/R600/load.constant_addrspace.f32.ll b/test/CodeGen/R600/load.constant_addrspace.f32.ll
new file mode 100644
index 0000000000..93627283bb
--- /dev/null
+++ b/test/CodeGen/R600/load.constant_addrspace.f32.ll
@@ -0,0 +1,9 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: VTX_READ_32 T{{[0-9]+\.X, T[0-9]+\.X}}
+
+define void @test(float addrspace(1)* %out, float addrspace(2)* %in) {
+  %1 = load float addrspace(2)* %in
+  store float %1, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/load.i8.ll b/test/CodeGen/R600/load.i8.ll
new file mode 100644
index 0000000000..b070dcd520
--- /dev/null
+++ b/test/CodeGen/R600/load.i8.ll
@@ -0,0 +1,10 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+
+define void @test(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %1 = load i8 addrspace(1)* %in
+  %2 = zext i8 %1 to i32
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/reciprocal.ll b/test/CodeGen/R600/reciprocal.ll
new file mode 100644
index 0000000000..6838c1ae36
--- /dev/null
+++ b/test/CodeGen/R600/reciprocal.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+   %r0 = call float @llvm.R600.load.input(i32 0)
+   %r1 = fdiv float 1.0, %r0
+   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
+declare float @llvm.AMDGPU.rcp(float ) readnone
diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
new file mode 100644
index 0000000000..3556facfba
--- /dev/null
+++ b/test/CodeGen/R600/sdiv.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; The code generated by sdiv is long and complex and may frequently change.
+; The goal of this test is to make sure the ISel doesn't fail.
+;
+; This program was previously failing to compile when one of the selectcc
+; opcodes generated by the sdiv lowering was being legalized and optimized to:
+; selectcc Remainder -1, 0, -1, SETGT
+; This was fixed by adding an additional pattern in R600Instructions.td to
+; match this pattern with a CNDGE_INT.
+
+; CHECK: RETURN
+
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in
+  %den = load i32 addrspace(1) * %den_ptr
+  %result = sdiv i32 %num, %den
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll
new file mode 100644
index 0000000000..f65a30086e
--- /dev/null
+++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Note additional optimizations may cause this SGT to be replaced with a
+; CND* instruction.
+; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
+; Test a selectcc with i32 LHS/RHS and float True/False
+
+define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %0 = load i32 addrspace(1)* %in
+  %1 = icmp sge i32 %0, 0
+  %2 = select i1 %1, float 1.0, float 0.0
+  store float %2, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc_cnde.ll b/test/CodeGen/R600/selectcc_cnde.ll
new file mode 100644
index 0000000000..f0a0f512ba
--- /dev/null
+++ b/test/CodeGen/R600/selectcc_cnde.ll
@@ -0,0 +1,11 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK-NOT: SETE
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}}
+define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %1 = load float addrspace(1)* %in
+  %2 = fcmp oeq float %1, 0.0
+  %3 = select i1 %2, float 1.0, float 2.0
+  store float %3, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll b/test/CodeGen/R600/selectcc_cnde_int.ll
new file mode 100644
index 0000000000..b38078e26d
--- /dev/null
+++ b/test/CodeGen/R600/selectcc_cnde_int.ll
@@ -0,0 +1,11 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK-NOT: SETE_INT
+;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %1 = load i32 addrspace(1)* %in
+  %2 = icmp eq i32 %1, 0
+  %3 = select i1 %2, i32 1, i32 2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/setcc.v4i32.ll b/test/CodeGen/R600/setcc.v4i32.ll
new file mode 100644
index 0000000000..0752f2e63d
--- /dev/null
+++ b/test/CodeGen/R600/setcc.v4i32.ll
@@ -0,0 +1,12 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %result = icmp eq <4 x i32> %a, %b
+  %sext = sext <4 x i1> %result to <4 x i32>
+  store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/short-args.ll b/test/CodeGen/R600/short-args.ll
new file mode 100644
index 0000000000..107025045c
--- /dev/null
+++ b/test/CodeGen/R600/short-args.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+
+define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+entry:
+  %0 = zext i8 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+
+define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+entry:
+  %0 = zext i8 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+
+define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+entry:
+  %0 = zext i16 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+
+define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+entry:
+  %0 = zext i16 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/store.v4f32.ll b/test/CodeGen/R600/store.v4f32.ll
new file mode 100644
index 0000000000..8b0d244459
--- /dev/null
+++ b/test/CodeGen/R600/store.v4f32.ll
@@ -0,0 +1,9 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %1 = load <4 x float> addrspace(1) * %in
+  store <4 x float> %1, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/store.v4i32.ll b/test/CodeGen/R600/store.v4i32.ll
new file mode 100644
index 0000000000..a659815dde
--- /dev/null
+++ b/test/CodeGen/R600/store.v4i32.ll
@@ -0,0 +1,9 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %1 = load <4 x i32> addrspace(1) * %in
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/udiv.v4i32.ll b/test/CodeGen/R600/udiv.v4i32.ll
new file mode 100644
index 0000000000..47657a6be7
--- /dev/null
+++ b/test/CodeGen/R600/udiv.v4i32.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;The code generated by udiv is long and complex and may frequently change.
+;The goal of this test is to make sure the ISel doesn't fail when it gets
+;a v4i32 udiv
+;CHECK: RETURN
+
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %result = udiv <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/urem.v4i32.ll b/test/CodeGen/R600/urem.v4i32.ll
new file mode 100644
index 0000000000..2e7388caa6
--- /dev/null
+++ b/test/CodeGen/R600/urem.v4i32.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;The code generated by urem is long and complex and may frequently change.
+;The goal of this test is to make sure the ISel doesn't fail when it gets
+;a v4i32 urem
+;CHECK: RETURN
+
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %result = urem <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/vec4-expand.ll b/test/CodeGen/R600/vec4-expand.ll
new file mode 100644
index 0000000000..c61f6e25b5
--- /dev/null
+++ b/test/CodeGen/R600/vec4-expand.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_sint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %value = load <4 x float> addrspace(1) * %in
+  %result = fptosi <4 x float> %value to <4 x i32>
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_uint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %value = load <4 x float> addrspace(1) * %in
+  %result = fptoui <4 x float> %value to <4 x i32>
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @sint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %value = load <4 x i32> addrspace(1) * %in
+  %result = sitofp <4 x i32> %value to <4 x float>
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @uint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %value = load <4 x i32> addrspace(1) * %in
+  %result = uitofp <4 x i32> %value to <4 x float>
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/SI/sanity.ll b/test/CodeGen/SI/sanity.ll
new file mode 100644
index 0000000000..62cdcf5eca
--- /dev/null
+++ b/test/CodeGen/SI/sanity.ll
@@ -0,0 +1,37 @@
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+
+; CHECK: S_ENDPGM
+
+define void @main() {
+main_body:
+  call void @llvm.AMDGPU.shader.type(i32 1)
+  %0 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
+  %1 = getelementptr <4 x i32> addrspace(2)* %0, i32 0
+  %2 = load <4 x i32> addrspace(2)* %1
+  %3 = call i32 @llvm.SI.vs.load.buffer.index()
+  %4 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %2, i32 0, i32 %3)
+  %5 = extractelement <4 x float> %4, i32 0
+  %6 = extractelement <4 x float> %4, i32 1
+  %7 = extractelement <4 x float> %4, i32 2
+  %8 = extractelement <4 x float> %4, i32 3
+  %9 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
+  %10 = getelementptr <4 x i32> addrspace(2)* %9, i32 1
+  %11 = load <4 x i32> addrspace(2)* %10
+  %12 = call i32 @llvm.SI.vs.load.buffer.index()
+  %13 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %11, i32 0, i32 %12)
+  %14 = extractelement <4 x float> %13, i32 0
+  %15 = extractelement <4 x float> %13, i32 1
+  %16 = extractelement <4 x float> %13, i32 2
+  %17 = extractelement <4 x float> %13, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %14, float %15, float %16, float %17)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %5, float %6, float %7, float %8)
+  ret void
+}
+
+declare void @llvm.AMDGPU.shader.type(i32)
+
+declare i32 @llvm.SI.vs.load.buffer.index() readnone
+
+declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32)
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/Thumb2/thumb2-shifter.ll b/test/CodeGen/Thumb2/thumb2-shifter.ll
index 98854a1205..05dd90cfbf 100644
--- a/test/CodeGen/Thumb2/thumb2-shifter.ll
+++ b/test/CodeGen/Thumb2/thumb2-shifter.ll
@@ -1,24 +1,27 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=cortex-a8 | FileCheck %s --check-prefix=A8
+; RUN: llc < %s -march=thumb -mcpu=swift | FileCheck %s --check-prefix=SWIFT
+
+; rdar://12892707
 
 define i32 @t2ADDrs_lsl(i32 %X, i32 %Y) {
-; CHECK: t2ADDrs_lsl
-; CHECK: add.w  r0, r0, r1, lsl #16
+; A8: t2ADDrs_lsl
+; A8: add.w  r0, r0, r1, lsl #16
         %A = shl i32 %Y, 16
         %B = add i32 %X, %A
         ret i32 %B
 }
 
 define i32 @t2ADDrs_lsr(i32 %X, i32 %Y) {
-; CHECK: t2ADDrs_lsr
-; CHECK: add.w  r0, r0, r1, lsr #16
+; A8: t2ADDrs_lsr
+; A8: add.w  r0, r0, r1, lsr #16
         %A = lshr i32 %Y, 16
         %B = add i32 %X, %A
         ret i32 %B
 }
 
 define i32 @t2ADDrs_asr(i32 %X, i32 %Y) {
-; CHECK: t2ADDrs_asr
-; CHECK: add.w  r0, r0, r1, asr #16
+; A8: t2ADDrs_asr
+; A8: add.w  r0, r0, r1, asr #16
         %A = ashr i32 %Y, 16
         %B = add i32 %X, %A
         ret i32 %B
@@ -26,8 +29,8 @@ define i32 @t2ADDrs_asr(i32 %X, i32 %Y) {
 
 ; i32 ror(n) = (x >> n) | (x << (32 - n))
 define i32 @t2ADDrs_ror(i32 %X, i32 %Y) {
-; CHECK: t2ADDrs_ror
-; CHECK: add.w  r0, r0, r1, ror #16
+; A8: t2ADDrs_ror
+; A8: add.w  r0, r0, r1, ror #16
         %A = lshr i32 %Y, 16
         %B = shl  i32 %Y, 16
         %C = or   i32 %B, %A
@@ -36,13 +39,66 @@ define i32 @t2ADDrs_ror(i32 %X, i32 %Y) {
 }
 
 define i32 @t2ADDrs_noRegShift(i32 %X, i32 %Y, i8 %sh) {
-; CHECK: t2ADDrs_noRegShift
-; CHECK: uxtb r2, r2
-; CHECK: lsls r1, r2
-; CHECK: add  r0, r1
+; A8: t2ADDrs_noRegShift
+; A8: uxtb r2, r2
+; A8: lsls r1, r2
+; A8: add  r0, r1
+
+; SWIFT: t2ADDrs_noRegShift
+; SWIFT-NOT: lsls
+; SWIFT: lsl.w
+        %shift.upgrd.1 = zext i8 %sh to i32
+        %A = shl i32 %Y, %shift.upgrd.1
+        %B = add i32 %X, %A
+        ret i32 %B
+}
+
+define i32 @t2ADDrs_noRegShift2(i32 %X, i32 %Y, i8 %sh) {
+; A8: t2ADDrs_noRegShift2
+; A8: uxtb r2, r2
+; A8: lsrs r1, r2
+; A8: add  r0, r1
+
+; SWIFT: t2ADDrs_noRegShift2
+; SWIFT-NOT: lsrs
+; SWIFT: lsr.w
+        %shift.upgrd.1 = zext i8 %sh to i32
+        %A = lshr i32 %Y, %shift.upgrd.1
+        %B = add i32 %X, %A
+        ret i32 %B
+}
+
+define i32 @t2ADDrs_noRegShift3(i32 %X, i32 %Y, i8 %sh) {
+; A8: t2ADDrs_noRegShift3
+; A8: uxtb r2, r2
+; A8: asrs r1, r2
+; A8: add  r0, r1
+
+; SWIFT: t2ADDrs_noRegShift3
+; SWIFT-NOT: asrs
+; SWIFT: asr.w
+        %shift.upgrd.1 = zext i8 %sh to i32
+        %A = ashr i32 %Y, %shift.upgrd.1
+        %B = add i32 %X, %A
+        ret i32 %B
+}
+
+define i32 @t2ADDrs_optsize(i32 %X, i32 %Y, i8 %sh) optsize {
+; SWIFT: t2ADDrs_optsize
+; SWIFT-NOT: lsl.w
+; SWIFT: lsls
         %shift.upgrd.1 = zext i8 %sh to i32
         %A = shl i32 %Y, %shift.upgrd.1
         %B = add i32 %X, %A
         ret i32 %B
 }
 
+define i32 @t2ADDrs_minsize(i32 %X, i32 %Y, i8 %sh) minsize {
+; SWIFT: t2ADDrs_minsize
+; SWIFT-NOT: lsr.w
+; SWIFT: lsrs
+        %shift.upgrd.1 = zext i8 %sh to i32
+        %A = lshr i32 %Y, %shift.upgrd.1
+        %B = add i32 %X, %A
+        ret i32 %B
+}
diff --git a/test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll b/test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll
index 19a73543c6..fc38135032 100644
--- a/test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll
+++ b/test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movups | count 2
+; RUN: llc < %s -march=x86 -mcpu=penryn | FileCheck %s
 
 define void @a(<4 x float>* %x) nounwind  {
 entry:
@@ -8,4 +8,10 @@ entry:
         ret void
 }
 
+; CHECK: a:
+; CHECK: movups
+; CHECK: movups
+; CHECK-NOT: movups
+; CHECK: ret
+
 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>)
diff --git a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
index a7207537de..da734d4b64 100644
--- a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@@ -16,8 +16,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: main
 define i32 @main() nounwind uwtable {
 entry:
-; CHECK: movsbq  j(%rip), %
-; CHECK: movsbq  i(%rip), %
+; CHECK: pmovsxbq  j(%rip), %
+; CHECK: pmovsxbq  i(%rip), %
   %0 = load <2 x i8>* @i, align 8
   %1 = load <2 x i8>* @j, align 8
   %div = sdiv <2 x i8> %1, %0
diff --git a/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll b/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll
new file mode 100644
index 0000000000..8cef2c8201
--- /dev/null
+++ b/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=x86 -mtriple=i686-apple-ios -mcpu=yonah < %s
+; rdar://12868039
+
+define void @t() nounwind ssp {
+  %1 = alloca i32
+  %2 = ptrtoint i32* %1 to i32
+  br label %3
+
+; <label>:3                                       ; preds = %5, %3, %0
+  switch i32 undef, label %3 [
+    i32 611946160, label %5
+    i32 954117870, label %4
+  ]
+
+; <label>:4                                       ; preds = %3
+  ret void
+
+; <label>:5                                       ; preds = %5, %3
+  %6 = add i32 0, 148
+  %7 = and i32 %6, 48
+  %8 = add i32 %7, 0
+  %9 = or i32 %2, %8
+  %10 = xor i32 -1, %2
+  %11 = or i32 %8, %10
+  %12 = or i32 %9, %11
+  %13 = xor i32 %9, %11
+  %14 = sub i32 %12, %13
+  %15 = xor i32 2044674005, %14
+  %16 = xor i32 %15, 0
+  %17 = shl nuw nsw i32 %16, 1
+  %18 = sub i32 0, %17
+  %19 = and i32 %18, 2051242402
+  %20 = sub i32 0, %19
+  %21 = xor i32 %20, 0
+  %22 = xor i32 %21, 0
+  %23 = add i32 0, %22
+  %24 = shl i32 %23, 1
+  %25 = or i32 1, %24
+  %26 = add i32 0, %25
+  %27 = trunc i32 %26 to i8
+  %28 = xor i8 %27, 125
+  %29 = add i8 %28, -16
+  %30 = add i8 0, %29
+  store i8 %30, i8* null
+  br i1 undef, label %5, label %3
+}
diff --git a/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll b/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll
new file mode 100644
index 0000000000..c465527bd8
--- /dev/null
+++ b/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32
+
+; Make sure we don't crash on this testcase.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+define void @_ZN6VectorIfE3equIeEEvfRKS_IT_E() nounwind uwtable ssp align 2 {
+entry:
+  br i1 undef, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %entry
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %while.body.lr.ph
+  %0 = fptrunc <8 x x86_fp80> undef to <8 x float>
+  store <8 x float> %0, <8 x float>* undef, align 4
+  br label %vector.body
+
+while.end:                                        ; preds = %entry
+  ret void
+}
diff --git a/test/CodeGen/X86/2012-12-19-NoImplicitFloat.ll b/test/CodeGen/X86/2012-12-19-NoImplicitFloat.ll
new file mode 100644
index 0000000000..3025665206
--- /dev/null
+++ b/test/CodeGen/X86/2012-12-19-NoImplicitFloat.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 < %s | FileCheck %s
+; Test that we do not introduce vector operations with noimplicitfloat.
+; rdar://12879313
+
+%struct1 = type { i32*, i32* }
+
+define void @test() nounwind noimplicitfloat {
+entry:
+; CHECK-NOT: xmm
+; CHECK: ret
+  %0 = load %struct1** undef, align 8
+  %1 = getelementptr inbounds %struct1* %0, i64 0, i32 0
+  store i32* null, i32** %1, align 8
+  %2 = getelementptr inbounds %struct1* %0, i64 0, i32 1
+  store i32* null, i32** %2, align 8
+  ret void
+}
diff --git a/test/CodeGen/X86/2013-01-09-DAGCombineBug.ll b/test/CodeGen/X86/2013-01-09-DAGCombineBug.ll
new file mode 100644
index 0000000000..db7ec8ae26
--- /dev/null
+++ b/test/CodeGen/X86/2013-01-09-DAGCombineBug.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=x86_64-apple-macosx10.5.0 < %s
+
+; rdar://12968664
+
+define void @t() nounwind uwtable ssp {
+  br label %4
+
+; <label>:1                                       ; preds = %4, %2
+  ret void
+
+; <label>:2                                       ; preds = %6, %5, %3, %2
+  switch i32 undef, label %2 [
+    i32 1090573978, label %1
+    i32 1090573938, label %3
+    i32 1090573957, label %5
+  ]
+
+; <label>:3                                       ; preds = %4, %2
+  br i1 undef, label %2, label %4
+
+; <label>:4                                       ; preds = %6, %5, %3, %0
+  switch i32 undef, label %11 [
+    i32 1090573938, label %3
+    i32 1090573957, label %5
+    i32 1090573978, label %1
+    i32 165205179, label %6
+  ]
+
+; <label>:5                                       ; preds = %4, %2
+  br i1 undef, label %2, label %4
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 undef, 590901838
+  %8 = or i1 false, %7
+  %9 = or i1 true, %8
+  %10 = xor i1 %8, %9
+  br i1 %10, label %4, label %2
+
+; <label>:11                                      ; preds = %11, %4
+  br label %11
+}
diff --git a/test/CodeGen/X86/WidenArith.ll b/test/CodeGen/X86/WidenArith.ll
new file mode 100644
index 0000000000..0383bd665b
--- /dev/null
+++ b/test/CodeGen/X86/WidenArith.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+
+;CHECK: test
+;CHECK: vaddps
+;CHECK: vmulps
+;CHECK: vsubps
+;CHECK: vcmpltps
+;CHECK: vcmpltps
+;CHECK: vandps
+;CHECK: vandps
+;CHECK: ret
+define <8 x i32> @test(<8 x float> %a, <8 x float> %b) {
+ %c1 = fadd <8 x float> %a, %b
+ %b1 = fmul <8 x float> %b, %a
+ %d  = fsub <8 x float> %b1, %c1
+ %res1 = fcmp olt <8 x float> %a, %b1
+ %res2 = fcmp olt <8 x float> %c1, %d
+ %andr = and <8 x i1>%res1, %res2
+ %ex = zext <8 x i1> %andr to <8 x i32>
+ ret <8 x i32>%ex
+}
+
+
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
index e7c9605d3e..453e72672b 100644
--- a/test/CodeGen/X86/atom-bypass-slow-division.ll
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux  | FileCheck %s
 
-define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
-; CHECK: test_get_quotient
+define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: Test_get_quotient:
 ; CHECK: orl %ecx, %edx
 ; CHECK-NEXT: testl $-256, %edx
 ; CHECK-NEXT: je
@@ -13,8 +13,8 @@ define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
   ret i32 %result
 }
 
-define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
-; CHECK: test_get_remainder
+define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: Test_get_remainder:
 ; CHECK: orl %ecx, %edx
 ; CHECK-NEXT: testl $-256, %edx
 ; CHECK-NEXT: je
@@ -26,8 +26,8 @@ define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
   ret i32 %result
 }
 
-define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
-; CHECK: test_get_quotient_and_remainder
+define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: Test_get_quotient_and_remainder:
 ; CHECK: orl %ecx, %edx
 ; CHECK-NEXT: testl $-256, %edx
 ; CHECK-NEXT: je
@@ -35,7 +35,7 @@ define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
 ; CHECK: divb
 ; CHECK: addl
 ; CHECK: ret
-; CEECK-NOT: idivl
+; CHECK-NOT: idivl
 ; CHECK-NOT: divb
   %resultdiv = sdiv i32 %a, %b
   %resultrem = srem i32 %a, %b
@@ -43,8 +43,8 @@ define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
   ret i32 %result
 }
 
-define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
-; CHECK: test_use_div_and_idiv
+define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: Test_use_div_and_idiv:
 ; CHECK: idivl
 ; CHECK: divb
 ; CHECK: divl
@@ -57,34 +57,34 @@ define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
   ret i32 %result
 }
 
-define i32 @test_use_div_imm_imm() nounwind {
-; CHECK: test_use_div_imm_imm
+define i32 @Test_use_div_imm_imm() nounwind {
+; CHECK: Test_use_div_imm_imm:
 ; CHECK: movl $64
   %resultdiv = sdiv i32 256, 4
   ret i32 %resultdiv
 }
 
-define i32 @test_use_div_reg_imm(i32 %a) nounwind {
-; CHECK: test_use_div_reg_imm
-; CEHCK-NOT: test
+define i32 @Test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: Test_use_div_reg_imm:
+; CHECK-NOT: test
 ; CHECK-NOT: idiv
 ; CHECK-NOT: divb
   %resultdiv = sdiv i32 %a, 33
   ret i32 %resultdiv
 }
 
-define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
-; CHECK: test_use_rem_reg_imm
-; CEHCK-NOT: test
+define i32 @Test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: Test_use_rem_reg_imm:
+; CHECK-NOT: test
 ; CHECK-NOT: idiv
 ; CHECK-NOT: divb
   %resultrem = srem i32 %a, 33
   ret i32 %resultrem
 }
 
-define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
-; CHECK: test_use_divrem_reg_imm
-; CEHCK-NOT: test
+define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: Test_use_divrem_reg_imm:
+; CHECK-NOT: test
 ; CHECK-NOT: idiv
 ; CHECK-NOT: divb
   %resultdiv = sdiv i32 %a, 33
@@ -93,8 +93,8 @@ define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
   ret i32 %result
 }
 
-define i32 @test_use_div_imm_reg(i32 %a) nounwind {
-; CHECK: test_use_div_imm_reg
+define i32 @Test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: Test_use_div_imm_reg:
 ; CHECK: test
 ; CHECK: idiv
 ; CHECK: divb
@@ -102,8 +102,8 @@ define i32 @test_use_div_imm_reg(i32 %a) nounwind {
   ret i32 %resultdiv
 }
 
-define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
-; CHECK: test_use_rem_imm_reg
+define i32 @Test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: Test_use_rem_imm_reg:
 ; CHECK: test
 ; CHECK: idiv
 ; CHECK: divb
diff --git a/test/CodeGen/X86/atom-pad-short-functions.ll b/test/CodeGen/X86/atom-pad-short-functions.ll
new file mode 100644
index 0000000000..b9a39e08cb
--- /dev/null
+++ b/test/CodeGen/X86/atom-pad-short-functions.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -O1 -mcpu=atom -mtriple=i686-linux  | FileCheck %s
+
+declare void @external_function(...)
+
+define i32 @test_return_val(i32 %a) nounwind {
+; CHECK: test_return_val
+; CHECK: movl
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: ret
+  ret i32 %a
+}
+
+define i32 @test_optsize(i32 %a) nounwind optsize {
+; CHECK: test_optsize
+; CHECK: movl
+; CHECK-NEXT: ret
+  ret i32 %a
+}
+
+define i32 @test_minsize(i32 %a) nounwind minsize {
+; CHECK: test_minsize
+; CHECK: movl
+; CHECK-NEXT: ret
+  ret i32 %a
+}
+
+define i32 @test_add(i32 %a, i32 %b) nounwind {
+; CHECK: test_add
+; CHECK: addl
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: ret
+  %result = add i32 %a, %b
+  ret i32 %result
+}
+
+define i32 @test_multiple_ret(i32 %a, i32 %b, i1 %c) nounwind {
+; CHECK: @test_multiple_ret
+; CHECK: je
+
+; CHECK: nop
+; CHECK: nop
+; CHECK: ret
+
+; CHECK: nop
+; CHECK: nop
+; CHECK: ret
+
+  br i1 %c, label %bb1, label %bb2
+
+bb1:
+  ret i32 %a
+
+bb2:
+  ret i32 %b
+}
+
+define void @test_call_others(i32 %x) nounwind
+{
+; CHECK: test_call_others
+; CHECK: je
+  %tobool = icmp eq i32 %x, 0
+  br i1 %tobool, label %if.end, label %true.case
+
+; CHECK: jmp external_function
+true.case:
+  tail call void bitcast (void (...)* @external_function to void ()*)() nounwind
+  br label %if.end
+
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: ret
+if.end:
+  ret void
+
+}
+
+define void @test_branch_to_same_bb(i32 %x, i32 %y) nounwind {
+; CHECK: @test_branch_to_same_bb
+  %cmp = icmp sgt i32 %x, 0
+  br i1 %cmp, label %while.cond, label %while.end
+
+while.cond:
+  br label %while.cond
+
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: ret
+while.end:
+  ret void
+}
+
diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll
index d0a7fe0100..62bdea2b49 100644
--- a/test/CodeGen/X86/avx-cvt.ll
+++ b/test/CodeGen/X86/avx-cvt.ll
@@ -46,7 +46,7 @@ entry:
   ret double %conv
 }
 
-; CHECK: vcvtsi2sd (%
+; CHECK: vcvtsi2sdl (%
 define double @funcB(i32* nocapture %e) nounwind uwtable readonly ssp {
 entry:
   %tmp1 = load i32* %e, align 4
@@ -54,7 +54,7 @@ entry:
   ret double %conv
 }
 
-; CHECK: vcvtsi2ss (%
+; CHECK: vcvtsi2ssl (%
 define float @funcC(i32* nocapture %e) nounwind uwtable readonly ssp {
 entry:
   %tmp1 = load i32* %e, align 4
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
index 3713a8c377..8d7d79db7d 100755
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -1,17 +1,144 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2
 
 define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_8i16_to_8i32
-;CHECK: vpmovsxwd
+; AVX: sext_8i16_to_8i32
+; AVX: vpmovsxwd
 
   %B = sext <8 x i16> %A to <8 x i32>
   ret <8 x i32>%B
 }
 
 define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-;CHECK: sext_4i32_to_4i64
-;CHECK: vpmovsxdq
+; AVX: sext_4i32_to_4i64
+; AVX: vpmovsxdq
 
   %B = sext <4 x i32> %A to <4 x i64>
   ret <4 x i64>%B
 }
+
+; AVX: load_sext_test1
+; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test1
+; SSSE3: movq
+; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSSE3: psrad $16
+; SSSE3: ret
+
+; SSE2: load_sext_test1
+; SSE2: movq
+; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}}
+; SSE2: psrad $16
+; SSE2: ret
+define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+; AVX: load_sext_test2
+; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test2
+; SSSE3: movd
+; SSSE3: pshufb
+; SSSE3: psrad $24
+; SSSE3: ret
+
+; SSE2: load_sext_test2
+; SSE2: movl
+; SSE2: psrad $24
+; SSE2: ret
+define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+; AVX: load_sext_test3
+; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test3
+; SSSE3: movsbq
+; SSSE3: movsbq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test3
+; SSE2: movsbq
+; SSE2: movsbq
+; SSE2: punpcklqdq
+; SSE2: ret
+define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
+ %X = load <2 x i8>* %ptr
+ %Y = sext <2 x i8> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; AVX: load_sext_test4
+; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test4
+; SSSE3: movswq
+; SSSE3: movswq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test4
+; SSE2: movswq
+; SSE2: movswq
+; SSE2: punpcklqdq
+; SSE2: ret
+define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
+ %X = load <2 x i16>* %ptr
+ %Y = sext <2 x i16> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; AVX: load_sext_test5
+; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test5
+; SSSE3: movslq
+; SSSE3: movslq
+; SSSE3: punpcklqdq
+; SSSE3: ret
+
+; SSE2: load_sext_test5
+; SSE2: movslq
+; SSE2: movslq
+; SSE2: punpcklqdq
+; SSE2: ret
+define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
+ %X = load <2 x i32>* %ptr
+ %Y = sext <2 x i32> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; AVX: load_sext_test6
+; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
+; AVX: ret
+
+; SSSE3: load_sext_test6
+; SSSE3: movq
+; SSSE3: punpcklbw
+; SSSE3: psraw $8
+; SSSE3: ret
+
+; SSE2: load_sext_test6
+; SSE2: movq
+; SSE2: punpcklbw
+; SSE2: psraw $8
+; SSE2: ret
+define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
+ %X = load <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i16>
+ ret <8 x i16>%Y
+}
diff --git a/test/CodeGen/X86/avx-zext.ll b/test/CodeGen/X86/avx-zext.ll
index b630e9d146..582537ea90 100755
--- a/test/CodeGen/X86/avx-zext.ll
+++ b/test/CodeGen/X86/avx-zext.ll
@@ -18,11 +18,10 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
   ret <4 x i64>%B
 }
 
-
 define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ;CHECK: zext_8i8_to_8i32
 ;CHECK: vpunpckhwd
-;CHECK: vpunpcklwd
+;CHECK: vpmovzxwd
 ;CHECK: vinsertf128
 ;CHECK: ret
   %t = zext <8 x i8> %z to <8 x i32>
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index b47491335a..3ce08dcc73 100755
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -63,6 +63,47 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
   ret <8 x i32>%B
 }
 
+; CHECK: load_sext_test1
+; CHECK: vpmovsxdq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) {
+ %X = load <4 x i32>* %ptr
+ %Y = sext <4 x i32> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
+
+; CHECK: load_sext_test2
+; CHECK: vpmovsxbq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) {
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
 
+; CHECK: load_sext_test3
+; CHECK: vpmovsxwq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) {
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
 
+; CHECK: load_sext_test4
+; CHECK: vpmovsxwd (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) {
+ %X = load <8 x i16>* %ptr
+ %Y = sext <8 x i16> %X to <8 x i32>
+ ret <8 x i32>%Y
+}
 
+; CHECK: load_sext_test5
+; CHECK: vpmovsxbd (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) {
+ %X = load <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i32>
+ ret <8 x i32>%Y
+}
diff --git a/test/CodeGen/X86/avx2-logic.ll b/test/CodeGen/X86/avx2-logic.ll
index 13ebaa6f87..a5bb1a8f8e 100644
--- a/test/CodeGen/X86/avx2-logic.ll
+++ b/test/CodeGen/X86/avx2-logic.ll
@@ -48,9 +48,8 @@ entry:
 ; CHECK: vpblendvb
 ; CHECK: vpblendvb %ymm
 ; CHECK: ret
-define <32 x i8> @vpblendvb(<32 x i8> %x, <32 x i8> %y) {
-  %min_is_x = icmp ult <32 x i8> %x, %y
-  %min = select <32 x i1> %min_is_x, <32 x i8> %x, <32 x i8> %y
+define <32 x i8> @vpblendvb(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y) {
+  %min = select <32 x i1> %cond, <32 x i8> %x, <32 x i8> %y
   ret <32 x i8> %min
 }
 
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index 43c47c0fa8..b89e648c52 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -26,6 +26,14 @@ define i32 @t3(i32 %x) nounwind  {
 ; CHECK: tzcntl
 }
 
+define i32 @tzcnt32_load(i32* %x) nounwind  {
+  %x1 = load i32* %x                      ; only use is the cttz below; the pattern at the end expects a memory operand on tzcntl
+  %tmp = tail call i32 @llvm.cttz.i32(i32 %x1, i1 false )
+  ret i32 %tmp
+; CHECK: tzcnt32_load:
+; CHECK: tzcntl ({{.*}})
+}
+
 define i64 @t4(i64 %x) nounwind  {
   %tmp = tail call i64 @llvm.cttz.i64( i64 %x, i1 false )
   ret i64 %tmp
@@ -69,6 +77,15 @@ define i32 @andn32(i32 %x, i32 %y) nounwind readnone {
 ; CHECK: andnl
 }
 
+define i32 @andn32_load(i32 %x, i32* %y) nounwind readonly {
+  %y1 = load i32* %y                      ; reads memory through %y, hence readonly rather than readnone
+  %tmp1 = xor i32 %x, -1
+  %tmp2 = and i32 %y1, %tmp1
+  ret i32 %tmp2
+; CHECK: andn32_load:
+; CHECK: andnl ({{.*}})
+}
+
 define i64 @andn64(i64 %x, i64 %y) nounwind readnone {
   %tmp1 = xor i64 %x, -1
   %tmp2 = and i64 %tmp1, %y
@@ -84,6 +101,14 @@ define i32 @bextr32(i32 %x, i32 %y) nounwind readnone {
 ; CHECK: bextrl
 }
 
+define i32 @bextr32_load(i32* %x, i32 %y) nounwind readonly {
+  %x1 = load i32* %x                      ; reads memory through %x, hence readonly rather than readnone
+  %tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x1, i32 %y)
+  ret i32 %tmp
+; CHECK: bextr32_load:
+; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}}
+}
+
 declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
 
 define i64 @bextr64(i64 %x, i64 %y) nounwind readnone {
@@ -102,6 +127,14 @@ define i32 @bzhi32(i32 %x, i32 %y) nounwind readnone {
 ; CHECK: bzhil
 }
 
+define i32 @bzhi32_load(i32* %x, i32 %y) nounwind readonly {
+  %x1 = load i32* %x                      ; reads memory through %x, hence readonly rather than readnone
+  %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x1, i32 %y)
+  ret i32 %tmp
+; CHECK: bzhi32_load:
+; CHECK: bzhil {{.*}}, ({{.*}}), {{.*}}
+}
+
 declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) nounwind readnone
 
 define i64 @bzhi64(i64 %x, i64 %y) nounwind readnone {
@@ -121,6 +154,15 @@ define i32 @blsi32(i32 %x) nounwind readnone {
 ; CHECK: blsil
 }
 
+define i32 @blsi32_load(i32* %x) nounwind readonly {
+  %x1 = load i32* %x                      ; reads memory through %x, hence readonly rather than readnone
+  %tmp = sub i32 0, %x1
+  %tmp2 = and i32 %x1, %tmp
+  ret i32 %tmp2
+; CHECK: blsi32_load:
+; CHECK: blsil ({{.*}})
+}
+
 define i64 @blsi64(i64 %x) nounwind readnone {
   %tmp = sub i64 0, %x
   %tmp2 = and i64 %tmp, %x
@@ -137,6 +179,15 @@ define i32 @blsmsk32(i32 %x) nounwind readnone {
 ; CHECK: blsmskl
 }
 
+define i32 @blsmsk32_load(i32* %x) nounwind readonly {
+  %x1 = load i32* %x                      ; reads memory through %x, hence readonly rather than readnone
+  %tmp = sub i32 %x1, 1
+  %tmp2 = xor i32 %x1, %tmp
+  ret i32 %tmp2
+; CHECK: blsmsk32_load:
+; CHECK: blsmskl ({{.*}})
+}
+
 define i64 @blsmsk64(i64 %x) nounwind readnone {
   %tmp = sub i64 %x, 1
   %tmp2 = xor i64 %tmp, %x
@@ -153,6 +204,15 @@ define i32 @blsr32(i32 %x) nounwind readnone {
 ; CHECK: blsrl
 }
 
+define i32 @blsr32_load(i32* %x) nounwind readonly {
+  %x1 = load i32* %x                      ; reads memory through %x, hence readonly rather than readnone
+  %tmp = sub i32 %x1, 1
+  %tmp2 = and i32 %x1, %tmp
+  ret i32 %tmp2
+; CHECK: blsr32_load:
+; CHECK: blsrl ({{.*}})
+}
+
 define i64 @blsr64(i64 %x) nounwind readnone {
   %tmp = sub i64 %x, 1
   %tmp2 = and i64 %tmp, %x
@@ -168,6 +228,14 @@ define i32 @pdep32(i32 %x, i32 %y) nounwind readnone {
 ; CHECK: pdepl
 }
 
+define i32 @pdep32_load(i32 %x, i32* %y) nounwind readonly {
+  %y1 = load i32* %y                      ; reads memory through %y, hence readonly rather than readnone
+  %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y1)
+  ret i32 %tmp
+; CHECK: pdep32_load:
+; CHECK: pdepl ({{.*}})
+}
+
 declare i32 @llvm.x86.bmi.pdep.32(i32, i32) nounwind readnone
 
 define i64 @pdep64(i64 %x, i64 %y) nounwind readnone {
@@ -186,6 +254,14 @@ define i32 @pext32(i32 %x, i32 %y) nounwind readnone {
 ; CHECK: pextl
 }
 
+define i32 @pext32_load(i32 %x, i32* %y) nounwind readonly {
+  %y1 = load i32* %y                      ; reads memory through %y, hence readonly rather than readnone
+  %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y1)
+  ret i32 %tmp
+; CHECK: pext32_load:
+; CHECK: pextl ({{.*}})
+}
+
 declare i32 @llvm.x86.bmi.pext.32(i32, i32) nounwind readnone
 
 define i64 @pext64(i64 %x, i64 %y) nounwind readnone {
diff --git a/test/CodeGen/X86/clobber-fi0.ll b/test/CodeGen/X86/clobber-fi0.ll
new file mode 100644
index 0000000000..38a42dbf1a
--- /dev/null
+++ b/test/CodeGen/X86/clobber-fi0.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; In the code below we need to copy the EFLAGS because of scheduling constraints.
+; When copying the EFLAGS we need to write to the stack with push/pop. This forces
+; us to emit the prolog.
+
+; CHECK: main
+; CHECK: subq{{.*}}rsp
+; CHECK: ret
+define i32 @main(i32 %arg, i8** %arg1) nounwind {
+bb:
+  %tmp = alloca i32, align 4                      ; [#uses=3 type=i32*]
+  %tmp2 = alloca i32, align 4                     ; [#uses=3 type=i32*]
+  %tmp3 = alloca i32                              ; [#uses=1 type=i32*]
+  store i32 1, i32* %tmp, align 4
+  store i32 1, i32* %tmp2, align 4
+  br label %bb4
+
+bb4:                                              ; preds = %bb4, %bb
+  %tmp6 = load i32* %tmp2, align 4                ; [#uses=1 type=i32]
+  %tmp7 = add i32 %tmp6, -1                       ; [#uses=2 type=i32]
+  store i32 %tmp7, i32* %tmp2, align 4
+  %tmp8 = icmp eq i32 %tmp7, 0                    ; [#uses=1 type=i1]
+  %tmp9 = load i32* %tmp                          ; [#uses=1 type=i32]
+  %tmp10 = add i32 %tmp9, -1              ; [#uses=1 type=i32]
+  store i32 %tmp10, i32* %tmp3
+  br i1 %tmp8, label %bb11, label %bb4
+
+bb11:                                             ; preds = %bb4
+  %tmp12 = load i32* %tmp, align 4                ; [#uses=1 type=i32]
+  ret i32 %tmp12
+}
+
+
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index eb06327f55..1855fe2fb8 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -151,3 +151,18 @@ entry:
   %conv = zext i1 %cmp to i32
   ret i32 %conv
 }
+
+define i32 @test12() uwtable ssp {
+; CHECK: test12:
+; CHECK: testb
+  %1 = call zeroext i1 @test12b()
+  br i1 %1, label %2, label %3
+
+; <label>:2                                       ; preds = %0
+  ret i32 1
+
+; <label>:3                                       ; preds = %0
+  ret i32 2
+}
+
+declare zeroext i1 @test12b()
diff --git a/test/CodeGen/X86/coalesce-implicitdef.ll b/test/CodeGen/X86/coalesce-implicitdef.ll
new file mode 100644
index 0000000000..19cd08cf37
--- /dev/null
+++ b/test/CodeGen/X86/coalesce-implicitdef.ll
@@ -0,0 +1,130 @@
+; RUN: llc < %s -verify-coalescing
+; PR14732
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10"
+
+@c = common global i32 0, align 4
+@b = common global i32 0, align 4
+@a = common global i32 0, align 4
+@d = common global i32 0, align 4
+
+; This function creates an IMPLICIT_DEF with a long live range, even after
+; ProcessImplicitDefs.
+;
+; The coalescer should be able to deal with all kinds of IMPLICIT_DEF live
+; ranges, even if they are not common.
+
+define void @f() nounwind uwtable ssp {
+entry:
+  %i = alloca i32, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc34, %entry
+  %i.0.load44 = phi i32 [ %inc35, %for.inc34 ], [ undef, %entry ]
+  %pi.0 = phi i32* [ %pi.4, %for.inc34 ], [ undef, %entry ]
+  %tobool = icmp eq i32 %i.0.load44, 0
+  br i1 %tobool, label %for.end36, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* @c, align 4, !tbaa !0
+  br label %for.body2
+
+for.body2:                                        ; preds = %for.body, %for.inc
+  %i.0.load45 = phi i32 [ %i.0.load44, %for.body ], [ 0, %for.inc ]
+  %tobool3 = icmp eq i32 %i.0.load45, 0
+  br i1 %tobool3, label %if.then10, label %if.then
+
+if.then:                                          ; preds = %for.body2
+  store i32 0, i32* %i, align 4, !tbaa !0
+  br label %for.body6
+
+for.body6:                                        ; preds = %if.then, %for.body6
+  store i32 0, i32* %i, align 4
+  br i1 true, label %for.body6, label %for.inc
+
+if.then10:                                        ; preds = %for.body2
+  store i32 1, i32* @b, align 4, !tbaa !0
+  ret void
+
+for.inc:                                          ; preds = %for.body6
+  br i1 undef, label %for.body2, label %if.end30
+
+while.condthread-pre-split:                       ; preds = %label.loopexit, %while.condthread-pre-split.lr.ph.lr.ph, %for.inc27.backedge
+  %0 = phi i32 [ %inc28, %for.inc27.backedge ], [ %inc285863, %while.condthread-pre-split.lr.ph.lr.ph ], [ %inc2858, %label.loopexit ]
+  %inc2060 = phi i32 [ %inc20, %for.inc27.backedge ], [ %a.promoted.pre, %while.condthread-pre-split.lr.ph.lr.ph ], [ %inc20, %label.loopexit ]
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.condthread-pre-split, %while.cond
+  %p2.1.in = phi i32* [ %pi.3.ph, %while.cond ], [ %i, %while.condthread-pre-split ]
+  %p2.1 = bitcast i32* %p2.1.in to i16*
+  br i1 %tobool19, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %inc20 = add nsw i32 %inc2060, 1
+  %tobool21 = icmp eq i32 %inc2060, 0
+  br i1 %tobool21, label %for.inc27.backedge, label %if.then22
+
+for.inc27.backedge:                               ; preds = %while.end, %if.then22
+  %inc28 = add nsw i32 %0, 1
+  store i32 %inc28, i32* @b, align 4, !tbaa !0
+  %tobool17 = icmp eq i32 %inc28, 0
+  br i1 %tobool17, label %for.inc27.if.end30.loopexit56_crit_edge, label %while.condthread-pre-split
+
+if.then22:                                        ; preds = %while.end
+  %1 = load i16* %p2.1, align 2, !tbaa !3
+  %tobool23 = icmp eq i16 %1, 0
+  br i1 %tobool23, label %for.inc27.backedge, label %label.loopexit
+
+label.loopexit:                                   ; preds = %if.then22
+  store i32 %inc20, i32* @a, align 4, !tbaa !0
+  %inc2858 = add nsw i32 %0, 1
+  store i32 %inc2858, i32* @b, align 4, !tbaa !0
+  %tobool1759 = icmp eq i32 %inc2858, 0
+  br i1 %tobool1759, label %if.end30, label %while.condthread-pre-split
+
+for.inc27.if.end30.loopexit56_crit_edge:          ; preds = %for.inc27.backedge
+  store i32 %inc20, i32* @a, align 4, !tbaa !0
+  br label %if.end30
+
+if.end30:                                         ; preds = %for.inc27.if.end30.loopexit56_crit_edge, %label.loopexit, %label.preheader, %for.inc
+  %i.0.load46 = phi i32 [ 0, %for.inc ], [ %i.0.load4669, %label.preheader ], [ %i.0.load4669, %label.loopexit ], [ %i.0.load4669, %for.inc27.if.end30.loopexit56_crit_edge ]
+  %pi.4 = phi i32* [ %i, %for.inc ], [ %pi.3.ph, %label.preheader ], [ %pi.3.ph, %label.loopexit ], [ %pi.3.ph, %for.inc27.if.end30.loopexit56_crit_edge ]
+  %2 = load i32* %pi.4, align 4, !tbaa !0
+  %tobool31 = icmp eq i32 %2, 0
+  br i1 %tobool31, label %for.inc34, label %label.preheader
+
+for.inc34:                                        ; preds = %if.end30
+  %inc35 = add nsw i32 %i.0.load46, 1
+  store i32 %inc35, i32* %i, align 4
+  br label %for.cond
+
+for.end36:                                        ; preds = %for.cond
+  store i32 1, i32* %i, align 4
+  %3 = load i32* @c, align 4, !tbaa !0
+  %tobool37 = icmp eq i32 %3, 0
+  br i1 %tobool37, label %label.preheader, label %land.rhs
+
+land.rhs:                                         ; preds = %for.end36
+  store i32 0, i32* @a, align 4, !tbaa !0
+  br label %label.preheader
+
+label.preheader:                                  ; preds = %for.end36, %if.end30, %land.rhs
+  %i.0.load4669 = phi i32 [ 1, %land.rhs ], [ %i.0.load46, %if.end30 ], [ 1, %for.end36 ]
+  %pi.3.ph = phi i32* [ %pi.0, %land.rhs ], [ %pi.4, %if.end30 ], [ %pi.0, %for.end36 ]
+  %4 = load i32* @b, align 4, !tbaa !0
+  %inc285863 = add nsw i32 %4, 1
+  store i32 %inc285863, i32* @b, align 4, !tbaa !0
+  %tobool175964 = icmp eq i32 %inc285863, 0
+  br i1 %tobool175964, label %if.end30, label %while.condthread-pre-split.lr.ph.lr.ph
+
+while.condthread-pre-split.lr.ph.lr.ph:           ; preds = %label.preheader
+  %.pr50 = load i32* @d, align 4, !tbaa !0
+  %tobool19 = icmp eq i32 %.pr50, 0
+  %a.promoted.pre = load i32* @a, align 4, !tbaa !0
+  br label %while.condthread-pre-split
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
index 466b096067..d11bb9ee3e 100644
--- a/test/CodeGen/X86/cvtv2f32.ll
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -1,3 +1,7 @@
+; A bug fix in the DAGCombiner made this test fail, so marking as xfail
+; until this can be investigated further.
+; XFAIL: *
+
 ; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s
 
 define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) {
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
index 2e1852d3e3..2606bd28d5 100644
--- a/test/CodeGen/X86/early-ifcvt.ll
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -142,3 +142,34 @@ save_state_and_return:
 }
 
 declare void @BZ2_bz__AssertH__fail()
+
+; Make sure we don't speculate on div/idiv instructions: hoisting the divide above the zero check would execute it even when the divisor is 0.
+; CHECK: test_idiv
+; CHECK-NOT: cmov
+define i32 @test_idiv(i32 %a, i32 %b) nounwind uwtable readnone ssp {
+  %1 = icmp eq i32 %b, 0                          ; guard: is the divisor zero?
+  br i1 %1, label %4, label %2                    ; skip the division entirely when %b == 0
+
+; <label>:2                                       ; preds = %0
+  %3 = sdiv i32 %a, %b                            ; signed divide, reached only when %b != 0
+  br label %4
+
+; <label>:4                                       ; preds = %0, %2
+  %5 = phi i32 [ %3, %2 ], [ %a, %0 ]             ; quotient, or %a unchanged on the zero-divisor path
+  ret i32 %5
+}
+
+; CHECK: test_div
+; CHECK-NOT: cmov
+define i32 @test_div(i32 %a, i32 %b) nounwind uwtable readnone ssp {
+  %1 = icmp eq i32 %b, 0                          ; guard: is the divisor zero?
+  br i1 %1, label %4, label %2                    ; skip the division entirely when %b == 0
+
+; <label>:2                                       ; preds = %0
+  %3 = udiv i32 %a, %b                            ; unsigned divide, reached only when %b != 0
+  br label %4
+
+; <label>:4                                       ; preds = %0, %2
+  %5 = phi i32 [ %3, %2 ], [ %a, %0 ]             ; quotient, or %a unchanged on the zero-divisor path
+  ret i32 %5
+}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index 86b6606779..acfa64582c 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
-; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
 ; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort -mtriple=x86_64-none-nacl | FileCheck %s --check-prefix=NACL64
 ; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort -mtriple=x86_64-none-nacl -relocation-model=pic | FileCheck %s --check-prefix=NACL64_PIC
 
diff --git a/test/CodeGen/X86/float-asmprint.ll b/test/CodeGen/X86/float-asmprint.ll
new file mode 100644
index 0000000000..4aeae7fe04
--- /dev/null
+++ b/test/CodeGen/X86/float-asmprint.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=x86_64-none-linux < %s | FileCheck %s
+
+; Check that all current floating-point types are correctly emitted to assembly
+; on a little-endian target.
+
+@var128 = global fp128 0xL00000000000000008000000000000000, align 16
+@varppc128 = global ppc_fp128 0xM80000000000000000000000000000000, align 16
+@var80 = global x86_fp80 0xK80000000000000000000, align 16
+@var64 = global double -0.0, align 8
+@var32 = global float -0.0, align 4
+@var16 = global half -0.0, align 2
+
+; CHECK: var128:
+; CHECK-NEXT: .quad 0                         # fp128 -0
+; CHECK-NEXT: .quad -9223372036854775808
+; CHECK-NEXT: .size
+
+; CHECK: varppc128:
+; CHECK-NEXT: .quad 0                         # ppc_fp128 -0
+; CHECK-NEXT: .quad -9223372036854775808
+; CHECK-NEXT: .size
+
+; CHECK: var80:
+; CHECK-NEXT: .quad 0                         # x86_fp80 -0
+; CHECK-NEXT: .short 32768
+; CHECK-NEXT: .zero 6
+; CHECK-NEXT: .size
+
+; CHECK: var64:
+; CHECK-NEXT: .quad -9223372036854775808      # double -0
+; CHECK-NEXT: .size
+
+; CHECK: var32:
+; CHECK-NEXT: .long 2147483648                # float -0
+; CHECK-NEXT: .size
+
+; CHECK: var16:
+; CHECK-NEXT: .short 32768                    # half -0
+; CHECK-NEXT: .size
+
diff --git a/test/CodeGen/X86/fold-call.ll b/test/CodeGen/X86/fold-call.ll
index 603e9ad66c..35327faa64 100644
--- a/test/CodeGen/X86/fold-call.ll
+++ b/test/CodeGen/X86/fold-call.ll
@@ -1,10 +1,27 @@
-; RUN: llc < %s -march=x86 | not grep mov
-; RUN: llc < %s -march=x86-64 | not grep mov
+; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86-64 | FileCheck %s
 
-declare void @bar()
+; CHECK: test1
+; CHECK-NOT: mov
 
-define void @foo(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, void()* %arg) nounwind {
+declare void @bar()
+define void @test1(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, void()* %arg) nounwind {
 	call void @bar()
 	call void %arg()
 	ret void
 }
+
+; PR14739
+; CHECK: test2
+; CHECK: mov{{.*}} $0, ([[REGISTER:%[a-z]+]])
+; CHECK-NOT: jmp{{.*}} *([[REGISTER]])
+
+%struct.X = type { void ()* }
+define void @test2(%struct.X* nocapture %x) {
+entry:
+  %f = getelementptr inbounds %struct.X* %x, i64 0, i32 0   ; address of the function-pointer field (field 0)
+  %0 = load void ()** %f                                    ; read the callee before the slot is overwritten
+  store void ()* null, void ()** %f                         ; null out the slot; the call must not re-read it from memory after this store
+  tail call void %0()                                       ; calls the pointer value loaded above
+  ret void
+}
diff --git a/test/CodeGen/X86/fold-vex.ll b/test/CodeGen/X86/fold-vex.ll
new file mode 100644
index 0000000000..2bb5b441c7
--- /dev/null
+++ b/test/CodeGen/X86/fold-vex.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
+
+;CHECK: @test
+; No need to load from memory. The operand will be loaded as part of the AND instr.
+;CHECK-NOT: vmovaps
+;CHECK: vandps
+;CHECK: ret
+
+define void @test1(<8 x i32>* %p0, <8 x i32> %in1) nounwind {
+entry:
+  %in0 = load <8 x i32>* %p0, align 2
+  %a = and <8 x i32> %in0, %in1
+  store <8 x i32> %a, <8 x i32>* undef
+  ret void
+}
+
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index dcc8f0d268..949d6a4293 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -17,11 +17,11 @@ entry:
 ; SSE2: movb $0, 24(%esp)
 
 ; SSE1: t1:
-; SSE1: fldl _.str+16
-; SSE1: fstpl 16(%esp)
 ; SSE1: movaps _.str, %xmm0
 ; SSE1: movaps %xmm0
 ; SSE1: movb $0, 24(%esp)
+; SSE1: movl $0, 20(%esp)
+; SSE1: movl $0, 16(%esp)
 
 ; NOSSE: t1:
 ; NOSSE: movb $0
diff --git a/test/CodeGen/X86/memcpy.ll b/test/CodeGen/X86/memcpy.ll
index 39c7fbafd4..2e02e45c8d 100644
--- a/test/CodeGen/X86/memcpy.ll
+++ b/test/CodeGen/X86/memcpy.ll
@@ -87,8 +87,21 @@ entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([30 x i8]* @.str, i64 0, i64 0), i64 16, i32 1, i1 false)
   ret void
 
+; DARWIN: test5:
 ; DARWIN: movabsq	$7016996765293437281
 ; DARWIN: movabsq	$7016996765293437184
 }
 
 
+; PR14896
+@.str2 = private unnamed_addr constant [2 x i8] c"x\00", align 1
+
+define void @test6() nounwind uwtable {
+entry:
+; DARWIN: test6
+; DARWIN: movw $0, 8
+; DARWIN: movq $120, 0
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0), i64 10, i32 1, i1 false) ; 10-byte copy from the 2-byte constant @.str2 to address null; the expected stores above write 120 (ASCII 'x') at address 0 and 0 at address 8
+  ret void
+}
+
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
index 24d28adda8..68e332eed4 100644
--- a/test/CodeGen/X86/ms-inline-asm.ll
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -61,3 +61,21 @@ entry:
 ; CHECK: .att_syntax
 ; CHECK: {{## InlineAsm End|#NO_APP}}
 }
+
+define void @t19_helper() nounwind {
+entry:
+  ret void
+}
+
+define void @t19() nounwind {
+entry:
+  call void asm sideeffect inteldialect "call $0", "r,~{dirflag},~{fpsr},~{flags}"(void ()* @t19_helper) nounwind
+  ret void
+; CHECK: t19:
+; CHECK: movl ${{_?}}t19_helper, %eax
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: call eax
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
diff --git a/test/CodeGen/X86/pmovsx-inreg.ll b/test/CodeGen/X86/pmovsx-inreg.ll
new file mode 100644
index 0000000000..d8c27f2504
--- /dev/null
+++ b/test/CodeGen/X86/pmovsx-inreg.ll
@@ -0,0 +1,176 @@
+; RUN: llc < %s -march=x86-64 -mcpu=penryn | FileCheck -check-prefix=SSE41 %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck -check-prefix=AVX1 %s
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck -check-prefix=AVX2 %s
+
+; PR14887
+; These tests inject a store into the chain to test the inreg versions of pmovsx
+
+define void @test1(<2 x i8>* %in, <2 x i64>* %out) nounwind {
+  %wide.load35 = load <2 x i8>* %in, align 1
+  %sext = sext <2 x i8> %wide.load35 to <2 x i64>
+  store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8
+  store <2 x i64> %sext, <2 x i64>* %out, align 8
+  ret void
+
+; SSE41: test1:
+; SSE41: pmovsxbq
+
+; AVX1: test1:
+; AVX1: vpmovsxbq
+
+; AVX2: test1:
+; AVX2: vpmovsxbq
+}
+
+define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind {
+  %wide.load35 = load <4 x i8>* %in, align 1
+  %sext = sext <4 x i8> %wide.load35 to <4 x i64>
+  store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8
+  store <4 x i64> %sext, <4 x i64>* %out, align 8
+  ret void
+
+; AVX2: test2:
+; AVX2: vpmovsxbq
+}
+
+define void @test3(<4 x i8>* %in, <4 x i32>* %out) nounwind {
+  %wide.load35 = load <4 x i8>* %in, align 1
+  %sext = sext <4 x i8> %wide.load35 to <4 x i32>
+  store <4 x i32> zeroinitializer, <4 x i32>* undef, align 8
+  store <4 x i32> %sext, <4 x i32>* %out, align 8
+  ret void
+
+; SSE41: test3:
+; SSE41: pmovsxbd
+
+; AVX1: test3:
+; AVX1: vpmovsxbd
+
+; AVX2: test3:
+; AVX2: vpmovsxbd
+}
+
+define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind {
+  %wide.load35 = load <8 x i8>* %in, align 1
+  %sext = sext <8 x i8> %wide.load35 to <8 x i32>
+  store <8 x i32> zeroinitializer, <8 x i32>* undef, align 8
+  store <8 x i32> %sext, <8 x i32>* %out, align 8
+  ret void
+
+; AVX2: test4:
+; AVX2: vpmovsxbd
+}
+
+define void @test5(<8 x i8>* %in, <8 x i16>* %out) nounwind {
+  %wide.load35 = load <8 x i8>* %in, align 1
+  %sext = sext <8 x i8> %wide.load35 to <8 x i16>
+  store <8 x i16> zeroinitializer, <8 x i16>* undef, align 8
+  store <8 x i16> %sext, <8 x i16>* %out, align 8
+  ret void
+
+; SSE41: test5:
+; SSE41: pmovsxbw
+
+; AVX1: test5:
+; AVX1: vpmovsxbw
+
+; AVX2: test5:
+; AVX2: vpmovsxbw
+}
+
+define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind {
+  %wide.load35 = load <16 x i8>* %in, align 1
+  %sext = sext <16 x i8> %wide.load35 to <16 x i16>
+  store <16 x i16> zeroinitializer, <16 x i16>* undef, align 8
+  store <16 x i16> %sext, <16 x i16>* %out, align 8
+  ret void
+
+; AVX2: test6:
+; FIXME: v16i8 -> v16i16 is scalarized.
+; AVX2-NOT: pmovsx
+}
+
+define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind {
+  %wide.load35 = load <2 x i16>* %in, align 1
+  %sext = sext <2 x i16> %wide.load35 to <2 x i64>
+  store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8
+  store <2 x i64> %sext, <2 x i64>* %out, align 8
+  ret void
+
+
+; SSE41: test7:
+; SSE41: pmovsxwq
+
+; AVX1: test7:
+; AVX1: vpmovsxwq
+
+; AVX2: test7:
+; AVX2: vpmovsxwq
+}
+
+define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind {
+  %wide.load35 = load <4 x i16>* %in, align 1
+  %sext = sext <4 x i16> %wide.load35 to <4 x i64>
+  store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8
+  store <4 x i64> %sext, <4 x i64>* %out, align 8
+  ret void
+
+; AVX2: test8:
+; AVX2: vpmovsxwq
+}
+
+define void @test9(<4 x i16>* %in, <4 x i32>* %out) nounwind {
+  %wide.load35 = load <4 x i16>* %in, align 1
+  %sext = sext <4 x i16> %wide.load35 to <4 x i32>
+  store <4 x i32> zeroinitializer, <4 x i32>* undef, align 8
+  store <4 x i32> %sext, <4 x i32>* %out, align 8
+  ret void
+
+; SSE41: test9:
+; SSE41: pmovsxwd
+
+; AVX1: test9:
+; AVX1: vpmovsxwd
+
+; AVX2: test9:
+; AVX2: vpmovsxwd
+}
+
+define void @test10(<8 x i16>* %in, <8 x i32>* %out) nounwind {
+  %wide.load35 = load <8 x i16>* %in, align 1
+  %sext = sext <8 x i16> %wide.load35 to <8 x i32>
+  store <8 x i32> zeroinitializer, <8 x i32>* undef, align 8
+  store <8 x i32> %sext, <8 x i32>* %out, align 8
+  ret void
+
+; AVX2: test10:
+; AVX2: vpmovsxwd
+}
+
+define void @test11(<2 x i32>* %in, <2 x i64>* %out) nounwind {
+  %wide.load35 = load <2 x i32>* %in, align 1
+  %sext = sext <2 x i32> %wide.load35 to <2 x i64>
+  store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8
+  store <2 x i64> %sext, <2 x i64>* %out, align 8
+  ret void
+
+; SSE41: test11:
+; SSE41: pmovsxdq
+
+; AVX1: test11:
+; AVX1: vpmovsxdq
+
+; AVX2: test11:
+; AVX2: vpmovsxdq
+}
+
+define void @test12(<4 x i32>* %in, <4 x i64>* %out) nounwind {
+  %wide.load35 = load <4 x i32>* %in, align 1
+  %sext = sext <4 x i32> %wide.load35 to <4 x i64>
+  store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8
+  store <4 x i64> %sext, <4 x i64>* %out, align 8
+  ret void
+
+; AVX2: test12:
+; AVX2: vpmovsxdq
+}
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index 58423d1959..0ee9987526 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7 | FileCheck %s
-; RUN: opt -instsimplify %s -disable-output
+; RUN: opt -instsimplify -disable-output < %s
 
 ;CHECK: SHUFF0
 define <8 x i32*> @SHUFF0(<4 x i32*> %ptrv) nounwind {
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
new file mode 100644
index 0000000000..aff4afbd2e
--- /dev/null
+++ b/test/CodeGen/X86/psubus.ll
@@ -0,0 +1,340 @@
+; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2
+; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
+; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @test1(i16* nocapture %head) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i16* %head, i64 %index
+  %1 = bitcast i16* %0 to <8 x i16>*
+  %2 = load <8 x i16>* %1, align 2
+  %3 = icmp slt <8 x i16> %2, zeroinitializer
+  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
+  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
+  store <8 x i16> %5, <8 x i16>* %1, align 2
+  %index.next = add i64 %index, 8
+  %6 = icmp eq i64 %index.next, 16384
+  br i1 %6, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: @test1
+; SSE2: psubusw LCPI0_0(%rip), %xmm0
+
+; AVX1: @test1
+; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
+
+; AVX2: @test1
+; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
+}
+
+define void @test2(i16* nocapture %head) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i16* %head, i64 %index
+  %1 = bitcast i16* %0 to <8 x i16>*
+  %2 = load <8 x i16>* %1, align 2
+  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
+  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
+  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
+  store <8 x i16> %5, <8 x i16>* %1, align 2
+  %index.next = add i64 %index, 8
+  %6 = icmp eq i64 %index.next, 16384
+  br i1 %6, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: @test2
+; SSE2: psubusw LCPI1_0(%rip), %xmm0
+
+; AVX1: @test2
+; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
+
+; AVX2: @test2
+; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
+}
+
+define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
+vector.ph:
+  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
+  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %1 = getelementptr inbounds i16* %head, i64 %index
+  %2 = bitcast i16* %1 to <8 x i16>*
+  %3 = load <8 x i16>* %2, align 2
+  %4 = icmp ult <8 x i16> %3, %broadcast15
+  %5 = sub <8 x i16> %3, %broadcast15
+  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
+  store <8 x i16> %6, <8 x i16>* %2, align 2
+  %index.next = add i64 %index, 8
+  %7 = icmp eq i64 %index.next, 16384
+  br i1 %7, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: @test3
+; SSE2: psubusw %xmm0, %xmm1
+
+; AVX1: @test3
+; AVX1: vpsubusw %xmm0, %xmm1, %xmm1
+
+; AVX2: @test3
+; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
+}
+
+define void @test4(i8* nocapture %head) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i8* %head, i64 %index
+  %1 = bitcast i8* %0 to <16 x i8>*
+  %2 = load <16 x i8>* %1, align 1
+  %3 = icmp slt <16 x i8> %2, zeroinitializer
+  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
+  store <16 x i8> %5, <16 x i8>* %1, align 1
+  %index.next = add i64 %index, 16
+  %6 = icmp eq i64 %index.next, 16384
+  br i1 %6, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: @test4
+; SSE2: psubusb LCPI3_0(%rip), %xmm0
+
+; AVX1: @test4
+; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
+
+; AVX2: @test4
+; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
+}
+
+define void @test5(i8* nocapture %head) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i8* %head, i64 %index
+  %1 = bitcast i8* %0 to <16 x i8>*
+  %2 = load <16 x i8>* %1, align 1
+  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
+  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
+  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
+  store <16 x i8> %5, <16 x i8>* %1, align 1
+  %index.next = add i64 %index, 16
+  %6 = icmp eq i64 %index.next, 16384
+  br i1 %6, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: @test5
+; SSE2: psubusb LCPI4_0(%rip), %xmm0
+
+; AVX1: @test5
+; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
+
+; AVX2: @test5
+; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
+}
+
+define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
+vector.ph:
+  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
+  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %1 = getelementptr inbounds i8* %head, i64 %index
+  %2 = bitcast i8* %1 to <16 x i8>*
+  %3 = load <16 x i8>* %2, align 1
+  %4 = icmp ult <16 x i8> %3, %broadcast15
+  %5 = sub <16 x i8> %3, %broadcast15
+  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
+  store <16 x i8> %6, <16 x i8>* %2, align 1
+  %index.next = add i64 %index, 16
+  %7 = icmp eq i64 %index.next, 16384
+  br i1 %7, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: @test6
+; SSE2: psubusb %xmm0, %xmm1
+
+; AVX1: @test6
+; AVX1: vpsubusb %xmm0, %xmm1, %xmm1
+
+; AVX2: @test6
+; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
+}
+
+define void @test7(i16* nocapture %head) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i16* %head, i64 %index
+  %1 = bitcast i16* %0 to <16 x i16>*
+  %2 = load <16 x i16>* %1, align 2
+  %3 = icmp slt <16 x i16> %2, zeroinitializer
+  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
+  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
+  store <16 x i16> %5, <16 x i16>* %1, align 2
+  %index.next = add i64 %index, 8
+  %6 = icmp eq i64 %index.next, 16384
+  br i1 %6, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: @test7
+; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
+}
+
+define void @test8(i16* nocapture %head) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i16* %head, i64 %index
+  %1 = bitcast i16* %0 to <16 x i16>*
+  %2 = load <16 x i16>* %1, align 2
+  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
+  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
+  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
+  store <16 x i16> %5, <16 x i16>* %1, align 2
+  %index.next = add i64 %index, 8
+  %6 = icmp eq i64 %index.next, 16384
+  br i1 %6, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: @test8
+; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
+}
+
+define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
+vector.ph:
+  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
+  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer ; splat %w into all 16 lanes
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %1 = getelementptr inbounds i16* %head, i64 %index
+  %2 = bitcast i16* %1 to <16 x i16>*
+  %3 = load <16 x i16>* %2, align 2
+  %4 = icmp ult <16 x i16> %3, %broadcast15       ; lanes where x < w (unsigned)
+  %5 = sub <16 x i16> %3, %broadcast15            ; x - w (wraps where x < w)
+  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5 ; x < w ? 0 : x - w, i.e. unsigned saturating subtract
+  store <16 x i16> %6, <16 x i16>* %2, align 2
+  %index.next = add i64 %index, 8                 ; NOTE(review): 16 elements are processed per iteration but the index advances by 8, so consecutive iterations overlap; confirm whether 16 was intended
+  %7 = icmp eq i64 %index.next, 16384
+  br i1 %7, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+
+; AVX2: @test9
+; AVX2: vpsubusw %ymm0, %ymm1, %ymm1
+}
+
+define void @test10(i8* nocapture %head) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i8* %head, i64 %index
+  %1 = bitcast i8* %0 to <32 x i8>*
+  %2 = load <32 x i8>* %1, align 1
+  %3 = icmp slt <32 x i8> %2, zeroinitializer
+  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
+  store <32 x i8> %5, <32 x i8>* %1, align 1
+  %index.next = add i64 %index, 16
+  %6 = icmp eq i64 %index.next, 16384
+  br i1 %6, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+
+; AVX2: @test10
+; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
+}
+
+define void @test11(i8* nocapture %head) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i8* %head, i64 %index
+  %1 = bitcast i8* %0 to <32 x i8>*
+  %2 = load <32 x i8>* %1, align 1
+  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
+  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
+  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
+  store <32 x i8> %5, <32 x i8>* %1, align 1
+  %index.next = add i64 %index, 16
+  %6 = icmp eq i64 %index.next, 16384
+  br i1 %6, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: @test11
+; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
+}
+
+define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
+vector.ph:
+  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
+  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer ; splat %w into all 32 lanes
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %1 = getelementptr inbounds i8* %head, i64 %index
+  %2 = bitcast i8* %1 to <32 x i8>*
+  %3 = load <32 x i8>* %2, align 1
+  %4 = icmp ult <32 x i8> %3, %broadcast15        ; lanes where x < w (unsigned)
+  %5 = sub <32 x i8> %3, %broadcast15             ; x - w (wraps where x < w)
+  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5 ; x < w ? 0 : x - w, i.e. unsigned saturating subtract
+  store <32 x i8> %6, <32 x i8>* %2, align 1
+  %index.next = add i64 %index, 16                ; NOTE(review): 32 elements are processed per iteration but the index advances by 16, so consecutive iterations overlap; confirm whether 32 was intended
+  %7 = icmp eq i64 %index.next, 16384
+  br i1 %7, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: @test12
+; AVX2: vpsubusb %ymm0, %ymm1, %ymm1
+}
diff --git a/test/CodeGen/X86/ret-mmx.ll b/test/CodeGen/X86/ret-mmx.ll
index 865e147a4a..778e4722cd 100644
--- a/test/CodeGen/X86/ret-mmx.ll
+++ b/test/CodeGen/X86/ret-mmx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mattr=+mmx,+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mcpu=core2 -mattr=+mmx,+sse2 | FileCheck %s
 ; rdar://6602459
 
 @g_v1di = external global <1 x i64>
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 3bec3acdbf..09ca07b31a 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -282,7 +282,7 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
 ; ATOM: test13:
 ; ATOM: cmpl
 ; ATOM-NEXT: sbbl
-; ATOM-NEXT: ret
+; ATOM: ret
 }
 
 define i32 @test14(i32 %a, i32 %b) nounwind {
@@ -299,7 +299,7 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
 ; ATOM: cmpl
 ; ATOM-NEXT: sbbl
 ; ATOM-NEXT: notl
-; ATOM-NEXT: ret
+; ATOM: ret
 }
 
 ; rdar://10961709
diff --git a/test/CodeGen/X86/sse-align-2.ll b/test/CodeGen/X86/sse-align-2.ll
index 102c3fb06c..22cd772306 100644
--- a/test/CodeGen/X86/sse-align-2.ll
+++ b/test/CodeGen/X86/sse-align-2.ll
@@ -1,12 +1,21 @@
-; RUN: llc < %s -march=x86-64 | grep movup | count 2
+; RUN: llc < %s -march=x86-64 -mcpu=penryn | FileCheck %s
 
 define <4 x float> @foo(<4 x float>* %p, <4 x float> %x) nounwind {
   %t = load <4 x float>* %p, align 4
   %z = fmul <4 x float> %t, %x
   ret <4 x float> %z
 }
+
+; CHECK: foo:
+; CHECK: movups
+; CHECK: ret
+
 define <2 x double> @bar(<2 x double>* %p, <2 x double> %x) nounwind {
   %t = load <2 x double>* %p, align 8
   %z = fmul <2 x double> %t, %x
   ret <2 x double> %z
 }
+
+; CHECK: bar:
+; CHECK: movupd
+; CHECK: ret
diff --git a/test/CodeGen/X86/sse-domains.ll b/test/CodeGen/X86/sse-domains.ll
index c99287bdfb..168959a5d6 100644
--- a/test/CodeGen/X86/sse-domains.ll
+++ b/test/CodeGen/X86/sse-domains.ll
@@ -55,10 +55,10 @@ while.end:
 ; instructions, they are still dependent on themselves.
 ; CHECK: xorps [[XMM1:%xmm[0-9]+]]
 ; CHECK: , [[XMM1]]
-; CHECK: cvtsi2ss %{{.*}}, [[XMM1]]
+; CHECK: cvtsi2ssl %{{.*}}, [[XMM1]]
 ; CHECK: xorps [[XMM2:%xmm[0-9]+]]
 ; CHECK: , [[XMM2]]
-; CHECK: cvtsi2ss %{{.*}}, [[XMM2]]
+; CHECK: cvtsi2ssl %{{.*}}, [[XMM2]]
 ;
 define float @f2(i32 %m) nounwind uwtable readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll
new file mode 100644
index 0000000000..0466d60ec3
--- /dev/null
+++ b/test/CodeGen/X86/sse2-mul.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) { ; <4 x i32> multiply, compiled for core2 per the RUN line
+  %m = mul <4 x i32> %x, %y ; the directives below expect two pshufd/pmuludq pairs followed by a shufps/pshufd recombine
+  ret <4 x i32> %m
+; CHECK: test1:
+; CHECK: pshufd $49
+; CHECK: pmuludq
+; CHECK: pshufd $49
+; CHECK: pmuludq
+; CHECK: shufps $-120
+; CHECK: pshufd $-40
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/store_op_load_fold.ll b/test/CodeGen/X86/store_op_load_fold.ll
index 6e47eb397d..070cccdb87 100644
--- a/test/CodeGen/X86/store_op_load_fold.ll
+++ b/test/CodeGen/X86/store_op_load_fold.ll
@@ -1,13 +1,30 @@
-; RUN: llc < %s -march=x86 | not grep mov
+; RUN: llc < %s -mtriple=i686-darwin | FileCheck %s
 ;
 ; Test the add and load are folded into the store instruction.
 
 @X = internal global i16 0              ; <i16*> [#uses=2]
 
 define void @foo() nounwind {
+; CHECK: foo:
+; CHECK-NOT: mov
+; CHECK: add
+; CHECK-NEXT: ret
         %tmp.0 = load i16* @X           ; <i16> [#uses=1]
         %tmp.3 = add i16 %tmp.0, 329            ; <i16> [#uses=1]
         store i16 %tmp.3, i16* @X
         ret void
 }
 
+; rdar://12838504
+%struct.S2 = type { i64, i16, [2 x i8], i8, [3 x i8], [7 x i8], i8, [8 x i8] }
+@s2 = external global %struct.S2, align 16
+define void @test2() nounwind uwtable ssp { ; read-modify-write of an i56 overlaying field 5 ([7 x i8]) of @s2
+; CHECK: test2:
+; CHECK: mov
+; CHECK-NEXT: and
+; CHECK-NEXT: ret
+  %bf.load35 = load i56* bitcast ([7 x i8]* getelementptr inbounds (%struct.S2* @s2, i32 0, i32 5) to i56*), align 16
+  %bf.clear36 = and i56 %bf.load35, -1125895611875329 ; mask is ~(2^50 - 2^32): clears bits 32..49, keeps the rest
+  store i56 %bf.clear36, i56* bitcast ([7 x i8]* getelementptr inbounds (%struct.S2* @s2, i32 0, i32 5) to i56*), align 16 ; same address as the load above
+  ret void
+}
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
new file mode 100644
index 0000000000..abb4b39bd6
--- /dev/null
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+;CHECK: and_masks
+;CHECK: vmovups
+;CHECK: vcmpltp
+;CHECK: vcmpltp
+;CHECK: vandps
+;CHECK: vandps
+;CHECK: vmovups
+;CHECK: ret
+
+define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { ; ANDs two <8 x i1> compare results and widens the result to <8 x i32>
+  %v0 = load <8 x float>* %a, align 16
+  %v1 = load <8 x float>* %b, align 16
+  %m0 = fcmp olt <8 x float> %v1, %v0 ; first mask: b < a
+  %v2 = load <8 x float>* %c, align 16
+  %m1 = fcmp olt <8 x float> %v2, %v0 ; second mask: c < a
+  %mand = and <8 x i1> %m1, %m0 ; combine both masks lane by lane
+  %r = zext <8 x i1> %mand to <8 x i32> ; each lane becomes 0 or 1
+  store <8 x i32> %r, <8 x i32>* undef, align 16 ; destination pointer is undef
+  ret void
+}
+
+;CHECK: neg_masks
+;CHECK: vcmpltps
+;CHECK: vxorps
+;CHECK: vandps
+;CHECK: vmovups
+;CHECK: ret
+define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { ; inverts an <8 x i1> compare result and widens it to <8 x i32>; %c is not used
+  %v0 = load <8 x float>* %a, align 16
+  %v1 = load <8 x float>* %b, align 16
+  %m0 = fcmp olt <8 x float> %v1, %v0 ; mask: b < a
+  %mand = xor <8 x i1> %m0, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1> ; xor with all-ones flips every lane of the mask
+  %r = zext <8 x i1> %mand to <8 x i32> ; each lane becomes 0 or 1
+  store <8 x i32> %r, <8 x i32>* undef, align 16 ; destination pointer is undef
+  ret void
+}
+
diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll
index 367dd27f30..b6d91a3f77 100644
--- a/test/CodeGen/X86/vec_compare.ll
+++ b/test/CodeGen/X86/vec_compare.ll
@@ -41,3 +41,27 @@ define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind {
         %D = sext <4 x i1> %C to <4 x i32>
 	ret <4 x i32> %D
 }
+
+define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) nounwind { ; 64-bit lane equality built from pcmpeqd, a dword swap (pshufd $-79 is 0xB1) and pand
+; CHECK: test5:
+; CHECK: pcmpeqd
+; CHECK: pshufd $-79
+; CHECK: pand
+; CHECK: ret
+	%C = icmp eq <2 x i64> %A, %B
+	%D = sext <2 x i1> %C to <2 x i64> ; each lane becomes all-ones or zero
+	ret <2 x i64> %D
+}
+
+define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) nounwind { ; 64-bit lane inequality: same sequence as test5 plus pcmpeqd/pxor (presumably an all-ones constant used to invert the mask)
+; CHECK: test6:
+; CHECK: pcmpeqd
+; CHECK: pshufd $-79
+; CHECK: pand
+; CHECK: pcmpeqd
+; CHECK: pxor
+; CHECK: ret
+	%C = icmp ne <2 x i64> %A, %B
+	%D = sext <2 x i1> %C to <2 x i64> ; each lane becomes all-ones or zero
+	ret <2 x i64> %D
+}
diff --git a/test/CodeGen/X86/vec_sdiv_to_shift.ll b/test/CodeGen/X86/vec_sdiv_to_shift.ll
new file mode 100644
index 0000000000..35e052d97b
--- /dev/null
+++ b/test/CodeGen/X86/vec_sdiv_to_shift.ll
@@ -0,0 +1,72 @@
+; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=+avx2 | FileCheck %s
+
+
+define <8 x i16> @sdiv_vec8x16(<8 x i16> %var) { ; signed divide of <8 x i16> by 32 (2^5), expected to lower to shifts and an add
+entry:
+; CHECK: sdiv_vec8x16
+; CHECK: vpsraw  $15
+; CHECK: vpsrlw  $11
+; CHECK: vpaddw
+; CHECK: vpsraw  $5
+; CHECK: ret
+  %0 = sdiv <8 x i16> %var, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32> ; sign mask (>>15), logical >>11 to form the rounding bias, add, arithmetic >>5
+  ret <8 x i16> %0
+}
+
+define <4 x i32> @sdiv_zero(<4 x i32> %var) { ; signed divide by an all-zero vector must not be rewritten into the shift sequence
+entry:
+; CHECK: sdiv_zero
+; CHECK-NOT: sra
+; CHECK: ret
+  %0 = sdiv <4 x i32> %var, <i32 0, i32 0, i32 0, i32 0> ; zero divisor is not a power of two; no arithmetic shift may appear before the ret
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @sdiv_vec4x32(<4 x i32> %var) { ; signed divide of <4 x i32> by 16 (2^4), expected to lower to shifts and an add
+entry:
+; CHECK: sdiv_vec4x32
+; CHECK: vpsrad $31
+; CHECK: vpsrld $28
+; CHECK: vpaddd
+; CHECK: vpsrad $4
+; CHECK: ret
+%0 = sdiv <4 x i32> %var, <i32 16, i32 16, i32 16, i32 16> ; sign mask (>>31), logical >>28 to form the rounding bias, add, arithmetic >>4
+ret <4 x i32> %0
+}
+
+define <4 x i32> @sdiv_negative(<4 x i32> %var) { ; signed divide of <4 x i32> by -16: same shift sequence as for +16, then a subtract
+entry:
+; CHECK: sdiv_negative
+; CHECK: vpsrad $31
+; CHECK: vpsrld $28
+; CHECK: vpaddd
+; CHECK: vpsrad $4
+; CHECK: vpsubd
+; CHECK: ret
+%0 = sdiv <4 x i32> %var, <i32 -16, i32 -16, i32 -16, i32 -16> ; the trailing vpsubd presumably negates the quotient (0 - q) - confirm against the lowering
+ret <4 x i32> %0
+}
+
+define <8 x i32> @sdiv8x32(<8 x i32> %var) { ; signed divide of <8 x i32> by 64 (2^6), expected to lower to shifts and an add
+entry:
+; CHECK: sdiv8x32
+; CHECK: vpsrad $31
+; CHECK: vpsrld $26
+; CHECK: vpaddd
+; CHECK: vpsrad $6
+; CHECK: ret
+%0 = sdiv <8 x i32> %var, <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64> ; sign mask (>>31), logical >>26 to form the rounding bias, add, arithmetic >>6
+ret <8 x i32> %0
+}
+
+define <16 x i16> @sdiv16x16(<16 x i16> %var) { ; signed divide of <16 x i16> by 4 (2^2), expected to lower to shifts and an add
+entry:
+; CHECK: sdiv16x16
+; CHECK: vpsraw  $15
+; CHECK: vpsrlw  $14
+; CHECK: vpaddw
+; CHECK: vpsraw  $2
+; CHECK: ret
+  %a0 = sdiv <16 x i16> %var, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4> ; sign mask (>>15), logical >>14 to form the rounding bias, add, arithmetic >>2
+  ret <16 x i16> %a0
+}
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index 3476e36c64..d08e2a0746 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s
-; RUN: opt -instsimplify %s -disable-output
+; RUN: opt -instsimplify -disable-output < %s
 
 ;CHECK: AGEP0:
 define <4 x i32*> @AGEP0(i32* %ptr) nounwind {
diff --git a/test/CodeGen/X86/vselect-minmax.ll b/test/CodeGen/X86/vselect-minmax.ll
new file mode 100644
index 0000000000..cf654b6f20
--- /dev/null
+++ b/test/CodeGen/X86/vselect-minmax.ll
@@ -0,0 +1,2788 @@
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE4
+; RUN: llc -march=x86-64 -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
+; RUN: llc -march=x86-64 -mcpu=core-avx2 -mattr=+avx2 < %s | FileCheck %s -check-prefix=AVX2
+
+define void @test1(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp slt <16 x i8> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.a, <16 x i8> %load.b
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test1:
+; SSE4: pminsb
+
+; AVX1: test1:
+; AVX1: vpminsb
+
+; AVX2: test1:
+; AVX2: vpminsb
+}
+
+define void @test2(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp sle <16 x i8> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.a, <16 x i8> %load.b
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test2:
+; SSE4: pminsb
+
+; AVX1: test2:
+; AVX1: vpminsb
+
+; AVX2: test2:
+; AVX2: vpminsb
+}
+
+define void @test3(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp sgt <16 x i8> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.a, <16 x i8> %load.b
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test3:
+; SSE4: pmaxsb
+
+; AVX1: test3:
+; AVX1: vpmaxsb
+
+; AVX2: test3:
+; AVX2: vpmaxsb
+}
+
+define void @test4(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp sge <16 x i8> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.a, <16 x i8> %load.b
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test4:
+; SSE4: pmaxsb
+
+; AVX1: test4:
+; AVX1: vpmaxsb
+
+; AVX2: test4:
+; AVX2: vpmaxsb
+}
+
+define void @test5(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp ult <16 x i8> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.a, <16 x i8> %load.b
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test5:
+; SSE2: pminub
+
+; AVX1: test5:
+; AVX1: vpminub
+
+; AVX2: test5:
+; AVX2: vpminub
+}
+
+define void @test6(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp ule <16 x i8> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.a, <16 x i8> %load.b
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test6:
+; SSE2: pminub
+
+; AVX1: test6:
+; AVX1: vpminub
+
+; AVX2: test6:
+; AVX2: vpminub
+}
+
+define void @test7(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp ugt <16 x i8> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.a, <16 x i8> %load.b
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test7:
+; SSE2: pmaxub
+
+; AVX1: test7:
+; AVX1: vpmaxub
+
+; AVX2: test7:
+; AVX2: vpmaxub
+}
+
+define void @test8(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp uge <16 x i8> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.a, <16 x i8> %load.b
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test8:
+; SSE2: pmaxub
+
+; AVX1: test8:
+; AVX1: vpmaxub
+
+; AVX2: test8:
+; AVX2: vpmaxub
+}
+
+define void @test9(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp slt <8 x i16> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.a, <8 x i16> %load.b
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test9:
+; SSE2: pminsw
+
+; AVX1: test9:
+; AVX1: vpminsw
+
+; AVX2: test9:
+; AVX2: vpminsw
+}
+
+define void @test10(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp sle <8 x i16> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.a, <8 x i16> %load.b
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test10:
+; SSE2: pminsw
+
+; AVX1: test10:
+; AVX1: vpminsw
+
+; AVX2: test10:
+; AVX2: vpminsw
+}
+
+define void @test11(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp sgt <8 x i16> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.a, <8 x i16> %load.b
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test11:
+; SSE2: pmaxsw
+
+; AVX1: test11:
+; AVX1: vpmaxsw
+
+; AVX2: test11:
+; AVX2: vpmaxsw
+}
+
+define void @test12(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp sge <8 x i16> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.a, <8 x i16> %load.b
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test12:
+; SSE2: pmaxsw
+
+; AVX1: test12:
+; AVX1: vpmaxsw
+
+; AVX2: test12:
+; AVX2: vpmaxsw
+}
+
+define void @test13(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp ult <8 x i16> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.a, <8 x i16> %load.b
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test13:
+; SSE4: pminuw
+
+; AVX1: test13:
+; AVX1: vpminuw
+
+; AVX2: test13:
+; AVX2: vpminuw
+}
+
+define void @test14(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp ule <8 x i16> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.a, <8 x i16> %load.b
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test14:
+; SSE4: pminuw
+
+; AVX1: test14:
+; AVX1: vpminuw
+
+; AVX2: test14:
+; AVX2: vpminuw
+}
+
+define void @test15(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp ugt <8 x i16> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.a, <8 x i16> %load.b
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test15:
+; SSE4: pmaxuw
+
+; AVX1: test15:
+; AVX1: vpmaxuw
+
+; AVX2: test15:
+; AVX2: vpmaxuw
+}
+
+define void @test16(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp uge <8 x i16> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.a, <8 x i16> %load.b
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test16:
+; SSE4: pmaxuw
+
+; AVX1: test16:
+; AVX1: vpmaxuw
+
+; AVX2: test16:
+; AVX2: vpmaxuw
+}
+
+define void @test17(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp slt <4 x i32> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.a, <4 x i32> %load.b
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 4
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test17:
+; SSE4: pminsd
+
+; AVX1: test17:
+; AVX1: vpminsd
+
+; AVX2: test17:
+; AVX2: vpminsd
+}
+
+define void @test18(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp sle <4 x i32> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.a, <4 x i32> %load.b
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 4
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test18:
+; SSE4: pminsd
+
+; AVX1: test18:
+; AVX1: vpminsd
+
+; AVX2: test18:
+; AVX2: vpminsd
+}
+
+define void @test19(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp sgt <4 x i32> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.a, <4 x i32> %load.b
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 4
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test19:
+; SSE4: pmaxsd
+
+; AVX1: test19:
+; AVX1: vpmaxsd
+
+; AVX2: test19:
+; AVX2: vpmaxsd
+}
+
+define void @test20(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp sge <4 x i32> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.a, <4 x i32> %load.b
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 4
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test20:
+; SSE4: pmaxsd
+
+; AVX1: test20:
+; AVX1: vpmaxsd
+
+; AVX2: test20:
+; AVX2: vpmaxsd
+}
+
+define void @test21(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp ult <4 x i32> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.a, <4 x i32> %load.b
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 4
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test21:
+; SSE4: pminud
+
+; AVX1: test21:
+; AVX1: vpminud
+
+; AVX2: test21:
+; AVX2: vpminud
+}
+
+define void @test22(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = min(a, b), unsigned, non-strict compare, one <4 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp ule <4 x i32> %load.a, %load.b ; per-lane unsigned a <= b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.a, <4 x i32> %load.b ; true lane -> a, false lane -> b, i.e. min (equal lanes give the same value either way)
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 4 ; advance by the 4 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test22:
+; SSE4: pminud
+
+; AVX1: test22:
+; AVX1: vpminud
+
+; AVX2: test22:
+; AVX2: vpminud
+}
+
+define void @test23(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, one <4 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp ugt <4 x i32> %load.a, %load.b ; per-lane unsigned a > b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.a, <4 x i32> %load.b ; true lane -> a, false lane -> b, i.e. max
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 4 ; advance by the 4 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test23:
+; SSE4: pmaxud
+
+; AVX1: test23:
+; AVX1: vpmaxud
+
+; AVX2: test23:
+; AVX2: vpmaxud
+}
+
+define void @test24(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, non-strict compare, one <4 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp uge <4 x i32> %load.a, %load.b ; per-lane unsigned a >= b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.a, <4 x i32> %load.b ; true lane -> a, false lane -> b, i.e. max (equal lanes give the same value either way)
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 4 ; advance by the 4 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test24:
+; SSE4: pmaxud
+
+; AVX1: test24:
+; AVX1: vpmaxud
+
+; AVX2: test24:
+; AVX2: vpmaxud
+}
+
+define void @test25(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = min(a, b), signed, one <32 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp slt <32 x i8> %load.a, %load.b ; per-lane signed a < b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.a, <32 x i8> %load.b ; true lane -> a, false lane -> b, i.e. min
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 32 ; advance by the 32 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test25:
+; AVX2: vpminsb
+}
+
+define void @test26(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = min(a, b), signed, non-strict compare, one <32 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp sle <32 x i8> %load.a, %load.b ; per-lane signed a <= b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.a, <32 x i8> %load.b ; true lane -> a, false lane -> b, i.e. min (equal lanes give the same value either way)
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 32 ; advance by the 32 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test26:
+; AVX2: vpminsb
+}
+
+define void @test27(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = max(a, b), signed, one <32 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp sgt <32 x i8> %load.a, %load.b ; per-lane signed a > b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.a, <32 x i8> %load.b ; true lane -> a, false lane -> b, i.e. max
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 32 ; advance by the 32 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test27:
+; AVX2: vpmaxsb
+}
+
+define void @test28(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = max(a, b), signed, non-strict compare, one <32 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp sge <32 x i8> %load.a, %load.b ; per-lane signed a >= b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.a, <32 x i8> %load.b ; true lane -> a, false lane -> b, i.e. max (equal lanes give the same value either way)
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 32 ; advance by the 32 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test28:
+; AVX2: vpmaxsb
+}
+
+define void @test29(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = min(a, b), unsigned, one <32 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp ult <32 x i8> %load.a, %load.b ; per-lane unsigned a < b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.a, <32 x i8> %load.b ; true lane -> a, false lane -> b, i.e. min
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 32 ; advance by the 32 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test29:
+; AVX2: vpminub
+}
+
+define void @test30(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = min(a, b), unsigned, non-strict compare, one <32 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp ule <32 x i8> %load.a, %load.b ; per-lane unsigned a <= b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.a, <32 x i8> %load.b ; true lane -> a, false lane -> b, i.e. min (equal lanes give the same value either way)
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 32 ; advance by the 32 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test30:
+; AVX2: vpminub
+}
+
+define void @test31(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, one <32 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp ugt <32 x i8> %load.a, %load.b ; per-lane unsigned a > b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.a, <32 x i8> %load.b ; true lane -> a, false lane -> b, i.e. max
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 32 ; advance by the 32 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test31:
+; AVX2: vpmaxub
+}
+
+define void @test32(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, non-strict compare, one <32 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp uge <32 x i8> %load.a, %load.b ; per-lane unsigned a >= b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.a, <32 x i8> %load.b ; true lane -> a, false lane -> b, i.e. max (equal lanes give the same value either way)
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 32 ; advance by the 32 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test32:
+; AVX2: vpmaxub
+}
+
+define void @test33(i16* nocapture %a, i16* nocapture %b) nounwind { ; in-place a = min(a, b), signed, one <16 x i16> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp slt <16 x i16> %load.a, %load.b ; per-lane signed a < b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.a, <16 x i16> %load.b ; true lane -> a, false lane -> b, i.e. min
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test33:
+; AVX2: vpminsw
+}
+
+define void @test34(i16* nocapture %a, i16* nocapture %b) nounwind { ; in-place a = min(a, b), signed, non-strict compare, one <16 x i16> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp sle <16 x i16> %load.a, %load.b ; per-lane signed a <= b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.a, <16 x i16> %load.b ; true lane -> a, false lane -> b, i.e. min (equal lanes give the same value either way)
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test34:
+; AVX2: vpminsw
+}
+
+define void @test35(i16* nocapture %a, i16* nocapture %b) nounwind { ; in-place a = max(a, b), signed, one <16 x i16> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp sgt <16 x i16> %load.a, %load.b ; per-lane signed a > b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.a, <16 x i16> %load.b ; true lane -> a, false lane -> b, i.e. max
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test35:
+; AVX2: vpmaxsw
+}
+
+define void @test36(i16* nocapture %a, i16* nocapture %b) nounwind { ; in-place a = max(a, b), signed, non-strict compare, one <16 x i16> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp sge <16 x i16> %load.a, %load.b ; per-lane signed a >= b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.a, <16 x i16> %load.b ; true lane -> a, false lane -> b, i.e. max (equal lanes give the same value either way)
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test36:
+; AVX2: vpmaxsw
+}
+
+define void @test37(i16* nocapture %a, i16* nocapture %b) nounwind { ; in-place a = min(a, b), unsigned, one <16 x i16> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp ult <16 x i16> %load.a, %load.b ; per-lane unsigned a < b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.a, <16 x i16> %load.b ; true lane -> a, false lane -> b, i.e. min
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test37:
+; AVX2: vpminuw
+}
+
+define void @test38(i16* nocapture %a, i16* nocapture %b) nounwind { ; in-place a = min(a, b), unsigned, non-strict compare, one <16 x i16> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp ule <16 x i16> %load.a, %load.b ; per-lane unsigned a <= b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.a, <16 x i16> %load.b ; true lane -> a, false lane -> b, i.e. min (equal lanes give the same value either way)
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test38:
+; AVX2: vpminuw
+}
+
+define void @test39(i16* nocapture %a, i16* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, one <16 x i16> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp ugt <16 x i16> %load.a, %load.b ; per-lane unsigned a > b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.a, <16 x i16> %load.b ; true lane -> a, false lane -> b, i.e. max
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test39:
+; AVX2: vpmaxuw
+}
+
+define void @test40(i16* nocapture %a, i16* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, non-strict compare, one <16 x i16> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp uge <16 x i16> %load.a, %load.b ; per-lane unsigned a >= b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.a, <16 x i16> %load.b ; true lane -> a, false lane -> b, i.e. max (equal lanes give the same value either way)
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test40:
+; AVX2: vpmaxuw
+}
+
+define void @test41(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = min(a, b), signed, one <8 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp slt <8 x i32> %load.a, %load.b ; per-lane signed a < b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.a, <8 x i32> %load.b ; true lane -> a, false lane -> b, i.e. min
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 8 ; advance by the 8 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test41:
+; AVX2: vpminsd
+}
+
+define void @test42(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = min(a, b), signed, non-strict compare, one <8 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp sle <8 x i32> %load.a, %load.b ; per-lane signed a <= b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.a, <8 x i32> %load.b ; true lane -> a, false lane -> b, i.e. min (equal lanes give the same value either way)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 8 ; advance by the 8 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test42:
+; AVX2: vpminsd
+}
+
+define void @test43(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = max(a, b), signed, one <8 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp sgt <8 x i32> %load.a, %load.b ; per-lane signed a > b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.a, <8 x i32> %load.b ; true lane -> a, false lane -> b, i.e. max
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 8 ; advance by the 8 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test43:
+; AVX2: vpmaxsd
+}
+
+define void @test44(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = max(a, b), signed, non-strict compare, one <8 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp sge <8 x i32> %load.a, %load.b ; per-lane signed a >= b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.a, <8 x i32> %load.b ; true lane -> a, false lane -> b, i.e. max (equal lanes give the same value either way)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 8 ; advance by the 8 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test44:
+; AVX2: vpmaxsd
+}
+
+define void @test45(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = min(a, b), unsigned, one <8 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp ult <8 x i32> %load.a, %load.b ; per-lane unsigned a < b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.a, <8 x i32> %load.b ; true lane -> a, false lane -> b, i.e. min
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 8 ; advance by the 8 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test45:
+; AVX2: vpminud
+}
+
+define void @test46(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = min(a, b), unsigned, non-strict compare, one <8 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp ule <8 x i32> %load.a, %load.b ; per-lane unsigned a <= b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.a, <8 x i32> %load.b ; true lane -> a, false lane -> b, i.e. min (equal lanes give the same value either way)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 8 ; advance by the 8 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test46:
+; AVX2: vpminud
+}
+
+define void @test47(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, one <8 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp ugt <8 x i32> %load.a, %load.b ; per-lane unsigned a > b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.a, <8 x i32> %load.b ; true lane -> a, false lane -> b, i.e. max
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 8 ; advance by the 8 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test47:
+; AVX2: vpmaxud
+}
+
+define void @test48(i32* nocapture %a, i32* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, non-strict compare, one <8 x i32> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; align 2 is below the 32-byte vector size
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp uge <8 x i32> %load.a, %load.b ; per-lane unsigned a >= b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.a, <8 x i32> %load.b ; true lane -> a, false lane -> b, i.e. max (equal lanes give the same value either way)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 8 ; advance by the 8 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test48:
+; AVX2: vpmaxud
+}
+
+define void @test49(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = max(a, b), signed, written as a less-than compare with swapped select operands, one <16 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp slt <16 x i8> %load.a, %load.b ; per-lane signed a < b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.b, <16 x i8> %load.a ; operands swapped: true lane -> b, false lane -> a, i.e. max
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test49:
+; SSE4: pmaxsb
+
+; AVX1: test49:
+; AVX1: vpmaxsb
+
+; AVX2: test49:
+; AVX2: vpmaxsb
+}
+
+define void @test50(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = max(a, b), signed, written as a less-or-equal compare with swapped select operands, one <16 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp sle <16 x i8> %load.a, %load.b ; per-lane signed a <= b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.b, <16 x i8> %load.a ; operands swapped: true lane -> b, false lane -> a, i.e. max (equal lanes give the same value either way)
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test50:
+; SSE4: pmaxsb
+
+; AVX1: test50:
+; AVX1: vpmaxsb
+
+; AVX2: test50:
+; AVX2: vpmaxsb
+}
+
+define void @test51(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = min(a, b), signed, written as a greater-than compare with swapped select operands, one <16 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp sgt <16 x i8> %load.a, %load.b ; per-lane signed a > b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.b, <16 x i8> %load.a ; operands swapped: true lane -> b, false lane -> a, i.e. min
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test51:
+; SSE4: pminsb
+
+; AVX1: test51:
+; AVX1: vpminsb
+
+; AVX2: test51:
+; AVX2: vpminsb
+}
+
+define void @test52(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = min(a, b), signed, written as a greater-or-equal compare with swapped select operands, one <16 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp sge <16 x i8> %load.a, %load.b ; per-lane signed a >= b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.b, <16 x i8> %load.a ; operands swapped: true lane -> b, false lane -> a, i.e. min (equal lanes give the same value either way)
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test52:
+; SSE4: pminsb
+
+; AVX1: test52:
+; AVX1: vpminsb
+
+; AVX2: test52:
+; AVX2: vpminsb
+}
+
+define void @test53(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, written as a less-than compare with swapped select operands, one <16 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp ult <16 x i8> %load.a, %load.b ; per-lane unsigned a < b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.b, <16 x i8> %load.a ; operands swapped: true lane -> b, false lane -> a, i.e. max
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test53:
+; SSE2: pmaxub
+
+; AVX1: test53:
+; AVX1: vpmaxub
+
+; AVX2: test53:
+; AVX2: vpmaxub
+}
+
+define void @test54(i8* nocapture %a, i8* nocapture %b) nounwind { ; in-place a = max(a, b), unsigned, written as a less-or-equal compare with swapped select operands, one <16 x i8> vector per iteration
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2 ; align 2 is below the 16-byte vector size
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp ule <16 x i8> %load.a, %load.b ; per-lane unsigned a <= b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.b, <16 x i8> %load.a ; operands swapped: true lane -> b, false lane -> a, i.e. max (equal lanes give the same value either way)
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2 ; result overwrites the vector read from a
+  %index.next = add i64 %index, 16 ; advance by the 16 lanes just processed
+  %loop = icmp eq i64 %index.next, 16384 ; stop after 16384 elements
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test54:
+; SSE2: pmaxub
+
+; AVX1: test54:
+; AVX1: vpmaxub
+
+; AVX2: test54:
+; AVX2: vpmaxub
+}
+
+define void @test55(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned min(a, b) per <16 x i8> vector; checks below expect (v)pminub
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp ugt <16 x i8> %load.a, %load.b  ; per-lane unsigned a > b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.b, <16 x i8> %load.a  ; b if a > b, else a: unsigned min
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 16  ; step equals the 16 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test55:
+; SSE2: pminub
+
+; AVX1: test55:
+; AVX1: vpminub
+
+; AVX2: test55:
+; AVX2: vpminub
+}
+
+define void @test56(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned min(a, b) per <16 x i8> vector; checks below expect (v)pminub
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <16 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <16 x i8>*
+  %load.a = load <16 x i8>* %ptr.a, align 2
+  %load.b = load <16 x i8>* %ptr.b, align 2
+  %cmp = icmp uge <16 x i8> %load.a, %load.b  ; per-lane unsigned a >= b
+  %sel = select <16 x i1> %cmp, <16 x i8> %load.b, <16 x i8> %load.a  ; b if a >= b, else a: unsigned min
+  store <16 x i8> %sel, <16 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 16  ; step equals the 16 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test56:
+; SSE2: pminub
+
+; AVX1: test56:
+; AVX1: vpminub
+
+; AVX2: test56:
+; AVX2: vpminub
+}
+
+define void @test57(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed max(a, b) per <8 x i16> vector; checks below expect (v)pmaxsw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp slt <8 x i16> %load.a, %load.b  ; per-lane signed a < b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.b, <8 x i16> %load.a  ; b if a < b, else a: signed max
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 8  ; step equals the 8 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test57:
+; SSE2: pmaxsw
+
+; AVX1: test57:
+; AVX1: vpmaxsw
+
+; AVX2: test57:
+; AVX2: vpmaxsw
+}
+
+define void @test58(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed max(a, b) per <8 x i16> vector; checks below expect (v)pmaxsw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp sle <8 x i16> %load.a, %load.b  ; per-lane signed a <= b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.b, <8 x i16> %load.a  ; b if a <= b, else a: signed max
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 8  ; step equals the 8 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test58:
+; SSE2: pmaxsw
+
+; AVX1: test58:
+; AVX1: vpmaxsw
+
+; AVX2: test58:
+; AVX2: vpmaxsw
+}
+
+define void @test59(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed min(a, b) per <8 x i16> vector; checks below expect (v)pminsw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp sgt <8 x i16> %load.a, %load.b  ; per-lane signed a > b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.b, <8 x i16> %load.a  ; b if a > b, else a: signed min
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 8  ; step equals the 8 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test59:
+; SSE2: pminsw
+
+; AVX1: test59:
+; AVX1: vpminsw
+
+; AVX2: test59:
+; AVX2: vpminsw
+}
+
+define void @test60(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed min(a, b) per <8 x i16> vector; checks below expect (v)pminsw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp sge <8 x i16> %load.a, %load.b  ; per-lane signed a >= b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.b, <8 x i16> %load.a  ; b if a >= b, else a: signed min
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 8  ; step equals the 8 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE2: test60:
+; SSE2: pminsw
+
+; AVX1: test60:
+; AVX1: vpminsw
+
+; AVX2: test60:
+; AVX2: vpminsw
+}
+
+define void @test61(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned max(a, b) per <8 x i16> vector; checks below expect (v)pmaxuw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp ult <8 x i16> %load.a, %load.b  ; per-lane unsigned a < b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.b, <8 x i16> %load.a  ; b if a < b, else a: unsigned max
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 8  ; step equals the 8 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test61:
+; SSE4: pmaxuw
+
+; AVX1: test61:
+; AVX1: vpmaxuw
+
+; AVX2: test61:
+; AVX2: vpmaxuw
+}
+
+define void @test62(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned max(a, b) per <8 x i16> vector; checks below expect (v)pmaxuw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp ule <8 x i16> %load.a, %load.b  ; per-lane unsigned a <= b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.b, <8 x i16> %load.a  ; b if a <= b, else a: unsigned max
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 8  ; step equals the 8 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test62:
+; SSE4: pmaxuw
+
+; AVX1: test62:
+; AVX1: vpmaxuw
+
+; AVX2: test62:
+; AVX2: vpmaxuw
+}
+
+define void @test63(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned min(a, b) per <8 x i16> vector; checks below expect (v)pminuw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp ugt <8 x i16> %load.a, %load.b  ; per-lane unsigned a > b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.b, <8 x i16> %load.a  ; b if a > b, else a: unsigned min
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 8  ; step equals the 8 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test63:
+; SSE4: pminuw
+
+; AVX1: test63:
+; AVX1: vpminuw
+
+; AVX2: test63:
+; AVX2: vpminuw
+}
+
+define void @test64(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned min(a, b) per <8 x i16> vector; checks below expect (v)pminuw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <8 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <8 x i16>*
+  %load.a = load <8 x i16>* %ptr.a, align 2
+  %load.b = load <8 x i16>* %ptr.b, align 2
+  %cmp = icmp uge <8 x i16> %load.a, %load.b  ; per-lane unsigned a >= b
+  %sel = select <8 x i1> %cmp, <8 x i16> %load.b, <8 x i16> %load.a  ; b if a >= b, else a: unsigned min
+  store <8 x i16> %sel, <8 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 8  ; step equals the 8 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test64:
+; SSE4: pminuw
+
+; AVX1: test64:
+; AVX1: vpminuw
+
+; AVX2: test64:
+; AVX2: vpminuw
+}
+
+define void @test65(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed max(a, b) per <4 x i32> vector; checks below expect (v)pmaxsd
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp slt <4 x i32> %load.a, %load.b  ; per-lane signed a < b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.b, <4 x i32> %load.a  ; b if a < b, else a: signed max
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 4  ; step equals the 4 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test65:
+; SSE4: pmaxsd
+
+; AVX1: test65:
+; AVX1: vpmaxsd
+
+; AVX2: test65:
+; AVX2: vpmaxsd
+}
+
+define void @test66(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed max(a, b) per <4 x i32> vector; checks below expect (v)pmaxsd
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp sle <4 x i32> %load.a, %load.b  ; per-lane signed a <= b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.b, <4 x i32> %load.a  ; b if a <= b, else a: signed max
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 4  ; step equals the 4 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test66:
+; SSE4: pmaxsd
+
+; AVX1: test66:
+; AVX1: vpmaxsd
+
+; AVX2: test66:
+; AVX2: vpmaxsd
+}
+
+define void @test67(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed min(a, b) per <4 x i32> vector; checks below expect (v)pminsd
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp sgt <4 x i32> %load.a, %load.b  ; per-lane signed a > b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.b, <4 x i32> %load.a  ; b if a > b, else a: signed min
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 4  ; step equals the 4 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test67:
+; SSE4: pminsd
+
+; AVX1: test67:
+; AVX1: vpminsd
+
+; AVX2: test67:
+; AVX2: vpminsd
+}
+
+define void @test68(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed min(a, b) per <4 x i32> vector; checks below expect (v)pminsd
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp sge <4 x i32> %load.a, %load.b  ; per-lane signed a >= b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.b, <4 x i32> %load.a  ; b if a >= b, else a: signed min
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 4  ; step equals the 4 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test68:
+; SSE4: pminsd
+
+; AVX1: test68:
+; AVX1: vpminsd
+
+; AVX2: test68:
+; AVX2: vpminsd
+}
+
+define void @test69(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned max(a, b) per <4 x i32> vector; checks below expect (v)pmaxud
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp ult <4 x i32> %load.a, %load.b  ; per-lane unsigned a < b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.b, <4 x i32> %load.a  ; b if a < b, else a: unsigned max
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 4  ; step equals the 4 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test69:
+; SSE4: pmaxud
+
+; AVX1: test69:
+; AVX1: vpmaxud
+
+; AVX2: test69:
+; AVX2: vpmaxud
+}
+
+define void @test70(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned max(a, b) per <4 x i32> vector; checks below expect (v)pmaxud
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp ule <4 x i32> %load.a, %load.b  ; per-lane unsigned a <= b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.b, <4 x i32> %load.a  ; b if a <= b, else a: unsigned max
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 4  ; step equals the 4 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test70:
+; SSE4: pmaxud
+
+; AVX1: test70:
+; AVX1: vpmaxud
+
+; AVX2: test70:
+; AVX2: vpmaxud
+}
+
+define void @test71(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned min(a, b) per <4 x i32> vector; checks below expect (v)pminud
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp ugt <4 x i32> %load.a, %load.b  ; per-lane unsigned a > b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.b, <4 x i32> %load.a  ; b if a > b, else a: unsigned min
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 4  ; step equals the 4 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test71:
+; SSE4: pminud
+
+; AVX1: test71:
+; AVX1: vpminud
+
+; AVX2: test71:
+; AVX2: vpminud
+}
+
+define void @test72(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned min(a, b) per <4 x i32> vector; checks below expect (v)pminud
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i32>*
+  %load.a = load <4 x i32>* %ptr.a, align 2
+  %load.b = load <4 x i32>* %ptr.b, align 2
+  %cmp = icmp uge <4 x i32> %load.a, %load.b  ; per-lane unsigned a >= b
+  %sel = select <4 x i1> %cmp, <4 x i32> %load.b, <4 x i32> %load.a  ; b if a >= b, else a: unsigned min
+  store <4 x i32> %sel, <4 x i32>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 4  ; step equals the 4 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; SSE4: test72:
+; SSE4: pminud
+
+; AVX1: test72:
+; AVX1: vpminud
+
+; AVX2: test72:
+; AVX2: vpminud
+}
+
+define void @test73(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed max(a, b) per <32 x i8> vector; checks below expect vpmaxsb
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp slt <32 x i8> %load.a, %load.b  ; per-lane signed a < b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.b, <32 x i8> %load.a  ; b if a < b, else a: signed max
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 32  ; step equals the 32 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test73:
+; AVX2: vpmaxsb
+}
+
+define void @test74(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed max(a, b) per <32 x i8> vector; checks below expect vpmaxsb
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp sle <32 x i8> %load.a, %load.b  ; per-lane signed a <= b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.b, <32 x i8> %load.a  ; b if a <= b, else a: signed max
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 32  ; step equals the 32 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test74:
+; AVX2: vpmaxsb
+}
+
+define void @test75(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed min(a, b) per <32 x i8> vector; checks below expect vpminsb
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp sgt <32 x i8> %load.a, %load.b  ; per-lane signed a > b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.b, <32 x i8> %load.a  ; b if a > b, else a: signed min
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 32  ; step equals the 32 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test75:
+; AVX2: vpminsb
+}
+
+define void @test76(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed min(a, b) per <32 x i8> vector; checks below expect vpminsb
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp sge <32 x i8> %load.a, %load.b  ; per-lane signed a >= b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.b, <32 x i8> %load.a  ; b if a >= b, else a: signed min
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 32  ; step equals the 32 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test76:
+; AVX2: vpminsb
+}
+
+define void @test77(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned max(a, b) per <32 x i8> vector; checks below expect vpmaxub
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp ult <32 x i8> %load.a, %load.b  ; per-lane unsigned a < b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.b, <32 x i8> %load.a  ; b if a < b, else a: unsigned max
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 32  ; step equals the 32 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test77:
+; AVX2: vpmaxub
+}
+
+define void @test78(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned max(a, b) per <32 x i8> vector; checks below expect vpmaxub
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp ule <32 x i8> %load.a, %load.b  ; per-lane unsigned a <= b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.b, <32 x i8> %load.a  ; b if a <= b, else a: unsigned max
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 32  ; step equals the 32 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test78:
+; AVX2: vpmaxub
+}
+
+define void @test79(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned min(a, b) per <32 x i8> vector; checks below expect vpminub
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp ugt <32 x i8> %load.a, %load.b  ; per-lane unsigned a > b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.b, <32 x i8> %load.a  ; b if a > b, else a: unsigned min
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 32  ; step equals the 32 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test79:
+; AVX2: vpminub
+}
+
+define void @test80(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned min(a, b) per <32 x i8> vector; checks below expect vpminub
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <32 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <32 x i8>*
+  %load.a = load <32 x i8>* %ptr.a, align 2
+  %load.b = load <32 x i8>* %ptr.b, align 2
+  %cmp = icmp uge <32 x i8> %load.a, %load.b  ; per-lane unsigned a >= b
+  %sel = select <32 x i1> %cmp, <32 x i8> %load.b, <32 x i8> %load.a  ; b if a >= b, else a: unsigned min
+  store <32 x i8> %sel, <32 x i8>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 32  ; step equals the 32 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test80:
+; AVX2: vpminub
+}
+
+define void @test81(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed max(a, b) per <16 x i16> vector; checks below expect vpmaxsw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp slt <16 x i16> %load.a, %load.b  ; per-lane signed a < b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.b, <16 x i16> %load.a  ; b if a < b, else a: signed max
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 16  ; step equals the 16 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test81:
+; AVX2: vpmaxsw
+}
+
+define void @test82(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed max(a, b) per <16 x i16> vector; checks below expect vpmaxsw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp sle <16 x i16> %load.a, %load.b  ; per-lane signed a <= b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.b, <16 x i16> %load.a  ; b if a <= b, else a: signed max
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 16  ; step equals the 16 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test82:
+; AVX2: vpmaxsw
+}
+
+define void @test83(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed min(a, b) per <16 x i16> vector; checks below expect vpminsw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp sgt <16 x i16> %load.a, %load.b  ; per-lane signed a > b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.b, <16 x i16> %load.a  ; b if a > b, else a: signed min
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 16  ; step equals the 16 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test83:
+; AVX2: vpminsw
+}
+
+define void @test84(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = signed min(a, b) per <16 x i16> vector; checks below expect vpminsw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp sge <16 x i16> %load.a, %load.b  ; per-lane signed a >= b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.b, <16 x i16> %load.a  ; b if a >= b, else a: signed min
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 16  ; step equals the 16 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test84:
+; AVX2: vpminsw
+}
+
+define void @test85(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned max(a, b) per <16 x i16> vector; checks below expect vpmaxuw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp ult <16 x i16> %load.a, %load.b  ; per-lane unsigned a < b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.b, <16 x i16> %load.a  ; b if a < b, else a: unsigned max
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 16  ; step equals the 16 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test85:
+; AVX2: vpmaxuw
+}
+
+define void @test86(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:                                        ; a = unsigned max(a, b) per <16 x i16> vector; checks below expect vpmaxuw
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp ule <16 x i16> %load.a, %load.b  ; per-lane unsigned a <= b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.b, <16 x i16> %load.a  ; b if a <= b, else a: unsigned max
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2  ; result overwrites a
+  %index.next = add i64 %index, 16  ; step equals the 16 lanes of one vector
+  %loop = icmp eq i64 %index.next, 16384  ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test86:
+; AVX2: vpmaxuw
+}
+
+define void @test87(i16* nocapture %a, i16* nocapture %b) nounwind { ; a[i] = unsigned min(a[i], b[i]) on <16 x i16> chunks, spelled as icmp ugt + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp ugt <16 x i16> %load.a, %load.b ; lane-wise unsigned a > b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.b, <16 x i16> %load.a ; b where a > b, otherwise a: the unsigned minimum (vpminuw is expected below)
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 16 ; step by the 16 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test87:
+; AVX2: vpminuw
+}
+
+define void @test88(i16* nocapture %a, i16* nocapture %b) nounwind { ; a[i] = unsigned min(a[i], b[i]) on <16 x i16> chunks, spelled as icmp uge + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <16 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <16 x i16>*
+  %load.a = load <16 x i16>* %ptr.a, align 2
+  %load.b = load <16 x i16>* %ptr.b, align 2
+  %cmp = icmp uge <16 x i16> %load.a, %load.b ; lane-wise unsigned a >= b
+  %sel = select <16 x i1> %cmp, <16 x i16> %load.b, <16 x i16> %load.a ; b where a >= b, otherwise a: the unsigned minimum (vpminuw is expected below)
+  store <16 x i16> %sel, <16 x i16>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 16 ; step by the 16 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test88:
+; AVX2: vpminuw
+}
+
+define void @test89(i32* nocapture %a, i32* nocapture %b) nounwind { ; a[i] = signed max(a[i], b[i]) on <8 x i32> chunks, spelled as icmp slt + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; NOTE(review): align 2 on an i32 vector looks inherited from the i16 tests - confirm it is intended
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp slt <8 x i32> %load.a, %load.b ; lane-wise signed a < b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.b, <8 x i32> %load.a ; b where a < b, otherwise a: the signed maximum (vpmaxsd is expected below)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 8 ; step by the 8 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test89:
+; AVX2: vpmaxsd
+}
+
+define void @test90(i32* nocapture %a, i32* nocapture %b) nounwind { ; a[i] = signed max(a[i], b[i]) on <8 x i32> chunks, spelled as icmp sle + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; NOTE(review): align 2 on an i32 vector looks inherited from the i16 tests - confirm it is intended
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp sle <8 x i32> %load.a, %load.b ; lane-wise signed a <= b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.b, <8 x i32> %load.a ; b where a <= b, otherwise a: the signed maximum (vpmaxsd is expected below)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 8 ; step by the 8 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test90:
+; AVX2: vpmaxsd
+}
+
+define void @test91(i32* nocapture %a, i32* nocapture %b) nounwind { ; a[i] = signed min(a[i], b[i]) on <8 x i32> chunks, spelled as icmp sgt + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; NOTE(review): align 2 on an i32 vector looks inherited from the i16 tests - confirm it is intended
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp sgt <8 x i32> %load.a, %load.b ; lane-wise signed a > b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.b, <8 x i32> %load.a ; b where a > b, otherwise a: the signed minimum (vpminsd is expected below)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 8 ; step by the 8 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test91:
+; AVX2: vpminsd
+}
+
+define void @test92(i32* nocapture %a, i32* nocapture %b) nounwind { ; a[i] = signed min(a[i], b[i]) on <8 x i32> chunks, spelled as icmp sge + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; NOTE(review): align 2 on an i32 vector looks inherited from the i16 tests - confirm it is intended
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp sge <8 x i32> %load.a, %load.b ; lane-wise signed a >= b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.b, <8 x i32> %load.a ; b where a >= b, otherwise a: the signed minimum (vpminsd is expected below)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 8 ; step by the 8 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test92:
+; AVX2: vpminsd
+}
+
+define void @test93(i32* nocapture %a, i32* nocapture %b) nounwind { ; a[i] = unsigned max(a[i], b[i]) on <8 x i32> chunks, spelled as icmp ult + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; NOTE(review): align 2 on an i32 vector looks inherited from the i16 tests - confirm it is intended
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp ult <8 x i32> %load.a, %load.b ; lane-wise unsigned a < b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.b, <8 x i32> %load.a ; b where a < b, otherwise a: the unsigned maximum (vpmaxud is expected below)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 8 ; step by the 8 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test93:
+; AVX2: vpmaxud
+}
+
+define void @test94(i32* nocapture %a, i32* nocapture %b) nounwind { ; a[i] = unsigned max(a[i], b[i]) on <8 x i32> chunks, spelled as icmp ule + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; NOTE(review): align 2 on an i32 vector looks inherited from the i16 tests - confirm it is intended
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp ule <8 x i32> %load.a, %load.b ; lane-wise unsigned a <= b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.b, <8 x i32> %load.a ; b where a <= b, otherwise a: the unsigned maximum (vpmaxud is expected below)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 8 ; step by the 8 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test94:
+; AVX2: vpmaxud
+}
+
+define void @test95(i32* nocapture %a, i32* nocapture %b) nounwind { ; a[i] = unsigned min(a[i], b[i]) on <8 x i32> chunks, spelled as icmp ugt + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; NOTE(review): align 2 on an i32 vector looks inherited from the i16 tests - confirm it is intended
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp ugt <8 x i32> %load.a, %load.b ; lane-wise unsigned a > b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.b, <8 x i32> %load.a ; b where a > b, otherwise a: the unsigned minimum (vpminud is expected below)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 8 ; step by the 8 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test95:
+; AVX2: vpminud
+}
+
+define void @test96(i32* nocapture %a, i32* nocapture %b) nounwind { ; a[i] = unsigned min(a[i], b[i]) on <8 x i32> chunks, spelled as icmp uge + select
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i32>*
+  %load.a = load <8 x i32>* %ptr.a, align 2 ; NOTE(review): align 2 on an i32 vector looks inherited from the i16 tests - confirm it is intended
+  %load.b = load <8 x i32>* %ptr.b, align 2
+  %cmp = icmp uge <8 x i32> %load.a, %load.b ; lane-wise unsigned a >= b
+  %sel = select <8 x i1> %cmp, <8 x i32> %load.b, <8 x i32> %load.a ; b where a >= b, otherwise a: the unsigned minimum (vpminud is expected below)
+  store <8 x i32> %sel, <8 x i32>* %ptr.a, align 2 ; result is written back through %ptr.a
+  %index.next = add i64 %index, 8 ; step by the 8 lanes handled per iteration
+  %loop = icmp eq i64 %index.next, 16384 ; leave the loop once the index reaches 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX2: test96:
+; AVX2: vpminud
+}
diff --git a/test/CodeGen/X86/vsplit-and.ll b/test/CodeGen/X86/vsplit-and.ll
index ee98806c0f..3b7fdff84e 100644
--- a/test/CodeGen/X86/vsplit-and.ll
+++ b/test/CodeGen/X86/vsplit-and.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux |  FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn |  FileCheck %s
 
 define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind readonly {
 ; CHECK: t0
diff --git a/test/CodeGen/X86/x86-64-dead-stack-adjust.ll b/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
index 902c9d5ae0..9c01f16f24 100644
--- a/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
+++ b/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mcpu=nehalem | not grep rsp
-; RUN: llc < %s -mcpu=nehalem | grep cvttsd2siq
+; RUN: llc < %s -mcpu=nehalem | grep cvttsd2si
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin8"
diff --git a/test/DebugInfo/2009-11-03-InsertExtractValue.ll b/test/DebugInfo/2009-11-03-InsertExtractValue.ll
index 8782e4446f..933384af74 100644
--- a/test/DebugInfo/2009-11-03-InsertExtractValue.ll
+++ b/test/DebugInfo/2009-11-03-InsertExtractValue.ll
@@ -1,11 +1,15 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 
-!0 = metadata !{i32 42}
+!dbg = !{!0}
+!0 = metadata !{i32 786478, i32 0, metadata !1, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", metadata !1, i32 3, metadata !"nard", i1 false, i1 false, i32 0, i32 0, null, i32 258, i1 false, null, null, i32 0, metadata !1, i32 3} 
+!1 = metadata !{i32 42}
 
 define <{i32, i32}> @f1() {
-; CHECK: !dbgx !0
-  %r = insertvalue <{ i32, i32 }> zeroinitializer, i32 4, 1, !dbgx !0
-; CHECK: !dbgx !0
-  %e = extractvalue <{ i32, i32 }> %r, 0, !dbgx !0
+; CHECK: !dbgx !1
+  %r = insertvalue <{ i32, i32 }> zeroinitializer, i32 4, 1, !dbgx !1 ; node !1 is attached under the !dbgx kind; the directive above expects it in the llvm-dis output
+; CHECK: !dbgx !1
+  %e = extractvalue <{ i32, i32 }> %r, 0, !dbgx !1 ; same attachment on extractvalue; %e itself is never used
   ret <{ i32, i32 }> %r
 }
+
+; CHECK: [protected]
diff --git a/test/CodeGen/X86/2010-08-10-DbgConstant.ll b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
index b3cc35d723..78f8750995 100644
--- a/test/CodeGen/X86/2010-08-10-DbgConstant.ll
+++ b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
@@ -1,6 +1,7 @@
-; RUN: llc  -mtriple=i686-linux -O0 < %s | FileCheck %s
-; CHECK: DW_TAG_constant
-; CHECK-NEXT: .long .Lstring3 #{{#?}} DW_AT_name
+; RUN: llc  -mtriple=i686-linux -O0 -filetype=obj -o %t %s
+; RUN: llvm-dwarfdump %t | FileCheck %s
+; CHECK: DW_TAG_constant [4]
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000002c] = "ro")
 
 define void @foo() nounwind ssp {
 entry:
diff --git a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
index 934fa81435..e514493442 100644
--- a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
+++ b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=x86_64-pc-linux-gnu -asm-verbose %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnu %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
 
 ; ModuleID = 'test.c'
 
@@ -38,10 +39,13 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !18 = metadata !{i32 4, i32 23, metadata !16, null}
 !19 = metadata !{i32 5, i32 5, metadata !16, null}
 
-; CHECK: .long .Lstring3
-; CHECK: .byte	1
-; CHECK: .byte	1
+; CHECK: DW_TAG_variable [3]
+; CHECK: DW_AT_name [DW_FORM_strp]       ( .debug_str[0x00000043] = "GLB")
+; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01)
+; CHECK: DW_AT_decl_line [DW_FORM_data1] (0x01)
+
+; CHECK: DW_TAG_variable [6]
+; CHECK: DW_AT_name [DW_FORM_strp]   ( .debug_str[0x0000004d] = "LOC")
+; CHECK: DW_AT_decl_file [DW_FORM_data1]     (0x01)
+; CHECK: DW_AT_decl_line [DW_FORM_data1]     (0x04)
 
-; CHECK: .long .Lstring6
-; CHECK: .byte	1
-; CHECK: .byte	4
diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll
index 163a1e7cec..b1fbbf771f 100644
--- a/test/DebugInfo/X86/DW_AT_object_pointer.ll
+++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -1,20 +1,25 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
 ; RUN: llvm-dwarfdump %t | FileCheck %s
 
-; CHECK: DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x00bf => {0x000000bf})
-; CHECK: 0x000000bf:     DW_TAG_formal_parameter [12]
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp]     ( .debug_str[0x00000085] = "this")
+; CHECK: DW_TAG_formal_parameter [
+; CHECK: DW_TAG_class_type
+; CHECK: DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x00fd => {0x000000fd})
+; CHECK: 0x000000fd:     DW_TAG_formal_parameter [13]
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]     ( .debug_str[0x00000086] = "this")
 
 %class.A = type { i32 }
 
-define i32 @_Z3foov() nounwind uwtable ssp {
+define i32 @_Z3fooi(i32) nounwind uwtable ssp { ; now takes one unnamed i32 parameter, which is value %0
 entry:
+  %.addr = alloca i32, align 4 ; stack slot for the unnamed parameter
   %a = alloca %class.A, align 4
+  store i32 %0, i32* %.addr, align 4 ; spill the incoming argument into its slot
+  call void @llvm.dbg.declare(metadata !{i32* %.addr}, metadata !36), !dbg !35 ; describe the slot with !36, the nameless DW_TAG_arg_variable entry
   call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !21), !dbg !23
   call void @_ZN1AC1Ev(%class.A* %a), !dbg !24
   %m_a = getelementptr inbounds %class.A* %a, i32 0, i32 0, !dbg !25
-  %0 = load i32* %m_a, align 4, !dbg !25
-  ret i32 %0, !dbg !25
+  %1 = load i32* %m_a, align 4, !dbg !25 ; renumbered from %0 because the unnamed parameter now occupies %0
+  ret i32 %1, !dbg !25
 }
 
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
@@ -47,7 +52,7 @@ entry:
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
 !4 = metadata !{metadata !5, metadata !10, metadata !20}
-!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !6, i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", metadata !6, i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3fooi, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
 !6 = metadata !{i32 786473, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", null} ; [ DW_TAG_file_type ]
 !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
@@ -63,6 +68,8 @@ entry:
 !18 = metadata !{metadata !19}
 !19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !20 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata !"_ZN1AC2Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC2Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!36 = metadata !{i32 786689, metadata !5, metadata !"", metadata !6, i32 16777223, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 7]
+!35 = metadata !{i32 7, i32 0, metadata !5, null}
 !21 = metadata !{i32 786688, metadata !22, metadata !"a", metadata !6, i32 8, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 8]
 !22 = metadata !{i32 786443, metadata !5, i32 7, i32 11, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
 !23 = metadata !{i32 8, i32 5, metadata !22, null}
diff --git a/test/CodeGen/X86/dbg-value-inlined-parameter.ll b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
index d248a41303..a09a7ea682 100644
--- a/test/CodeGen/X86/dbg-value-inlined-parameter.ll
+++ b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
@@ -1,14 +1,17 @@
-; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-darwin -regalloc=basic < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin %s -filetype=obj -o %t
+; RUN: llvm-dwarfdump %t | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -regalloc=basic %s -filetype=obj -o %t
+; RUN: llvm-dwarfdump %t | FileCheck %s
 
-;CHECK: DW_TAG_inlined_subroutine
+;CHECK: DW_TAG_inlined_subroutine [12]
 ;CHECK-NEXT: DW_AT_abstract_origin
 ;CHECK-NEXT: DW_AT_low_pc
 ;CHECK-NEXT: DW_AT_high_pc
 ;CHECK-NEXT: DW_AT_call_file
 ;CHECK-NEXT: DW_AT_call_line
-;CHECK-NEXT: DW_TAG_formal_parameter
-;CHECK-NEXT: Lstring11-Lsection_str ## DW_AT_name
+
+;CHECK: DW_TAG_formal_parameter [9]
+;CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000055] = "sp")
 
 %struct.S1 = type { float*, i32 }
 
diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll
index b908bcefe4..b6a263dfca 100644
--- a/test/DebugInfo/X86/elf-names.ll
+++ b/test/DebugInfo/X86/elf-names.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o %t -filetype=obj
 ; RUN: llvm-dwarfdump %t | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | FileCheck --check-prefix=CHECK-DIS %s
 
 ; CHECK: 0x0000000b: DW_TAG_compile_unit
 ; CHECK: 0x00000012:   DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000035] = "foo.cpp")
@@ -7,6 +8,9 @@
 ; CHECK: 0x0000003d:     DW_AT_name [DW_FORM_strp]       ( .debug_str[0x0000006d] = "D")
 ; CHECK: 0x00000044:     DW_TAG_member
 ; CHECK: 0x00000045:       DW_AT_name [DW_FORM_strp]     ( .debug_str[0x0000005d] = "c1")
+; CHECK: 0x0000008d:       DW_AT_artificial [DW_FORM_flag_present]       (true)
+
+; CHECK-DIS: [artificial]
 
 %class.D = type { i32, i32, i32, i32 }
 
diff --git a/test/DebugInfo/X86/empty-and-one-elem-array.ll b/test/DebugInfo/X86/empty-and-one-elem-array.ll
index b9224b1fde..0744c6bac8 100644
--- a/test/DebugInfo/X86/empty-and-one-elem-array.ll
+++ b/test/DebugInfo/X86/empty-and-one-elem-array.ll
@@ -29,33 +29,33 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 ; should.
 
 ; CHECK:      0x00000074:   DW_TAG_base_type [5]  
-; CHECK-NEXT: 0x00000075:     DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000043] = "int")
-; CHECK-NEXT: 0x00000079:     DW_AT_encoding [DW_FORM_data1]   (0x05)
-; CHECK-NEXT: 0x0000007a:     DW_AT_byte_size [DW_FORM_data1]  (0x04)
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000043] = "int")
+; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (0x05)
+; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1]  (0x04)
 
 ; int[1]:
-; CHECK:      0x0000007e:   DW_TAG_array_type [7] *
-; CHECK-NEXT: 0x0000007f:     DW_AT_type [DW_FORM_ref4]    (cu + 0x0074 => {0x00000074})
-; CHECK:      0x00000083:     DW_TAG_subrange_type [8]  
-; CHECK-NEXT: 0x00000084:       DW_AT_type [DW_FORM_ref4]  (cu + 0x007b => {0x0000007b})
-; CHECK-NEXT: 0x00000088:       DW_AT_upper_bound [DW_FORM_data1]  (0x00)
+; CHECK:      0x00000082:   DW_TAG_array_type [7] *
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]    (cu + 0x0074 => {0x00000074})
+; CHECK:      0x00000087:     DW_TAG_subrange_type [8]
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x007b => {0x0000007b})
+; CHECK-NEXT: DW_AT_upper_bound [DW_FORM_data1]  (0x00)
 
 ; int foo::b[1]:
-; CHECK:      0x000000a1:     DW_TAG_member [10]  
-; CHECK-NEXT: 0x000000a2:       DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000050] = "b")
-; CHECK-NEXT: 0x000000a6:       DW_AT_type [DW_FORM_ref4]  (cu + 0x007e => {0x0000007e})
+; CHECK:      0x000000a5:     DW_TAG_member [10]
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000050] = "b")
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x0082 => {0x00000082})
 
 ; int[0]:
-; CHECK:      0x000000b1:   DW_TAG_array_type [7] *
-; CHECK-NEXT: 0x000000b2:     DW_AT_type [DW_FORM_ref4]    (cu + 0x0074 => {0x00000074})
-; CHECK:      0x000000b6:     DW_TAG_subrange_type [11]  
-; CHECK-NEXT: 0x000000b7:       DW_AT_type [DW_FORM_ref4]  (cu + 0x007b => {0x0000007b})
+; CHECK:      0x000000b5:   DW_TAG_array_type [7] *
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]    (cu + 0x0074 => {0x00000074})
+; CHECK:      0x000000ba:     DW_TAG_subrange_type [11]
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x007b => {0x0000007b})
 ; CHECK-NOT:  DW_AT_upper_bound
 
 ; int bar::b[0]:
-; CHECK:      0x000000d3:     DW_TAG_member [10]  
-; CHECK-NEXT: 0x000000d4:       DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000050] = "b")
-; CHECK-NEXT: 0x000000d8:       DW_AT_type [DW_FORM_ref4]  (cu + 0x00b1 => {0x000000b1})
+; CHECK:      0x000000d7:     DW_TAG_member [10]
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000050] = "b")
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x00b5 => {0x000000b5})
 
 !llvm.dbg.cu = !{!0}
 
diff --git a/test/DebugInfo/X86/empty-array.ll b/test/DebugInfo/X86/empty-array.ll
index cd968478ab..dd5c6369f4 100644
--- a/test/DebugInfo/X86/empty-array.ll
+++ b/test/DebugInfo/X86/empty-array.ll
@@ -7,19 +7,20 @@
 @a = global %class.A zeroinitializer, align 4
 
 ; CHECK:      0x0000002d:   DW_TAG_base_type [3]  
-; CHECK-NEXT: 0x0000002e:     DW_AT_byte_size [DW_FORM_data1]  (0x04)
-; CHECK-NEXT: 0x0000002f:     DW_AT_encoding [DW_FORM_data1]   (0x05)
+; CHECK-NEXT: DW_AT_name
+; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1]  (0x04)
+; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (0x05)
 
-; CHECK:      0x00000030:   DW_TAG_array_type [4] *
-; CHECK-NEXT: 0x00000031:     DW_AT_type [DW_FORM_ref4]    (cu + 0x0026 => {0x00000026})
+; CHECK:      0x00000034:   DW_TAG_array_type [4] *
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]    (cu + 0x0026 => {0x00000026})
 
-; CHECK:      0x00000035:     DW_TAG_subrange_type [5]  
-; CHECK-NEXT: 0x00000036:       DW_AT_type [DW_FORM_ref4]  (cu + 0x002d => {0x0000002d})
+; CHECK:      0x00000039:     DW_TAG_subrange_type [5]
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x002d => {0x0000002d})
 ; CHECK-NOT:  DW_AT_upper_bound
 
-; CHECK:      0x00000048:     DW_TAG_member [8]  
-; CHECK-NEXT: 0x00000049:       DW_AT_name [DW_FORM_strp]  ( .debug_str[0x0000003f] = "x")
-; CHECK-NEXT: 0x0000004d:       DW_AT_type [DW_FORM_ref4]  (cu + 0x0030 => {0x00000030})
+; CHECK:      DW_TAG_member [8]
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x0000003f] = "x")
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x0034 => {0x00000034})
 
 !llvm.dbg.cu = !{!0}
 
diff --git a/test/DebugInfo/X86/fission-cu.ll b/test/DebugInfo/X86/fission-cu.ll
index fe4d5b0d52..3ada3ef383 100644
--- a/test/DebugInfo/X86/fission-cu.ll
+++ b/test/DebugInfo/X86/fission-cu.ll
@@ -19,8 +19,21 @@
 ; DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id,
 ; DW_AT_ranges_base, DW_AT_addr_base.
 
+; CHECK: .debug_info contents:
 ; CHECK: DW_TAG_compile_unit
-; CHECK: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000035] = "baz.c")
+; CHECK: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000000] = "baz.c")
 ; CHECK: DW_AT_low_pc [DW_FORM_addr]       (0x0000000000000000)
 ; CHECK: DW_AT_stmt_list [DW_FORM_data4]   (0x00000000)
-; CHECK: DW_AT_comp_dir [DW_FORM_strp]     ( .debug_str[0x0000003b] = "/usr/local/google/home/echristo/tmp")
+; CHECK: DW_AT_comp_dir [DW_FORM_strp]     ( .debug_str[0x00000006] = "/usr/local/google/home/echristo/tmp")
+
+; Check that the compile unit in .debug_info.dwo carries the rest of the information.
+; FIXME: Strings will ultimately be a different form.
+; CHECK: .debug_info.dwo contents:
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_AT_producer [DW_FORM_GNU_str_index] ( indexed (00000000) string = "clang version 3.3 (trunk 169021) (llvm/trunk 169020)")
+; CHECK: DW_AT_language [DW_FORM_data2]        (0x000c)
+; CHECK: DW_AT_name [DW_FORM_GNU_str_index]    ( indexed (00000001) string = "baz.c")
+; CHECK: DW_TAG_base_type
+; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000004) string = "int")
+; CHECK: DW_TAG_variable
+; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000003) string = "a")
diff --git a/test/DebugInfo/X86/lit.local.cfg b/test/DebugInfo/X86/lit.local.cfg
index 0d694da8df..60d66eae49 100644
--- a/test/DebugInfo/X86/lit.local.cfg
+++ b/test/DebugInfo/X86/lit.local.cfg
@@ -1,4 +1,4 @@
-config.suffixes = ['.ll']
+config.suffixes = ['.ll', '.s']
 
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
diff --git a/test/DebugInfo/X86/main-file-name.s b/test/DebugInfo/X86/main-file-name.s
new file mode 100644
index 0000000000..6817c9e3a7
--- /dev/null
+++ b/test/DebugInfo/X86/main-file-name.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -main-file-name foo.S -g -o %t %s
+// RUN: llvm-dwarfdump %t | FileCheck %s
+
+// CHECK: DW_TAG_compile_unit [1]
+// CHECK-NOT: DW_TAG_
+// CHECK: DW_AT_name [DW_FORM_string]       ("foo.S")
+        
+
+# 1 "foo.S"
+# 1 "<built-in>" 1
+# 1 "foo.S" 2
+
+foo:  // only symbol in the file; three nops so the assembler has code to emit (presumably all the -g run needs - confirm)
+  nop
+  nop
+  nop
+        
diff --git a/test/DebugInfo/X86/nondefault-subrange-array.ll b/test/DebugInfo/X86/nondefault-subrange-array.ll
index 5845f3e0b8..6247cc3c81 100644
--- a/test/DebugInfo/X86/nondefault-subrange-array.ll
+++ b/test/DebugInfo/X86/nondefault-subrange-array.ll
@@ -8,21 +8,22 @@
 ; Check that we can handle non-default array bounds. In this case, the array
 ; goes from [-3, 38].
 
-; CHECK:      0x0000002d:   DW_TAG_base_type [3]  
-; CHECK-NEXT: 0x0000002e:     DW_AT_byte_size [DW_FORM_data1]  (0x04)
-; CHECK-NEXT: 0x0000002f:     DW_AT_encoding [DW_FORM_data1]   (0x05)
-
-; CHECK:      0x00000030:   DW_TAG_array_type [4] *
-; CHECK-NEXT: 0x00000031:     DW_AT_type [DW_FORM_ref4]    (cu + 0x0026 => {0x00000026})
-
-; CHECK:      0x00000035:     DW_TAG_subrange_type [5]
-; CHECK-NEXT: 0x00000036:       DW_AT_type [DW_FORM_ref4]  (cu + 0x002d => {0x0000002d})
-; CHECK-NEXT: 0x0000003a:       DW_AT_lower_bound [DW_FORM_data8]	(0xfffffffffffffffd)
-; CHECK-NEXT: 0x00000042:       DW_AT_upper_bound [DW_FORM_data1]	(0x26)
-
-; CHECK:      0x00000051:     DW_TAG_member [8]  
-; CHECK-NEXT: 0x00000052:       DW_AT_name [DW_FORM_strp]	( .debug_str[0x0000003f] = "x")
-; CHECK-NEXT: 0x00000056:       DW_AT_type [DW_FORM_ref4]	(cu + 0x0030 => {0x00000030})
+; CHECK:      0x0000002d:   DW_TAG_base_type [3]
+; CHECK-NEXT: 0x0000002e:     DW_AT_name [DW_FORM_strp]       ( .debug_str[0x00000041] = "int")
+; CHECK-NEXT: 0x00000032:     DW_AT_byte_size [DW_FORM_data1] (0x04)
+; CHECK-NEXT: 0x00000033:     DW_AT_encoding [DW_FORM_data1]  (0x05)
+
+; CHECK:      0x00000034:   DW_TAG_array_type [4] *
+; CHECK-NEXT: 0x00000035:     DW_AT_type [DW_FORM_ref4]    (cu + 0x0026 => {0x00000026})
+
+; CHECK:      0x00000039:     DW_TAG_subrange_type [5]
+; CHECK-NEXT: 0x0000003a:       DW_AT_type [DW_FORM_ref4]  (cu + 0x002d => {0x0000002d})
+; CHECK-NEXT: 0x0000003e:       DW_AT_lower_bound [DW_FORM_data8]       (0xfffffffffffffffd)
+; CHECK-NEXT: 0x00000046:       DW_AT_upper_bound [DW_FORM_data1]       (0x26)
+
+; CHECK:      0x00000055:     DW_TAG_member [8]
+; CHECK-NEXT: 0x00000056:       DW_AT_name [DW_FORM_strp]       ( .debug_str[0x0000003f] = "x")
+; CHECK-NEXT: 0x0000005a:       DW_AT_type [DW_FORM_ref4]       (cu + 0x0034 => {0x00000034})
 
 !llvm.dbg.cu = !{!0}
 
diff --git a/test/DebugInfo/X86/stringpool.ll b/test/DebugInfo/X86/stringpool.ll
index caf12c2756..21b0d09a86 100644
--- a/test/DebugInfo/X86/stringpool.ll
+++ b/test/DebugInfo/X86/stringpool.ll
@@ -15,7 +15,7 @@
 !7 = metadata !{i32 720932, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 
 ; Verify that we refer to 'yyyy' with a relocation.
-; LINUX:      .long   .Lstring3               # DW_AT_name
+; LINUX:      .long   .Linfo_string3          # DW_AT_name
 ; LINUX-NEXT: .long   38                      # DW_AT_type
 ; LINUX-NEXT:                                 # DW_AT_external
 ; LINUX-NEXT: .byte   1                       # DW_AT_decl_file
@@ -25,7 +25,7 @@
 ; LINUX-NEXT: .quad   yyyy
 
 ; Verify that we refer to 'yyyy' without a relocation.
-; DARWIN: Lset5 = Lstring3-Lsection_str               ## DW_AT_name
+; DARWIN: Lset5 = Linfo_string3-Linfo_string          ## DW_AT_name
 ; DARWIN-NEXT:        .long   Lset5
 ; DARWIN-NEXT:        .long   39                      ## DW_AT_type
 ; DARWIN-NEXT:        .byte   1                       ## DW_AT_external
diff --git a/test/DebugInfo/X86/subrange-type.ll b/test/DebugInfo/X86/subrange-type.ll
new file mode 100644
index 0000000000..15202fb74d
--- /dev/null
+++ b/test/DebugInfo/X86/subrange-type.ll
@@ -0,0 +1,39 @@
+; RUN: llc -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; Make sure that the base type from the subrange type has a name.
+; CHECK: 0x0000006b:   DW_TAG_base_type [6]
+; CHECK-NEXT: DW_AT_name
+; CHECK: DW_TAG_subrange_type [8]
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]     (cu + 0x006b => {0x0000006b})
+
+define i32 @main() nounwind uwtable { ; minimal body: one local [2 x i32] array with debug info, then return 0
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca [2 x i32], align 4 ; the local array described by variable "i" (!10)
+  store i32 0, i32* %retval
+  call void @llvm.dbg.declare(metadata !{[2 x i32]* %i}, metadata !10), !dbg !15 ; ties %i to !10, whose array type (!12) carries the subrange entry (!14)
+  ret i32 0, !dbg !16
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp", metadata !"clang version 3.3 (trunk 171472) (llvm/trunk 171487)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [main]
+!6 = metadata !{i32 786473, metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 786688, metadata !11, metadata !"i", metadata !6, i32 4, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 4]
+!11 = metadata !{i32 786443, metadata !5, i32 3, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.c]
+!12 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 64, i64 32, i32 0, i32 0, metadata !9, metadata !13, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from int]
+!13 = metadata !{metadata !14}
+!14 = metadata !{i32 786465, i64 0, i64 2}        ; [ DW_TAG_subrange_type ] [0, 1]
+!15 = metadata !{i32 4, i32 0, metadata !11, null}
+!16 = metadata !{i32 6, i32 0, metadata !11, null}
diff --git a/test/DebugInfo/X86/vector.ll b/test/DebugInfo/X86/vector.ll
new file mode 100644
index 0000000000..7b61e76f18
--- /dev/null
+++ b/test/DebugInfo/X86/vector.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=x86_64-linux-gnu -O0 -filetype=obj -o %t %s
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; Generated from:
+; clang -g -S -emit-llvm -o foo.ll foo.c
+; typedef int v4si __attribute__((__vector_size__(16)));
+;
+; v4si a
+
+@a = common global <4 x i32> zeroinitializer, align 16
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"foo.c", metadata !"/Users/echristo", metadata !"clang version 3.3 (trunk 171825) (llvm/trunk 171822)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/Users/echristo/foo.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 3, metadata !7, i32 0, i32 1, <4 x i32>* @a} ; [ DW_TAG_variable ] [a] [line 3] [def]
+!6 = metadata !{i32 786473, metadata !"foo.c", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786454, null, metadata !"v4si", metadata !6, i32 1, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] [v4si] [line 1, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 128, i32 0, i32 2048, metadata !9, metadata !10, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !11}
+!11 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
+
+; Check that we get an array type with a vector attribute.
+; CHECK: DW_TAG_array_type
+; CHECK-NEXT: DW_AT_GNU_vector
diff --git a/test/DebugInfo/member-pointers.ll b/test/DebugInfo/member-pointers.ll
new file mode 100644
index 0000000000..47874d9427
--- /dev/null
+++ b/test/DebugInfo/member-pointers.ll
@@ -0,0 +1,30 @@
+; RUN: llc -filetype=obj -O0 < %s > %t
+; RUN: llvm-dwarfdump %t | FileCheck %s
+; CHECK: DW_TAG_ptr_to_member_type
+; CHECK: DW_TAG_ptr_to_member_type
+; IR generated from clang -g with the following source:
+; struct S {
+; };
+;
+; int S::*x = 0;
+; void (S::*y)(int) = 0;
+
+@x = global i64 -1, align 8
+@y = global { i64, i64 } zeroinitializer, align 8
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"simple.cpp", metadata !"/home/blaikie/Development/scratch", metadata !"clang version 3.3 ", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/home/blaikie/Development/scratch/simple.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !10}
+!5 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i64* @x} ; [ DW_TAG_variable ] [x] [line 2] [def]
+!6 = metadata !{i32 786473, metadata !"simple.cpp", metadata !"/home/blaikie/Development/scratch", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
+!8 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786451, null, metadata !"S", metadata !6, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [from ]
+!10 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !6, i32 3, metadata !11, i32 0, i32 1, { i64, i64 }* @y} ; [ DW_TAG_variable ] [y] [line 3] [def]
+!11 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !12, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{null, metadata !8}
diff --git a/test/Feature/const_pv.ll b/test/Feature/const_pv.ll
index 6fd6abdccf..272bf43a06 100644
--- a/test/Feature/const_pv.ll
+++ b/test/Feature/const_pv.ll
@@ -4,5 +4,5 @@
 @G1 = global i8 zeroinitializer
 @g = constant <2 x i8*> getelementptr (<2 x i8*> <i8* @G1, i8* @G1>, <2 x i32> <i32 0, i32 0>)
 
-@t = constant <2 x i1> icmp ((<2 x i32> ptrtoint (<2 x i8*> zeroinitializer to <2 x i32>), <2 x i32> zeroinitializer )
+@t = constant <2 x i1> icmp eq (<2 x i32> ptrtoint (<2 x i8*> zeroinitializer to <2 x i32>), <2 x i32> zeroinitializer )
 
diff --git a/test/Feature/global_pv.ll b/test/Feature/global_pv.ll
index d257ec077a..34b9a7df88 100644
--- a/test/Feature/global_pv.ll
+++ b/test/Feature/global_pv.ll
@@ -1,5 +1,5 @@
-; RUN: opt -instcombine -S -o - %s | llvm-as
-; RUN: opt -instcombine -globalopt -S -o - %s | llvm-as
+; RUN: opt -instcombine -S < %s | llvm-as
+; RUN: opt -instcombine -globalopt -S < %s | llvm-as
 @G1 = global i32 zeroinitializer
 @G2 = global i32 zeroinitializer
 @g = global <2 x i32*> zeroinitializer
diff --git a/test/Instrumentation/AddressSanitizer/debug_info.ll b/test/Instrumentation/AddressSanitizer/debug_info.ll
new file mode 100644
index 0000000000..f686ac1c52
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/debug_info.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -asan -asan-module -S | FileCheck %s
+
+; Checks that llvm.dbg.declare instructions are updated 
+; accordingly as we merge allocas.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @_Z3zzzi(i32 %p) nounwind uwtable address_safety {
+entry:
+  %p.addr = alloca i32, align 4
+  %r = alloca i32, align 4
+  store i32 %p, i32* %p.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %p.addr}, metadata !10), !dbg !11
+  call void @llvm.dbg.declare(metadata !{i32* %r}, metadata !12), !dbg !14
+  %0 = load i32* %p.addr, align 4, !dbg !14
+  %add = add nsw i32 %0, 1, !dbg !14
+  store i32 %add, i32* %r, align 4, !dbg !14
+  %1 = load i32* %r, align 4, !dbg !15
+  ret i32 %1, !dbg !15
+}
+
+;   CHECK: define i32 @_Z3zzzi
+;   CHECK: entry:
+; Verify that llvm.dbg.declare calls are in the entry basic block.
+;   CHECK-NOT: %entry
+;   CHECK: call void @llvm.dbg.declare(metadata {{.*}}, metadata ![[ARG_ID:[0-9]+]])
+;   CHECK-NOT: %entry
+;   CHECK: call void @llvm.dbg.declare(metadata {{.*}}, metadata ![[VAR_ID:[0-9]+]])
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"a.cc", metadata !"/usr/local/google/llvm_cmake_clang/tmp/debuginfo", metadata !"clang version 3.3 (trunk 169314)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"zzz", metadata !"zzz", metadata !"_Z3zzzi", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3zzzi, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [zzz]
+!6 = metadata !{i32 786473, metadata !"a.cc", metadata !"/usr/local/google/llvm_cmake_clang/tmp/debuginfo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9, metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 786689, metadata !5, metadata !"p", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!11 = metadata !{i32 1, i32 0, metadata !5, null}
+!12 = metadata !{i32 786688, metadata !13, metadata !"r", metadata !6, i32 2, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [r] [line 2]
+
+; Verify that debug descriptors for argument and local variable will be replaced
+; with descriptors that end with OpDeref (encoded as 2).
+;   CHECK: ![[ARG_ID]] = metadata {{.*}} i64 2} ; [ DW_TAG_arg_variable ] [p] [line 1]
+;   CHECK: ![[VAR_ID]] = metadata {{.*}} i64 2} ; [ DW_TAG_auto_variable ] [r] [line 2]
+; Verify that there are no more variable descriptors.
+;   CHECK-NOT: DW_TAG_arg_variable
+;   CHECK-NOT: DW_TAG_auto_variable
+
+
+!13 = metadata !{i32 786443, metadata !5, i32 1, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc]
+!14 = metadata !{i32 2, i32 0, metadata !13, null}
+!15 = metadata !{i32 3, i32 0, metadata !13, null}
diff --git a/test/Instrumentation/AddressSanitizer/lifetime.ll b/test/Instrumentation/AddressSanitizer/lifetime.ll
index 55cd475f1f..982ad085ec 100644
--- a/test/Instrumentation/AddressSanitizer/lifetime.ll
+++ b/test/Instrumentation/AddressSanitizer/lifetime.ll
@@ -31,7 +31,7 @@ define void @lifetime() address_safety {
   %i.ptr = bitcast i32* %i to i8*
   call void @llvm.lifetime.start(i64 3, i8* %i.ptr)
   ; Memory is unpoisoned at llvm.lifetime.start
-  ; CHECK: %[[VAR:[^ ]*]] = ptrtoint i8* %i.ptr to i64
+  ; CHECK: %[[VAR:[^ ]*]] = ptrtoint i32* %{{[^ ]+}} to i64
   ; CHECK-NEXT: call void @__asan_unpoison_stack_memory(i64 %[[VAR]], i64 3)
   call void @llvm.lifetime.end(i64 4, i8* %i.ptr)
   call void @llvm.lifetime.end(i64 2, i8* %i.ptr)
@@ -59,3 +59,26 @@ define void @lifetime() address_safety {
   ; CHECK: ret void
   ret void
 }
+
+; Check that arguments of lifetime may come from phi nodes.
+define void @phi_args(i1 %x) address_safety {
+  ; CHECK: @phi_args
+
+entry:
+  %i = alloca i64, align 4
+  %i.ptr = bitcast i64* %i to i8*
+  call void @llvm.lifetime.start(i64 8, i8* %i.ptr)
+  ; CHECK: __asan_unpoison_stack_memory
+  br i1 %x, label %bb0, label %bb1
+
+bb0:
+  %i.ptr2 = bitcast i64* %i to i8*
+  br label %bb1
+
+bb1:
+  %i.phi = phi i8* [ %i.ptr, %entry ], [ %i.ptr2, %bb0 ]
+  call void @llvm.lifetime.end(i64 8, i8* %i.phi)
+  ; CHECK: __asan_poison_stack_memory
+  ; CHECK: ret void
+  ret void
+}
diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll
index 3228863193..cd90f8836a 100644
--- a/test/Instrumentation/MemorySanitizer/msan_basic.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -msan -S | FileCheck %s
-; RUN: opt < %s -msan -msan-track-origins=1 -S | FileCheck -check-prefix=CHECK-ORIGINS %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck -check-prefix=CHECK-ORIGINS %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Check the presence of __msan_init
@@ -8,7 +8,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; Check the presence and the linkage type of __msan_track_origins
 ; CHECK: @__msan_track_origins = weak_odr constant i32 0
 
+
 ; Check instrumentation of stores
+
 define void @Store(i32* nocapture %p, i32 %x) nounwind uwtable {
 entry:
   store i32 %x, i32* %p, align 4
@@ -33,6 +35,34 @@ entry:
 ; CHECK-ORIGINS: ret void
 
 
+; Check instrumentation of aligned stores
+; Shadow store has the same alignment as the original store; origin store
+; does not specify explicit alignment.
+
+define void @AlignedStore(i32* nocapture %p, i32 %x) nounwind uwtable {
+entry:
+  store i32 %x, i32* %p, align 32
+  ret void
+}
+
+; CHECK: @AlignedStore
+; CHECK: load {{.*}} @__msan_param_tls
+; CHECK: store {{.*}} align 32
+; CHECK: store {{.*}} align 32
+; CHECK: ret void
+; CHECK-ORIGINS: @AlignedStore
+; CHECK-ORIGINS: load {{.*}} @__msan_param_tls
+; CHECK-ORIGINS: store {{.*}} align 32
+; CHECK-ORIGINS: icmp
+; CHECK-ORIGINS: br i1
+; CHECK-ORIGINS: <label>
+; CHECK-ORIGINS: store {{.*}} align 32
+; CHECK-ORIGINS: br label
+; CHECK-ORIGINS: <label>
+; CHECK-ORIGINS: store {{.*}} align 32
+; CHECK-ORIGINS: ret void
+
+
 ; load followed by cmp: check that we load the shadow and call __msan_warning.
 define void @LoadAndCmp(i32* nocapture %a) nounwind uwtable {
 entry:
@@ -221,6 +251,23 @@ entry:
 ; CHECK: ret i32
 
 
+; Check that we propagate origin for "select" with vector condition.
+; Select condition is flattened to i1, which is then used to select one of the
+; argument origins.
+
+define <8 x i16> @SelectVector(<8 x i16> %a, <8 x i16> %b, <8 x i1> %c) nounwind uwtable readnone {
+entry:
+  %cond = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %cond
+}
+
+; CHECK-ORIGINS: @SelectVector
+; CHECK-ORIGINS: bitcast <8 x i1> {{.*}} to i8
+; CHECK-ORIGINS: icmp ne i8
+; CHECK-ORIGINS: select i1
+; CHECK-ORIGINS: ret <8 x i16>
+
+
 define i8* @IntToPtr(i64 %x) nounwind uwtable readnone {
 entry:
   %0 = inttoptr i64 %x to i8*
@@ -315,7 +362,8 @@ define zeroext i1 @ICmpSLE(i32 %x) nounwind uwtable readnone {
 ; CHECK: ret i1
 
 
-; Check that loads from shadow have the same aligment as the original loads.
+; Check that loads of shadow have the same alignment as the original loads.
+; Check that loads of origin have the alignment of max(4, original alignment).
 
 define i32 @ShadowLoadAlignmentLarge() nounwind uwtable {
   %y = alloca i32, align 64
@@ -339,6 +387,12 @@ define i32 @ShadowLoadAlignmentSmall() nounwind uwtable {
 ; CHECK: load volatile i32* {{.*}} align 2
 ; CHECK: ret i32
 
+; CHECK-ORIGINS: @ShadowLoadAlignmentSmall
+; CHECK-ORIGINS: load i32* {{.*}} align 2
+; CHECK-ORIGINS: load i32* {{.*}} align 4
+; CHECK-ORIGINS: load volatile i32* {{.*}} align 2
+; CHECK-ORIGINS: ret i32
+
 
 ; Test vector manipulation instructions.
 ; Check that the same bit manipulation is applied to the shadow values.
@@ -378,6 +432,7 @@ define <4 x i32> @ShuffleVector(<4 x i32> %vec, <4 x i32> %vec1) {
 ; CHECK: shufflevector
 ; CHECK: ret <4 x i32>
 
+
 ; Test bswap intrinsic instrumentation
 define i32 @BSwap(i32 %x) nounwind uwtable readnone {
   %y = tail call i32 @llvm.bswap.i32(i32 %x)
@@ -393,3 +448,102 @@ declare i32 @llvm.bswap.i32(i32) nounwind readnone
 ; CHECK: @llvm.bswap.i32
 ; CHECK-NOT: call void @__msan_warning
 ; CHECK: ret i32
+
+
+; Store intrinsic.
+
+define void @StoreIntrinsic(i8* %p, <4 x float> %x) nounwind uwtable {
+  call void @llvm.x86.sse.storeu.ps(i8* %p, <4 x float> %x)
+  ret void
+}
+
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+; CHECK: @StoreIntrinsic
+; CHECK-NOT: br
+; CHECK-NOT: = or
+; CHECK: store <4 x i32> {{.*}} align 1
+; CHECK: call void @llvm.x86.sse.storeu.ps
+; CHECK: ret void
+
+
+; Load intrinsic.
+
+define <16 x i8> @LoadIntrinsic(i8* %p) nounwind uwtable {
+  %call = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %p)
+  ret <16 x i8> %call
+}
+
+declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %p) nounwind
+
+; CHECK: @LoadIntrinsic
+; CHECK: load <16 x i8>* {{.*}} align 1
+; CHECK-NOT: br
+; CHECK-NOT: = or
+; CHECK: call <16 x i8> @llvm.x86.sse3.ldu.dq
+; CHECK: store <16 x i8> {{.*}} @__msan_retval_tls
+; CHECK: ret <16 x i8>
+
+; CHECK-ORIGINS: @LoadIntrinsic
+; CHECK-ORIGINS: [[ORIGIN:%[01-9a-z]+]] = load i32* {{.*}}
+; CHECK-ORIGINS: call <16 x i8> @llvm.x86.sse3.ldu.dq
+; CHECK-ORIGINS: store i32 {{.*}}[[ORIGIN]], i32* @__msan_retval_origin_tls
+; CHECK-ORIGINS: ret <16 x i8>
+
+
+; Simple NoMem intrinsic
+; Check that shadow is OR'ed, and origin is Select'ed
+; And no shadow checks!
+
+define <8 x i16> @Paddsw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable {
+  %call = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %call
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) nounwind
+
+; CHECK: @Paddsw128
+; CHECK-NEXT: load <8 x i16>* {{.*}} @__msan_param_tls
+; CHECK-NEXT: load <8 x i16>* {{.*}} @__msan_param_tls
+; CHECK-NEXT: = or <8 x i16>
+; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.padds.w
+; CHECK-NEXT: store <8 x i16> {{.*}} @__msan_retval_tls
+; CHECK-NEXT: ret <8 x i16>
+
+; CHECK-ORIGINS: @Paddsw128
+; CHECK-ORIGINS: load i32* {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS: load i32* {{.*}} @__msan_param_origin_tls
+; CHECK-ORIGINS: = bitcast <8 x i16> {{.*}} to i128
+; CHECK-ORIGINS-NEXT: = icmp ne i128 {{.*}}, 0
+; CHECK-ORIGINS-NEXT: = select i1 {{.*}}, i32 {{.*}}, i32
+; CHECK-ORIGINS: call <8 x i16> @llvm.x86.sse2.padds.w
+; CHECK-ORIGINS: store i32 {{.*}} @__msan_retval_origin_tls
+; CHECK-ORIGINS: ret <8 x i16>
+
+
+; Test handling of vectors of pointers.
+; Check that shadow of such vector is a vector of integers.
+
+define <8 x i8*> @VectorOfPointers(<8 x i8*>* %p) nounwind uwtable {
+  %x = load <8 x i8*>* %p
+  ret <8 x i8*> %x
+}
+
+; CHECK: @VectorOfPointers
+; CHECK: load <8 x i64>*
+; CHECK: load <8 x i8*>*
+; CHECK: store <8 x i64> {{.*}} @__msan_retval_tls
+; CHECK: ret <8 x i8*>
+
+; Test handling of va_copy.
+
+declare void @llvm.va_copy(i8*, i8*) nounwind
+
+define void @VACopy(i8* %p1, i8* %p2) nounwind uwtable {
+  call void @llvm.va_copy(i8* %p1, i8* %p2) nounwind
+  ret void
+}
+
+; CHECK: @VACopy
+; CHECK: call void @llvm.memset.p0i8.i64({{.*}}, i8 0, i64 24, i32 8, i1 false)
+; CHECK: ret void
diff --git a/test/Instrumentation/MemorySanitizer/unreachable.ll b/test/Instrumentation/MemorySanitizer/unreachable.ll
new file mode 100644
index 0000000000..66a9575d3f
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/unreachable.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -msan -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+
+; Test that MemorySanitizer correctly handles unreachable blocks.
+
+define i32 @Func(i32* %p) nounwind uwtable {
+entry:
+  br label %exit
+
+unreachable:
+  %x = load i32* %p
+  br label %exit
+
+exit:
+  %z = phi i32 [ 42, %entry ], [ %x, %unreachable ]
+  ret i32 %z
+}
+
+; CHECK: @Func
+; CHECK: store i32 0, {{.*}} @__msan_retval_tls
+; CHECK: ret i32 42
diff --git a/test/Linker/testlink1.ll b/test/Linker/testlink1.ll
index a8746379b6..6ba6fd5fd7 100644
--- a/test/Linker/testlink1.ll
+++ b/test/Linker/testlink1.ll
@@ -13,6 +13,10 @@
 ; The uses of intlist in the other file should be remapped.
 ; CHECK-NOT: {{%intlist.[0-9]}}
 
+; CHECK: %VecSize = type { <5 x i32> }
+; CHECK: %VecSize.{{[0-9]}} = type { <10 x i32> }
+%VecSize = type { <5 x i32> }
+
 %Struct1 = type opaque
 @S1GV = external global %Struct1*
 
@@ -93,3 +97,5 @@ define internal void @Testintern() {
 define void @testIntern() {
   ret void
 }
+
+declare void @VecSizeCrash(%VecSize)
diff --git a/test/Linker/testlink2.ll b/test/Linker/testlink2.ll
index 1798e31e47..ff8e529986 100644
--- a/test/Linker/testlink2.ll
+++ b/test/Linker/testlink2.ll
@@ -8,6 +8,8 @@
 %Ty1 = type { %Ty2* }
 %Ty2 = type opaque
 
+%VecSize = type { <10 x i32> }
+
 @GVTy1 = global %Ty1* null
 @GVTy2 = external global %Ty2*
 
@@ -53,3 +55,4 @@ define internal void @testIntern() {
   ret void
 }
 
+declare void @VecSizeCrash1(%VecSize)
diff --git a/test/MC/ARM/AlignedBundling/group-bundle-arm.s b/test/MC/ARM/AlignedBundling/group-bundle-arm.s
new file mode 100644
index 0000000000..823d9e0cb8
--- /dev/null
+++ b/test/MC/ARM/AlignedBundling/group-bundle-arm.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc -filetype=obj -triple armv7-linux-gnueabi %s -o - \
+# RUN:   | llvm-objdump -no-show-raw-insn -triple armv7 -disassemble - | FileCheck %s
+
+# On ARM each instruction is 4 bytes long so padding for individual
+# instructions should not be inserted. However, for bundle-locked groups
+# it can be.
+
+	.syntax unified
+	.text
+  .bundle_align_mode 4
+
+  bx lr
+  and r1, r1, r2
+  and r1, r1, r2
+  .bundle_lock
+  bx r9
+  bx r8
+  .bundle_unlock
+# CHECK:      c:  nop
+# CHECK-NEXT: 10: bx
+# CHECK-NEXT: 14: bx
+
+  # pow2 here
+  .align 4 
+  bx lr
+  .bundle_lock
+  bx r9
+  bx r9
+  bx r9
+  bx r8
+  .bundle_unlock
+# CHECK:      20: bx
+# CHECK-NEXT: 24: nop
+# CHECK-NEXT: 28: nop
+# CHECK-NEXT: 2c: nop
+# CHECK-NEXT: 30: bx
+
diff --git a/test/MC/ARM/AlignedBundling/lit.local.cfg b/test/MC/ARM/AlignedBundling/lit.local.cfg
new file mode 100644
index 0000000000..6c49f08b74
--- /dev/null
+++ b/test/MC/ARM/AlignedBundling/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.s']
+
+targets = set(config.root.targets_to_build.split())
+if 'ARM' not in targets:  # tests here run llvm-mc with an armv7 triple, so they need the ARM backend
+    config.unsupported = True
+
diff --git a/test/MC/ARM/AlignedBundling/pad-align-to-bundle-end.s b/test/MC/ARM/AlignedBundling/pad-align-to-bundle-end.s
new file mode 100644
index 0000000000..341358b9db
--- /dev/null
+++ b/test/MC/ARM/AlignedBundling/pad-align-to-bundle-end.s
@@ -0,0 +1,41 @@
+# RUN: llvm-mc -filetype=obj -triple armv7-linux-gnueabi %s -o - \
+# RUN:   | llvm-objdump -no-show-raw-insn -triple armv7 -disassemble - | FileCheck %s
+
+	.syntax unified
+	.text
+  .bundle_align_mode 4
+
+  bx lr
+  and r1, r1, r2
+  and r1, r1, r2
+  .bundle_lock align_to_end
+  bx r9
+  .bundle_unlock
+# No padding required here because bx just happens to be at the
+# right offset.
+# CHECK:      8:  and
+# CHECK-NEXT: c:  bx
+
+  bx lr
+  and r1, r1, r2
+  .bundle_lock align_to_end
+  bx r9
+  .bundle_unlock
+# A 4-byte padding is needed here
+# CHECK:      18: nop
+# CHECK-NEXT: 1c: bx
+
+  bx lr
+  and r1, r1, r2
+  .bundle_lock align_to_end
+  bx r9
+  bx r9
+  bx r9
+  .bundle_unlock
+# A 12-byte padding is needed here to push the group to the end of the next
+# bundle
+# CHECK:      28: nop
+# CHECK-NEXT: 2c: nop
+# CHECK-NEXT: 30: nop
+# CHECK-NEXT: 34: bx
+
diff --git a/test/MC/ARM/basic-thumb2-instructions.s b/test/MC/ARM/basic-thumb2-instructions.s
index 23d9f5977a..d495c91c0e 100644
--- a/test/MC/ARM/basic-thumb2-instructions.s
+++ b/test/MC/ARM/basic-thumb2-instructions.s
@@ -3509,3 +3509,7 @@ _func:
 @ CHECK: ldrh.w	r11, [pc, #-22]         @ encoding: [0x3f,0xf8,0x16,0xb0]
 @ CHECK: ldrsb.w r11, [pc, #-22]        @ encoding: [0x1f,0xf9,0x16,0xb0]
 @ CHECK: ldrsh.w r11, [pc, #-22]        @ encoding: [0x3f,0xf9,0x16,0xb0]
+
+@ rdar://12596361
+        ldr r1, [pc, #12]
+@ CHECK: ldr.n r1, [pc, #12]        @ encoding: [0x03,0x49]
diff --git a/test/MC/ARM/data-in-code.ll b/test/MC/ARM/data-in-code.ll
new file mode 100644
index 0000000000..c2feec5303
--- /dev/null
+++ b/test/MC/ARM/data-in-code.ll
@@ -0,0 +1,176 @@
+;; RUN: llc -O0 -mtriple=armv7-linux-gnueabi -filetype=obj %s -o - | \
+;; RUN:   elf-dump | FileCheck -check-prefix=ARM %s
+
+;; RUN: llc -O0 -mtriple=thumbv7-linux-gnueabi -filetype=obj %s -o - | \
+;; RUN:   elf-dump --dump-section-data | FileCheck -check-prefix=TMB %s
+
+;; Ensure that if a jump table is generated that it has Mapping Symbols
+;; marking the data-in-code region.
+
+define void @foo(i32* %ptr) nounwind ssp {
+  %tmp = load i32* %ptr, align 4
+  switch i32 %tmp, label %default [
+    i32 11, label %bb0
+    i32 10, label %bb1
+    i32 8, label %bb2
+    i32 4, label %bb3
+    i32 2, label %bb4
+    i32 6, label %bb5
+    i32 9, label %bb6
+    i32 15, label %bb7
+    i32 1, label %bb8
+    i32 3, label %bb9
+    i32 5, label %bb10
+    i32 30, label %bb11
+    i32 31, label %bb12
+    i32 13, label %bb13
+    i32 14, label %bb14
+    i32 20, label %bb15
+    i32 19, label %bb16
+    i32 17, label %bb17
+    i32 18, label %bb18
+    i32 21, label %bb19
+    i32 22, label %bb20
+    i32 16, label %bb21
+    i32 24, label %bb22
+    i32 25, label %bb23
+    i32 26, label %bb24
+    i32 27, label %bb25
+    i32 28, label %bb26
+    i32 23, label %bb27
+    i32 12, label %bb28
+  ]
+
+default:
+  br label %exit
+bb0:
+  br label %exit
+bb1:
+  br label %exit
+bb2:
+  br label %exit
+bb3:
+  br label %exit
+bb4:
+  br label %exit
+bb5:
+  br label %exit
+bb6:
+  br label %exit
+bb7:
+  br label %exit
+bb8:
+  br label %exit
+bb9:
+  br label %exit
+bb10:
+  br label %exit
+bb11:
+  br label %exit
+bb12:
+  br label %exit
+bb13:
+  br label %exit
+bb14:
+  br label %exit
+bb15:
+  br label %exit
+bb16:
+  br label %exit
+bb17:
+  br label %exit
+bb18:
+  br label %exit
+bb19:
+  br label %exit
+bb20:
+  br label %exit
+bb21:
+  br label %exit
+bb22:
+  br label %exit
+bb23:
+  br label %exit
+bb24:
+  br label %exit
+bb25:
+  br label %exit
+bb26:
+  br label %exit
+bb27:
+  br label %exit
+bb28:
+  br label %exit
+
+
+exit:
+
+  ret void
+}
+
+;; ARM:         # Symbol 2
+;; ARM-NEXT:    $a
+;; ARM-NEXT:   'st_value', 0x00000000
+;; ARM-NEXT:   'st_size', 0x00000000
+;; ARM-NEXT:   'st_bind', 0x0
+;; ARM-NEXT:   'st_type', 0x0
+;; ARM-NEXT:   'st_other'
+;; ARM-NEXT:   'st_shndx', [[MIXED_SECT:0x[0-9a-f]+]]
+
+;; ARM:         # Symbol 3
+;; ARM-NEXT:    $a
+;; ARM-NEXT:   'st_value', 0x000000ac
+;; ARM-NEXT:   'st_size', 0x00000000
+;; ARM-NEXT:   'st_bind', 0x0
+;; ARM-NEXT:   'st_type', 0x0
+;; ARM-NEXT:   'st_other'
+;; ARM-NEXT:   'st_shndx', [[MIXED_SECT]]
+
+;; ARM:         # Symbol 4
+;; ARM-NEXT:    $d
+;; ARM-NEXT:    'st_value', 0x00000000
+;; ARM-NEXT:    'st_size', 0x00000000
+;; ARM-NEXT:    'st_bind', 0x0
+;; ARM-NEXT:    'st_type', 0x0
+
+;; ARM:         # Symbol 5
+;; ARM-NEXT:    $d
+;; ARM-NEXT:   'st_value', 0x00000030
+;; ARM-NEXT:   'st_size', 0x00000000
+;; ARM-NEXT:   'st_bind', 0x0
+;; ARM-NEXT:   'st_type', 0x0
+;; ARM-NEXT:   'st_other'
+;; ARM-NEXT:   'st_shndx', [[MIXED_SECT]]
+
+;; ARM-NOT:     ${{[atd]}}
+
+;; TMB:         # Symbol 3
+;; TMB-NEXT:    $d
+;; TMB-NEXT:   'st_value', 0x00000016
+;; TMB-NEXT:   'st_size', 0x00000000
+;; TMB-NEXT:   'st_bind', 0x0
+;; TMB-NEXT:   'st_type', 0x0
+;; TMB-NEXT:   'st_other'
+;; TMB-NEXT:   'st_shndx', [[MIXED_SECT:0x[0-9a-f]+]]
+
+;; TMB:         # Symbol 4
+;; TMB-NEXT:    $t
+;; TMB-NEXT:   'st_value', 0x00000000
+;; TMB-NEXT:   'st_size', 0x00000000
+;; TMB-NEXT:   'st_bind', 0x0
+;; TMB-NEXT:   'st_type', 0x0
+;; TMB-NEXT:   'st_other'
+;; TMB-NEXT:   'st_shndx', [[MIXED_SECT]]
+
+;; TMB:         # Symbol 5
+;; TMB-NEXT:    $t
+;; TMB-NEXT:   'st_value', 0x00000036
+;; TMB-NEXT:   'st_size', 0x00000000
+;; TMB-NEXT:   'st_bind', 0x0
+;; TMB-NEXT:   'st_type', 0x0
+;; TMB-NEXT:   'st_other'
+;; TMB-NEXT:   'st_shndx', [[MIXED_SECT]]
+
+
+;; TMB-NOT:     ${{[atd]}}
+
diff --git a/test/MC/ARM/elf-reloc-01.ll b/test/MC/ARM/elf-reloc-01.ll
index c98026b6a0..3ebd7c641b 100644
--- a/test/MC/ARM/elf-reloc-01.ll
+++ b/test/MC/ARM/elf-reloc-01.ll
@@ -62,9 +62,9 @@ declare void @exit(i32) noreturn nounwind
 
 ;; OBJ:          Relocation 1
 ;; OBJ-NEXT:     'r_offset',
-;; OBJ-NEXT:     'r_sym', 0x000002
+;; OBJ-NEXT:     'r_sym', 0x000007
 ;; OBJ-NEXT:     'r_type', 0x2b
 
-;; OBJ:         Symbol 2
+;; OBJ:         Symbol 7
 ;; OBJ-NEXT:    '_MergedGlobals'
 ;; OBJ-NEXT:    'st_value', 0x00000010
diff --git a/test/MC/ARM/elf-reloc-02.ll b/test/MC/ARM/elf-reloc-02.ll
index e51bac30ca..6b6b03c388 100644
--- a/test/MC/ARM/elf-reloc-02.ll
+++ b/test/MC/ARM/elf-reloc-02.ll
@@ -42,9 +42,9 @@ declare i32 @write(...)
 declare void @exit(i32) noreturn nounwind
 
 ;; OBJ:        Relocation 0
-;; OBJ-NEXT:    'r_offset', 
-;; OBJ-NEXT:    'r_sym', 0x000002
+;; OBJ-NEXT:    'r_offset',
+;; OBJ-NEXT:    'r_sym', 0x000005
 ;; OBJ-NEXT:    'r_type', 0x2b
 
-;; OBJ:          Symbol 2
+;; OBJ:          Symbol 5
 ;; OBJ-NEXT:    '.L.str'
diff --git a/test/MC/ARM/elf-reloc-03.ll b/test/MC/ARM/elf-reloc-03.ll
index 922242f9d3..87f91c1121 100644
--- a/test/MC/ARM/elf-reloc-03.ll
+++ b/test/MC/ARM/elf-reloc-03.ll
@@ -89,9 +89,9 @@ entry:
 declare void @exit(i32) noreturn nounwind
 
 ;; OBJ:           Relocation 1
-;; OBJ-NEXT:     'r_offset', 
-;; OBJ-NEXT:     'r_sym', 0x00000c
+;; OBJ-NEXT:     'r_offset',
+;; OBJ-NEXT:     'r_sym', 0x000010
 ;; OBJ-NEXT:     'r_type', 0x2b
 
-;; OBJ:      Symbol 12
+;; OBJ:      Symbol 16
 ;; OBJ-NEXT:    'vtable'
diff --git a/test/MC/ARM/elf-reloc-condcall.s b/test/MC/ARM/elf-reloc-condcall.s
index 08b4ecc9c7..3fafb43eb0 100644
--- a/test/MC/ARM/elf-reloc-condcall.s
+++ b/test/MC/ARM/elf-reloc-condcall.s
@@ -9,25 +9,25 @@
 // OBJ: .rel.text
 
 // OBJ: 'r_offset', 0x00000000
-// OBJ-NEXT:  'r_sym', 0x000004
+// OBJ-NEXT:  'r_sym', 0x000005
 // OBJ-NEXT: 'r_type', 0x1d
 
 // OBJ: 'r_offset', 0x00000004
-// OBJ-NEXT:  'r_sym', 0x000004
+// OBJ-NEXT:  'r_sym', 0x000005
 // OBJ-NEXT: 'r_type', 0x1c
 
 // OBJ: 'r_offset', 0x00000008
-// OBJ-NEXT:  'r_sym', 0x000004
+// OBJ-NEXT:  'r_sym', 0x000005
 // OBJ-NEXT: 'r_type', 0x1c
 
 // OBJ: 'r_offset', 0x0000000c
-// OBJ-NEXT:  'r_sym', 0x000004
+// OBJ-NEXT:  'r_sym', 0x000005
 // OBJ-NEXT: 'r_type', 0x1d
 
 // OBJ: 'r_offset', 0x00000010
-// OBJ-NEXT:  'r_sym', 0x000004
+// OBJ-NEXT:  'r_sym', 0x000005
 // OBJ-NEXT: 'r_type', 0x1d
 
 // OBJ: .symtab
-// OBJ: Symbol 4
+// OBJ: Symbol 5
 // OBJ-NEXT: some_label
diff --git a/test/MC/ARM/elf-thumbfunc-reloc.ll b/test/MC/ARM/elf-thumbfunc-reloc.ll
index ecac11daa3..b2f253d2fa 100644
--- a/test/MC/ARM/elf-thumbfunc-reloc.ll
+++ b/test/MC/ARM/elf-thumbfunc-reloc.ll
@@ -28,10 +28,10 @@ entry:
 ; 00000008  0000070a R_ARM_THM_CALL    00000001   foo
 ; CHECK:           Relocation 0
 ; CHECK-NEXT:      'r_offset', 0x00000008
-; CHECK-NEXT:      'r_sym', 0x000007
+; CHECK-NEXT:      'r_sym', 0x000009
 ; CHECK-NEXT:      'r_type', 0x0a
 
 ; make sure foo is thumb function: bit 0 = 1
-; CHECK:           Symbol 7
+; CHECK:           Symbol 9
 ; CHECK-NEXT:      'foo'
 ; CHECK-NEXT:      'st_value', 0x00000001
diff --git a/test/MC/ARM/elf-thumbfunc.s b/test/MC/ARM/elf-thumbfunc.s
index 0aa7f41cc4..91b2eee759 100644
--- a/test/MC/ARM/elf-thumbfunc.s
+++ b/test/MC/ARM/elf-thumbfunc.s
@@ -12,7 +12,7 @@ foo:
 	bx	lr
 
 @@ make sure foo is thumb function: bit 0 = 1 (st_value)
-@CHECK:           Symbol 4
+@CHECK:           Symbol 5
 @CHECK-NEXT:      'st_name', 0x00000001
 @CHECK-NEXT:      'st_value', 0x00000001
 @CHECK-NEXT:      'st_size', 0x00000000
diff --git a/test/MC/ARM/mapping-within-section.s b/test/MC/ARM/mapping-within-section.s
new file mode 100644
index 0000000000..56dd6ef07e
--- /dev/null
+++ b/test/MC/ARM/mapping-within-section.s
@@ -0,0 +1,33 @@
+@ RUN: llvm-mc -triple=arm-linux-gnueabi -filetype=obj < %s | llvm-objdump -t - | FileCheck %s
+
+    .text
+@ $a at 0x0000
+    add r0, r0, r0
+@ $d at 0x0004
+    .word 42
+    .thumb
+@ $t at 0x0008
+    adds r0, r0, r0
+    adds r0, r0, r0
+@ $a at 0x000c
+    .arm
+    add r0, r0, r0
+@ $t at 0x0010
+    .thumb
+    adds r0, r0, r0
+@ $d at 0x0012
+    .ascii "012"
+    .byte 1
+    .byte 2
+    .byte 3
+@ $a at 0x0018
+    .arm
+    add r0, r0, r0
+
+@ CHECK:      00000000         .text  00000000 $a
+@ CHECK-NEXT: 0000000c         .text  00000000 $a
+@ CHECK-NEXT: 00000018         .text  00000000 $a
+@ CHECK-NEXT: 00000004         .text  00000000 $d
+@ CHECK-NEXT: 00000012         .text  00000000 $d
+@ CHECK-NEXT: 00000008         .text  00000000 $t
+@ CHECK-NEXT: 00000010         .text  00000000 $t
diff --git a/test/MC/ARM/multi-section-mapping.s b/test/MC/ARM/multi-section-mapping.s
new file mode 100644
index 0000000000..f7c4e89a85
--- /dev/null
+++ b/test/MC/ARM/multi-section-mapping.s
@@ -0,0 +1,35 @@
+@ RUN: llvm-mc -triple=arm-linux-gnueabi -filetype=obj < %s | llvm-objdump -t - | FileCheck %s
+
+        .text
+        add r0, r0, r0
+
+@ .wibble should *not* inherit .text's mapping symbol. It's a completely different section.
+        .section .wibble
+        add r0, r0, r0
+
+@ A section should be able to start with a $t
+        .section .starts_thumb
+        .thumb
+        adds r0, r0, r0
+
+@ A section should be able to start with a $d
+        .section .starts_data
+        .word 42
+
+@ Changing back to .text should not emit a redundant $a
+        .text
+        .arm
+        add r0, r0, r0
+
+@ With all those constraints, we want:
+@   + .text to have $a at 0 and no others
+@   + .wibble to have $a at 0
+@   + .starts_thumb to have $t at 0
+@   + .starts_data to have $d at 0
+
+@ CHECK: 00000000 .text 00000000 $a
+@ CHECK-NEXT: 00000000 .wibble 00000000 $a
+@ CHECK-NEXT: 00000000 .starts_data 00000000 $d
+@ CHECK-NEXT: 00000000 .starts_thumb 00000000 $t
+@ CHECK-NOT: ${{[adt]}}
+
diff --git a/test/MC/ARM/relocated-mapping.s b/test/MC/ARM/relocated-mapping.s
new file mode 100644
index 0000000000..3bed14c452
--- /dev/null
+++ b/test/MC/ARM/relocated-mapping.s
@@ -0,0 +1,11 @@
+@ RUN: llvm-mc -triple=arm-linux-gnueabi -filetype=obj < %s | llvm-objdump -t - | FileCheck %s
+
+@ Implementation-detail test (unfortunately): values that are relocated do not
+@ go via MCStreamer::EmitBytes; make sure they still emit a mapping symbol.
+        add r0, r0, r0
+        .word somewhere
+        add r0, r0, r0
+
+@ CHECK: 00000000 .text 00000000 $a
+@ CHECK-NEXT: 00000008 .text 00000000 $a
+@ CHECK-NEXT: 00000004 .text 00000000 $d
diff --git a/test/MC/Disassembler/X86/enhanced.txt b/test/MC/Disassembler/X86/enhanced.txt
deleted file mode 100644
index 97b0fa4ab5..0000000000
--- a/test/MC/Disassembler/X86/enhanced.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# RUN: llvm-mc --edis %s -triple=x86_64-apple-darwin9 2>&1 | FileCheck %s
-
-# CHECK: [o:jne][w:	][0-p:-][0-l:10=10] <br> 0:[RIP/{{[0-9]+}}](pc)=18446744073709551606
-0x0f 0x85 0xf6 0xff 0xff 0xff
-# CHECK: [o:movq][w:	][1-r:%gs=r{{[0-9]+}}][1-p::][1-l:8=8][p:,][w: ][0-r:%rcx=r{{[0-9]+}}] <mov> 0:[RCX/{{[0-9]+}}]=0 1:[GS/{{[0-9]+}}]=8
-0x65 0x48 0x8b 0x0c 0x25 0x08 0x00 0x00 0x00
-# CHECK: [o:xorps][w:	][2-r:%xmm1=r{{[0-9]+}}][p:,][w: ][0-r:%xmm2=r{{[0-9]+}}] 0:[XMM2/{{[0-9]+}}]=0 1:[XMM2/{{[0-9]+}}]=0 2:[XMM1/{{[0-9]+}}]=0
-0x0f 0x57 0xd1
-# CHECK: [o:andps][w:	][2-r:%xmm1=r{{[0-9]+}}][p:,][w: ][0-r:%xmm2=r{{[0-9]+}}] 0:[XMM2/{{[0-9]+}}]=0 1:[XMM2/{{[0-9]+}}]=0 2:[XMM1/{{[0-9]+}}]=0
-0x0f 0x54 0xd1
diff --git a/test/MC/Disassembler/X86/simple-tests.txt b/test/MC/Disassembler/X86/simple-tests.txt
index 672d239243..5ea40eb913 100644
--- a/test/MC/Disassembler/X86/simple-tests.txt
+++ b/test/MC/Disassembler/X86/simple-tests.txt
@@ -120,13 +120,13 @@
 # CHECK: vandps (%rdx), %xmm1, %xmm7
 0xc5 0xf0 0x54 0x3a
 
-# CHECK: vcvtss2sil %xmm0, %eax
+# CHECK: vcvtss2si %xmm0, %eax
 0xc5 0xfa 0x2d 0xc0
 
-# CHECK: vcvtsd2sil %xmm0, %eax
+# CHECK: vcvtsd2si %xmm0, %eax
 0xc5 0xfb 0x2d 0xc0
 
-# CHECK: vcvtsd2siq %xmm0, %rax
+# CHECK: vcvtsd2si %xmm0, %rax
 0xc4 0xe1 0xfb 0x2d 0xc0
 
 # CHECK: vmaskmovpd %xmm0, %xmm1, (%rax)
@@ -437,10 +437,10 @@
 # CHECK: vroundsd $0, %xmm0, %xmm0, %xmm0
 0xc4 0xe3 0x7d 0x0b 0xc0 0x00
 
-# CHECK: vcvtsd2sil %xmm0, %eax
+# CHECK: vcvtsd2si %xmm0, %eax
 0xc4 0xe1 0x7f 0x2d 0xc0
 
-# CHECK: vcvtsd2siq %xmm0, %rax
+# CHECK: vcvtsd2si %xmm0, %rax
 0xc4 0xe1 0xff 0x2d 0xc0
 
 # CHECK: vucomisd %xmm1, %xmm0
diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index 899657b0d4..99d49943b1 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt
@@ -156,13 +156,13 @@
 # CHECK: vandps (%edx), %xmm1, %xmm7
 0xc5 0xf0 0x54 0x3a
 
-# CHECK: vcvtss2sil %xmm0, %eax
+# CHECK: vcvtss2si %xmm0, %eax
 0xc5 0xfa 0x2d 0xc0
 
-# CHECK: vcvtsd2sil %xmm0, %eax
+# CHECK: vcvtsd2si %xmm0, %eax
 0xc5 0xfb 0x2d 0xc0
 
-# CHECK: vcvtsd2sil %xmm0, %eax
+# CHECK: vcvtsd2si %xmm0, %eax
 0xc4 0xe1 0x7b 0x2d 0xc0
 
 # CHECK: vmaskmovpd %xmm0, %xmm1, (%eax)
@@ -460,10 +460,10 @@
 # CHECK: vroundsd $0, %xmm0, %xmm0, %xmm0
 0xc4 0xe3 0x7d 0x0b 0xc0 0x00
 
-# CHECK: vcvtsd2sil %xmm0, %eax
+# CHECK: vcvtsd2si %xmm0, %eax
 0xc4 0xe1 0x7f 0x2d 0xc0
 
-# CHECK: vcvtsd2sil %xmm0, %eax
+# CHECK: vcvtsd2si %xmm0, %eax
 0xc4 0xe1 0xff 0x2d 0xc0
 
 # CHECK: vucomisd %xmm1, %xmm0
diff --git a/test/MC/Disassembler/XCore/lit.local.cfg b/test/MC/Disassembler/XCore/lit.local.cfg
new file mode 100644
index 0000000000..15b65836e7
--- /dev/null
+++ b/test/MC/Disassembler/XCore/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.txt']
+
+targets = set(config.root.targets_to_build.split())
+if not 'XCore' in targets:
+    config.unsupported = True
diff --git a/test/MC/Disassembler/XCore/xcore.txt b/test/MC/Disassembler/XCore/xcore.txt
new file mode 100644
index 0000000000..f6b9c90da0
--- /dev/null
+++ b/test/MC/Disassembler/XCore/xcore.txt
@@ -0,0 +1,198 @@
+# RUN: llvm-mc --disassemble %s -triple=xcore-xmos-elf | FileCheck %s
+# CHECK: .section        __TEXT,__text,regular,pure_instructions
+
+# 0r instructions
+
+# CHECK: clre
+0xed 0x07
+
+# CHECK: get r11, id
+0xee 0x17
+
+# CHECK: get r11, ed
+0xfe 0x0f
+
+# CHECK: get r11, et
+0xff 0x0f
+
+# CHECK: ssync
+0xee 0x07
+
+# CHECK: waiteu
+0xec 0x07
+
+# 1r instructions
+
+# CHECK: msync res[r0]
+0xf0 0x1f
+
+# CHECK: mjoin res[r1]
+0xf1 0x17
+
+# CHECK: bau r2
+0xf2 0x27
+
+# CHECK: set sp, r3
+0xf3 0x2f
+
+# CHECK: ecallt r4
+0xf4 0x4f
+
+# CHECK: ecallf r5
+0xe5 0x4f
+
+# CHECK: bla r6
+0xe6 0x27
+
+# CHECK: syncr res[r7]
+0xf7 0x87
+
+# CHECK: freer res[r8]
+0xe8 0x17
+
+# CHECK: setv res[r9], r11
+0xf9 0x47
+
+# CHECK: setev res[r10], r11
+0xfa 0x3f
+
+# CHECK: eeu res[r11]
+0xfb 0x07
+
+# 2r instructions
+
+# CHECK: not r1, r8
+0x24 0x8f
+
+# CHECK: neg r7, r6
+0xce 0x97
+
+# CHECK: andnot r10, r11
+0xab 0x2f
+
+# CHECK: mkmsk r11, r0
+0x4c 0xa7
+
+# CHECK: getts r8, res[r1]
+0x41 0x3f
+
+# CHECK: setpt res[r2], r3
+0xde 0x3e
+
+# CHECK: outct res[r1], r2
+0xc6 0x4e
+
+# CHECK: outt res[r5], r4
+0xd1 0x0f
+
+# CHECK: out res[r9], r10
+0xa9 0xaf
+
+# CHECK: outshr res[r0], r2
+0xd8 0xae
+
+# CHECK: inct r7, res[r4]
+0xdc 0x87
+
+# CHECK: int r8, res[r3]
+0x53 0x8f
+
+# CHECK: in r10, res[r0]
+0x48 0xb7
+
+# CHECK: inshr r4, res[r2]
+0x12 0xb7
+
+# CHECK: chkct res[r6], r0
+0x08 0xcf
+
+# CHECK: testct r8, res[r3]
+0x53 0xbf
+
+# CHECK: testwct r2, res[r9]
+0x39 0xc7
+
+# CHECK: setd res[r3], r4
+0x13 0x17
+
+# CHECK: getst r7, res[r1]
+0x1d 0x07
+
+# CHECK: init t[r1]:sp, r2
+0xc9 0x16
+
+# CHECK: init t[r10]:pc, r1
+0x26 0x07
+
+# CHECK: init t[r2]:cp, r10
+0x4a 0x1f
+
+# CHECK: init t[r2]:dp, r3
+0xce 0x0e
+
+# CHECK: setpsc res[r8], r2
+0x28 0xc7
+
+# CHECK: zext r3, r8
+0x2c 0x47
+
+# CHECK: sext r9, r1
+0x45 0x37
+
+# rus instructions
+
+# CHECK: chkct res[r1], 8
+0x34 0xcf
+
+# CHECK: getr r11, 2
+0x4e 0x87
+
+# CHECK: mkmsk r4, 24
+0x72 0xa7
+
+# CHECK: outct res[r3], r0
+0xcc 0x4e
+
+# CHECK: sext r8, 16
+0xb1 0x37
+
+# CHECK: zext r2, 32
+0xd8 0x46
+
+# CHECK: peek r0, res[r5]
+0x81 0xbf
+
+# CHECK: endin r10, res[r1]
+0x59 0x97
+
+# l2r instructions
+
+# CHECK: bitrev r1, r10
+0x26 0xff 0xec 0x07
+
+# CHECK: byterev r4, r1
+0x11 0xff 0xec 0x07
+
+# CHECK: clz r11, r10
+0xae 0xff 0xec 0x0f
+
+# CHECK: get r3, ps[r6]
+0x9e 0xff 0xec 0x17
+
+# CHECK: setc res[r5], r9
+0x75 0xff 0xec 0x2f
+
+# CHECK: init t[r2]:lr, r1
+0xc6 0xfe 0xec 0x17
+
+# CHECK: setclk res[r2], r1
+0xd6 0xfe 0xec 0x0f
+
+# CHECK: set ps[r9], r10
+0xa9 0xff 0xec 0x1f
+
+# CHECK: setrdy res[r3], r1
+0xc7 0xfe 0xec 0x2f
+
+# CHECK: settw res[r7], r2
+0x9b 0xff 0xec 0x27
diff --git a/test/MC/ELF/comp-dir.s b/test/MC/ELF/comp-dir.s
new file mode 100644
index 0000000000..50d10eb9a5
--- /dev/null
+++ b/test/MC/ELF/comp-dir.s
@@ -0,0 +1,7 @@
+// RUN: llvm-mc -triple=x86_64-linux-unknown -g -fdebug-compilation-dir=/test/comp/dir %s -filetype=obj -o %t.o
+// RUN: llvm-dwarfdump %t.o | FileCheck %s
+
+// CHECK: DW_AT_comp_dir [DW_FORM_string] ("{{([A-Za-z]:.*)?}}/test/comp/dir")
+
+f:
+  nop
diff --git a/test/MC/Mips/ef_frame.ll b/test/MC/Mips/ef_frame.ll
new file mode 100644
index 0000000000..91c8b43d02
--- /dev/null
+++ b/test/MC/Mips/ef_frame.ll
@@ -0,0 +1,52 @@
+; This tests the .eh_frame CIE descriptor for the
+; data alignment factor.
+
+; RUN: llc -filetype=obj -mcpu=mips64r2 -mattr=n64 -march=mips64el %s -o - \
+; RUN: | llvm-objdump -s - | FileCheck %s
+
+; N64
+; CHECK: Contents of section .eh_frame:
+; CHECK-NEXT:  0000 1c000000 00000000 017a504c 52000178  .........zPLR..x
+; CHECK-NEXT:  0010 1f0b0000 00000000 00000000 000c1d00  ................
+; CHECK-NEXT:  0020 2c000000 24000000 00000000 00000000  ,...$...........
+; CHECK-NEXT:  0030 7c000000 00000000 08000000 00000000  |...............
+; CHECK-NEXT:  0040 00440e10 489f019c 02000000 00000000  .D..H...........
+
+; ModuleID = 'simple_throw.cpp'
+
+@_ZTIi = external constant i8*
+@str = private unnamed_addr constant [7 x i8] c"All ok\00"
+
+define i32 @main() {
+entry:
+  %exception.i = tail call i8* @__cxa_allocate_exception(i64 4) nounwind
+  %0 = bitcast i8* %exception.i to i32*
+  store i32 5, i32* %0, align 4
+  invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+          to label %.noexc unwind label %return
+
+.noexc:                                           ; preds = %entry
+  unreachable
+
+return:                                           ; preds = %entry
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* null
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str, i64 0, i64 0))
+  tail call void @__cxa_end_catch()
+  ret i32 0
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+declare i8* @__cxa_allocate_exception(i64)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i32 @puts(i8* nocapture) nounwind
+
diff --git a/test/MC/Mips/mips-alu-instructions.s b/test/MC/Mips/mips-alu-instructions.s
index 2997782cd0..ee2a9a0db4 100644
--- a/test/MC/Mips/mips-alu-instructions.s
+++ b/test/MC/Mips/mips-alu-instructions.s
@@ -80,7 +80,7 @@
 # CHECK:  subu   $4, $3, $5      # encoding: [0x23,0x20,0x65,0x00]
 # CHECK:  sub     $6, $zero, $7  # encoding: [0x22,0x30,0x07,0x00]
 # CHECK:  subu    $6, $zero, $7  # encoding: [0x23,0x30,0x07,0x00]
-# CHECK:  add     $7, $8, $zero  # encoding: [0x20,0x38,0x00,0x01]
+# CHECK:  addu    $7, $8, $zero  # encoding: [0x21,0x38,0x00,0x01]
     add    $9,$6,$7
     add    $9,$6,17767
     addu   $9,$6,-15001
diff --git a/test/MC/Mips/mips-jump-instructions.s b/test/MC/Mips/mips-jump-instructions.s
index 998be418d2..58250f306e 100644
--- a/test/MC/Mips/mips-jump-instructions.s
+++ b/test/MC/Mips/mips-jump-instructions.s
@@ -23,7 +23,7 @@
 # CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
 # CHECK:   bne $9, $6, 1332       # encoding: [0x34,0x05,0x26,0x15]
 # CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
-# CHECK:   bal     1332           # encoding: [0x34,0x05,0x00,0x04]
+# CHECK:   bal     1332           # encoding: [0x34,0x05,0x11,0x04]
 # CHECK:   nop                    # encoding: [0x00,0x00,0x00,0x00]
          b 1332
          nop
diff --git a/test/MC/Mips/mips64-alu-instructions.s b/test/MC/Mips/mips64-alu-instructions.s
new file mode 100644
index 0000000000..a77ed43ff1
--- /dev/null
+++ b/test/MC/Mips/mips64-alu-instructions.s
@@ -0,0 +1,94 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for arithmetic and logical instructions.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Logical instructions
+#------------------------------------------------------------------------------
+# CHECK:  and    $9, $6, $7      # encoding: [0x24,0x48,0xc7,0x00]
+# CHECK:  andi   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x30]
+# CHECK:  andi   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x30]
+# CHECK:  clo    $6, $7          # encoding: [0x21,0x30,0xe6,0x70]
+# CHECK:  clz    $6, $7          # encoding: [0x20,0x30,0xe6,0x70]
+# CHECK:  ins    $19, $9, 6, 7   # encoding: [0x84,0x61,0x33,0x7d]
+# CHECK:  nor    $9, $6, $7      # encoding: [0x27,0x48,0xc7,0x00]
+# CHECK:  or     $3, $3, $5      # encoding: [0x25,0x18,0x65,0x00]
+# CHECK:  ori    $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x34]
+# CHECK:  rotr   $9, $6, 7       # encoding: [0xc2,0x49,0x26,0x00]
+# CHECK:  rotrv  $9, $6, $7      # encoding: [0x46,0x48,0xe6,0x00]
+# CHECK:  sll    $4, $3, 7       # encoding: [0xc0,0x21,0x03,0x00]
+# CHECK:  sllv   $2, $3, $5      # encoding: [0x04,0x10,0xa3,0x00]
+# CHECK:  slt    $3, $3, $5      # encoding: [0x2a,0x18,0x65,0x00]
+# CHECK:  slti   $3, $3, 103     # encoding: [0x67,0x00,0x63,0x28]
+# CHECK:  slti   $3, $3, 103     # encoding: [0x67,0x00,0x63,0x28]
+# CHECK:  sltiu  $3, $3, 103     # encoding: [0x67,0x00,0x63,0x2c]
+# CHECK:  sltu   $3, $3, $5      # encoding: [0x2b,0x18,0x65,0x00]
+# CHECK:  sra    $4, $3, 7       # encoding: [0xc3,0x21,0x03,0x00]
+# CHECK:  srav   $2, $3, $5      # encoding: [0x07,0x10,0xa3,0x00]
+# CHECK:  srl    $4, $3, 7       # encoding: [0xc2,0x21,0x03,0x00]
+# CHECK:  srlv   $2, $3, $5      # encoding: [0x06,0x10,0xa3,0x00]
+# CHECK:  xor    $3, $3, $5      # encoding: [0x26,0x18,0x65,0x00]
+# CHECK:  xori    $9, $6, 17767  # encoding: [0x67,0x45,0xc9,0x38]
+# CHECK:  xori   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x38]
+# CHECK:  wsbh   $6, $7          # encoding: [0xa0,0x30,0x07,0x7c]
+# CHECK:  nor    $7, $8, $zero   # encoding: [0x27,0x38,0x00,0x01]
+     and    $9,  $6, $7
+     and    $9,  $6, 17767
+     andi   $9,  $6, 17767
+     clo    $6,  $7
+     clz    $6,  $7
+     ins    $19, $9, 6,7
+     nor    $9,  $6, $7
+     or     $3,  $3, $5
+     ori    $9,  $6, 17767
+     rotr   $9,  $6, 7
+     rotrv  $9,  $6, $7
+     sll    $4,  $3, 7
+     sllv   $2,  $3, $5
+     slt    $3,  $3, $5
+     slt    $3,  $3, 103
+     slti   $3,  $3, 103
+     sltiu  $3,  $3, 103
+     sltu   $3,  $3, $5
+     sra    $4,  $3, 7
+     srav   $2,  $3, $5
+     srl    $4,  $3, 7
+     srlv   $2,  $3, $5
+     xor    $3,  $3, $5
+     xor    $9,  $6, 17767
+     xori   $9,  $6, 17767
+     wsbh   $6,  $7
+     not    $7  ,$8
+
+#------------------------------------------------------------------------------
+# Arithmetic instructions
+#------------------------------------------------------------------------------
+
+# CHECK:  dadd    $9, $6, $7      # encoding: [0x2c,0x48,0xc7,0x00]
+# CHECK:  daddi   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x60]
+# CHECK:  daddiu  $9, $6, -15001  # encoding: [0x67,0xc5,0xc9,0x64]
+# CHECK:  daddi   $9, $6, 17767   # encoding: [0x67,0x45,0xc9,0x60]
+# CHECK:  daddiu  $9, $6, -15001  # encoding: [0x67,0xc5,0xc9,0x64]
+# CHECK:  daddu   $9, $6, $7      # encoding: [0x2d,0x48,0xc7,0x00]
+# CHECK:  madd   $6, $7          # encoding: [0x00,0x00,0xc7,0x70]
+# CHECK:  maddu  $6, $7          # encoding: [0x01,0x00,0xc7,0x70]
+# CHECK:  msub   $6, $7          # encoding: [0x04,0x00,0xc7,0x70]
+# CHECK:  msubu  $6, $7          # encoding: [0x05,0x00,0xc7,0x70]
+# CHECK:  mult   $3, $5          # encoding: [0x18,0x00,0x65,0x00]
+# CHECK:  multu  $3, $5          # encoding: [0x19,0x00,0x65,0x00]
+# CHECK:  dsubu   $4, $3, $5      # encoding: [0x2f,0x20,0x65,0x00]
+# CHECK:  daddu    $7, $8, $zero  # encoding: [0x2d,0x38,0x00,0x01]
+    dadd    $9,$6,$7
+    dadd    $9,$6,17767
+    daddu   $9,$6,-15001
+    daddi   $9,$6,17767
+    daddiu  $9,$6,-15001
+    daddu   $9,$6,$7
+    madd   $6,$7
+    maddu  $6,$7
+    msub   $6,$7
+    msubu  $6,$7
+    mult   $3,$5
+    multu  $3,$5
+    dsubu   $4,$3,$5
+    move   $7,$8
diff --git a/test/MC/Mips/mips_gprel16.ll b/test/MC/Mips/mips_gprel16.ll
new file mode 100644
index 0000000000..b5a282de56
--- /dev/null
+++ b/test/MC/Mips/mips_gprel16.ll
@@ -0,0 +1,33 @@
+; This addresses bug 14456. We were not writing
+; out the addend to the gprel16 relocation. The
+; addend is stored in the instruction immediate 
+; field.
+;llc gprel16.ll -o gprel16.o -mcpu=mips32r2 -march=mipsel -filetype=obj -relocation-model=static
+
+; RUN: llc -mcpu=mips32r2 -march=mipsel -filetype=obj -relocation-model=static %s -o - \
+; RUN: | llvm-objdump -disassemble -mattr +mips32r2 - \
+; RUN: | FileCheck %s
+
+target triple = "mipsel-sde--elf-gcc"
+
+@var1 = internal global i32 0, align 4
+@var2 = internal global i32 0, align 4
+
+define i32 @testvar1() nounwind {
+entry:
+; CHECK: lw ${{[0-9]+}}, 0($gp)
+  %0 = load i32* @var1, align 4
+  %tobool = icmp ne i32 %0, 0
+  %cond = select i1 %tobool, i32 1, i32 0
+  ret i32 %cond
+}
+
+define i32 @testvar2() nounwind {
+entry:
+; CHECK: lw ${{[0-9]+}}, 4($gp)
+  %0 = load i32* @var2, align 4
+  %tobool = icmp ne i32 %0, 0
+  %cond = select i1 %tobool, i32 1, i32 0
+  ret i32 %cond
+}
+
diff --git a/test/MC/PowerPC/ppc64-initial-cfa.ll b/test/MC/PowerPC/ppc64-initial-cfa.ll
index 3936cf2e81..16236c9c65 100644
--- a/test/MC/PowerPC/ppc64-initial-cfa.ll
+++ b/test/MC/PowerPC/ppc64-initial-cfa.ll
@@ -1,41 +1,78 @@
-;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj %s -o - | \
-;; RUN: elf-dump --dump-section-data | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj -relocation-model=static %s -o - | \
+; RUN: elf-dump --dump-section-data | FileCheck %s -check-prefix=STATIC
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj -relocation-model=pic %s -o - | \
+; RUN: elf-dump --dump-section-data | FileCheck %s -check-prefix=PIC
 
-;; FIXME: this file should be in .s form, change when asm parser is available.
+; FIXME: this file should be in .s form, change when asm parser is available.
 
 define void @f() {
 entry:
   ret void
 }
 
-;; CHECK:      ('sh_name', 0x{{.*}}) # '.eh_frame'
-;; CHECK-NEXT: ('sh_type', 0x00000001)
-;; CHECK-NEXT: ('sh_flags', 0x0000000000000002)
-;; CHECK-NEXT: ('sh_addr', 0x{{.*}})
-;; CHECK-NEXT: ('sh_offset', 0x{{.*}})
-;; CHECK-NEXT: ('sh_size', 0x0000000000000030)
-;; CHECK-NEXT: ('sh_link', 0x00000000)
-;; CHECK-NEXT: ('sh_info', 0x00000000)
-;; CHECK-NEXT: ('sh_addralign', 0x0000000000000008)
-;; CHECK-NEXT: ('sh_entsize', 0x0000000000000000)
-;; CHECK-NEXT: ('_section_data', '00000010 00000000 017a5200 01784101 000c0100 00000018 00000018 00000000 00000000 00000000 00000010 00000000')
-
-;; CHECK:      ('sh_name', 0x{{.*}}) # '.rela.eh_frame'
-;; CHECK-NEXT: ('sh_type', 0x00000004)
-;; CHECK-NEXT: ('sh_flags', 0x0000000000000000)
-;; CHECK-NEXT: ('sh_addr', 0x{{.*}})
-;; CHECK-NEXT: ('sh_offset', 0x{{.*}})
-;; CHECK-NEXT: ('sh_size', 0x0000000000000018)
-;; CHECK-NEXT: ('sh_link', 0x{{.*}})
-;; CHECK-NEXT: ('sh_info', 0x{{.*}})
-;; CHECK-NEXT: ('sh_addralign', 0x0000000000000008)
-;; CHECK-NEXT: ('sh_entsize', 0x0000000000000018)
-;; CHECK-NEXT: ('_relocations', [
-;; CHECK-NEXT:  # Relocation 0
-;; CHECK-NEXT:  (('r_offset', 0x000000000000001c)
-;; CHECK-NEXT:   ('r_sym', 0x{{.*}})
-;; CHECK-NEXT:   ('r_type', 0x00000026)
-;; CHECK-NEXT:   ('r_addend', 0x0000000000000000)
-;; CHECK-NEXT:  ),
-;; CHECK-NEXT: ])
+; STATIC:      ('sh_name', 0x{{.*}}) # '.eh_frame'
+; STATIC-NEXT: ('sh_type', 0x00000001)
+; STATIC-NEXT: ('sh_flags', 0x0000000000000002)
+; STATIC-NEXT: ('sh_addr', 0x{{.*}})
+; STATIC-NEXT: ('sh_offset', 0x{{.*}})
+; STATIC-NEXT: ('sh_size', 0x0000000000000028)
+; STATIC-NEXT: ('sh_link', 0x00000000)
+; STATIC-NEXT: ('sh_info', 0x00000000)
+; STATIC-NEXT: ('sh_addralign', 0x0000000000000008)
+; STATIC-NEXT: ('sh_entsize', 0x0000000000000000)
+; STATIC-NEXT: ('_section_data', '00000010 00000000 017a5200 01784101 1b0c0100 00000010 00000018 00000000 00000010 00000000')
 
+; STATIC:      ('sh_name', 0x{{.*}}) # '.rela.eh_frame'
+; STATIC-NEXT: ('sh_type', 0x00000004)
+; STATIC-NEXT: ('sh_flags', 0x0000000000000000)
+; STATIC-NEXT: ('sh_addr', 0x{{.*}})
+; STATIC-NEXT: ('sh_offset', 0x{{.*}})
+; STATIC-NEXT: ('sh_size', 0x0000000000000018)
+; STATIC-NEXT: ('sh_link', 0x{{.*}})
+; STATIC-NEXT: ('sh_info', 0x{{.*}})
+; STATIC-NEXT: ('sh_addralign', 0x0000000000000008)
+; STATIC-NEXT: ('sh_entsize', 0x0000000000000018)
+; STATIC-NEXT: ('_relocations', [
+
+; Static build should create R_PPC64_REL32 relocations
+; STATIC-NEXT:  # Relocation 0
+; STATIC-NEXT:  (('r_offset', 0x000000000000001c)
+; STATIC-NEXT:   ('r_sym', 0x{{.*}})
+; STATIC-NEXT:   ('r_type', 0x0000001a)
+; STATIC-NEXT:   ('r_addend', 0x0000000000000000)
+; STATIC-NEXT:  ),
+; STATIC-NEXT: ])
+
+
+; PIC:      ('sh_name', 0x{{.*}}) # '.eh_frame'
+; PIC-NEXT: ('sh_type', 0x00000001)
+; PIC-NEXT: ('sh_flags', 0x0000000000000002)
+; PIC-NEXT: ('sh_addr', 0x{{.*}})
+; PIC-NEXT: ('sh_offset', 0x{{.*}})
+; PIC-NEXT: ('sh_size', 0x0000000000000028)
+; PIC-NEXT: ('sh_link', 0x00000000)
+; PIC-NEXT: ('sh_info', 0x00000000)
+; PIC-NEXT: ('sh_addralign', 0x0000000000000008)
+; PIC-NEXT: ('sh_entsize', 0x0000000000000000)
+; PIC-NEXT: ('_section_data', '00000010 00000000 017a5200 01784101 1b0c0100 00000010 00000018 00000000 00000010 00000000')
+
+; PIC:      ('sh_name', 0x{{.*}}) # '.rela.eh_frame'
+; PIC-NEXT: ('sh_type', 0x00000004)
+; PIC-NEXT: ('sh_flags', 0x0000000000000000)
+; PIC-NEXT: ('sh_addr', 0x{{.*}})
+; PIC-NEXT: ('sh_offset', 0x{{.*}})
+; PIC-NEXT: ('sh_size', 0x0000000000000018)
+; PIC-NEXT: ('sh_link', 0x{{.*}})
+; PIC-NEXT: ('sh_info', 0x{{.*}})
+; PIC-NEXT: ('sh_addralign', 0x0000000000000008)
+; PIC-NEXT: ('sh_entsize', 0x0000000000000018)
+; PIC-NEXT: ('_relocations', [
+
+; PIC build should create R_PPC64_REL32 relocations
+; PIC-NEXT:  # Relocation 0
+; PIC-NEXT:  (('r_offset', 0x000000000000001c)
+; PIC-NEXT:   ('r_sym', 0x{{.*}})
+; PIC-NEXT:   ('r_type', 0x0000001a)
+; PIC-NEXT:   ('r_addend', 0x0000000000000000)
+; PIC-NEXT:  ),
+; PIC-NEXT: ])
diff --git a/test/MC/X86/AlignedBundling/align-mode-argument-error.s b/test/MC/X86/AlignedBundling/align-mode-argument-error.s
new file mode 100644
index 0000000000..b4ce0a9d10
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/align-mode-argument-error.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+# Missing .bundle_align_mode argument
+# CHECK: error: unknown token
+
+  .bundle_align_mode
+  imull $17, %ebx, %ebp
+
diff --git a/test/MC/X86/AlignedBundling/asm-printing-bundle-directives.s b/test/MC/X86/AlignedBundling/asm-printing-bundle-directives.s
new file mode 100644
index 0000000000..387e0fe59b
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/asm-printing-bundle-directives.s
@@ -0,0 +1,22 @@
+# RUN: llvm-mc -filetype=asm -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+# Just a simple test for the assembly emitter - making sure it emits back the
+# bundling directives.
+
+  .text
+foo:
+  .bundle_align_mode 4
+# CHECK:      .bundle_align_mode 4
+  pushq   %rbp
+  .bundle_lock
+# CHECK: .bundle_lock
+  cmpl    %r14d, %ebp
+  jle     .L_ELSE
+  .bundle_unlock
+# CHECK: .bundle_unlock
+  .bundle_lock align_to_end
+# CHECK: .bundle_lock align_to_end
+  add     %rbx, %rdx
+  .bundle_unlock
+
+
diff --git a/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s b/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s
new file mode 100644
index 0000000000..7fbb71bd4d
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s
@@ -0,0 +1,2794 @@
+# RUN: llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -triple i386 -disassemble -no-show-raw-insn - | FileCheck %s
+
+# !!! This test is auto-generated from utils/testgen/mc-bundling-x86-gen.py !!!
+#     It tests that bundle-aligned grouping works correctly in MC. Read the
+#     source of the script for more details.
+
+  .text
+  .bundle_align_mode 4
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 0: nop
+# CHECK: f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 21: nop
+# CHECK: 2f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 42: nop
+# CHECK: 4f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 63: nop
+# CHECK: 6f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 84: nop
+# CHECK: 8f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a5: nop
+# CHECK: af: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c6: nop
+# CHECK: cf: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e7: nop
+# CHECK: ef: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 108: nop
+# CHECK: 10f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 129: nop
+# CHECK: 12f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 14a: nop
+# CHECK: 14f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 16b: nop
+# CHECK: 16f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 18c: nop
+# CHECK: 18f: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ad: nop
+# CHECK: 1af: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ce: nop
+# CHECK: 1cf: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ef: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 200: nop
+# CHECK: 20e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 221: nop
+# CHECK: 22e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 242: nop
+# CHECK: 24e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 263: nop
+# CHECK: 26e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 284: nop
+# CHECK: 28e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 2a5: nop
+# CHECK: 2ae: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 2c6: nop
+# CHECK: 2ce: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 2e7: nop
+# CHECK: 2ee: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 308: nop
+# CHECK: 30e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 329: nop
+# CHECK: 32e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 34a: nop
+# CHECK: 34e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 36b: nop
+# CHECK: 36e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 38c: nop
+# CHECK: 38e: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 3ad: nop
+# CHECK: 3ae: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 3ce: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 3ef: nop
+# CHECK: 3fe: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 400: nop
+# CHECK: 40d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 421: nop
+# CHECK: 42d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 442: nop
+# CHECK: 44d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 463: nop
+# CHECK: 46d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 484: nop
+# CHECK: 48d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 4a5: nop
+# CHECK: 4ad: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 4c6: nop
+# CHECK: 4cd: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 4e7: nop
+# CHECK: 4ed: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 508: nop
+# CHECK: 50d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 529: nop
+# CHECK: 52d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 54a: nop
+# CHECK: 54d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 56b: nop
+# CHECK: 56d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 58c: nop
+# CHECK: 58d: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 5ad: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 5ce: nop
+# CHECK: 5dd: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 5ef: nop
+# CHECK: 5fd: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 600: nop
+# CHECK: 60c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 621: nop
+# CHECK: 62c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 642: nop
+# CHECK: 64c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 663: nop
+# CHECK: 66c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 684: nop
+# CHECK: 68c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 6a5: nop
+# CHECK: 6ac: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 6c6: nop
+# CHECK: 6cc: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 6e7: nop
+# CHECK: 6ec: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 708: nop
+# CHECK: 70c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 729: nop
+# CHECK: 72c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 74a: nop
+# CHECK: 74c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 76b: nop
+# CHECK: 76c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 78c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 7ad: nop
+# CHECK: 7bc: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 7ce: nop
+# CHECK: 7dc: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 7ef: nop
+# CHECK: 7fc: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 800: nop
+# CHECK: 80b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 821: nop
+# CHECK: 82b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 842: nop
+# CHECK: 84b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 863: nop
+# CHECK: 86b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 884: nop
+# CHECK: 88b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 8a5: nop
+# CHECK: 8ab: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 8c6: nop
+# CHECK: 8cb: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 8e7: nop
+# CHECK: 8eb: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 908: nop
+# CHECK: 90b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 929: nop
+# CHECK: 92b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 94a: nop
+# CHECK: 94b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 96b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 98c: nop
+# CHECK: 99b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 9ad: nop
+# CHECK: 9bb: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 9ce: nop
+# CHECK: 9db: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 9ef: nop
+# CHECK: 9fb: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a00: nop
+# CHECK: a0a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a21: nop
+# CHECK: a2a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a42: nop
+# CHECK: a4a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a63: nop
+# CHECK: a6a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a84: nop
+# CHECK: a8a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: aa5: nop
+# CHECK: aaa: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ac6: nop
+# CHECK: aca: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ae7: nop
+# CHECK: aea: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b08: nop
+# CHECK: b0a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b29: nop
+# CHECK: b2a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b4a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b6b: nop
+# CHECK: b7a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b8c: nop
+# CHECK: b9a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: bad: nop
+# CHECK: bba: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: bce: nop
+# CHECK: bda: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: bef: nop
+# CHECK: bfa: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c00: nop
+# CHECK: c09: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c21: nop
+# CHECK: c29: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c42: nop
+# CHECK: c49: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c63: nop
+# CHECK: c69: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c84: nop
+# CHECK: c89: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ca5: nop
+# CHECK: ca9: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: cc6: nop
+# CHECK: cc9: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ce7: nop
+# CHECK: ce9: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d08: nop
+# CHECK: d09: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d29: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d4a: nop
+# CHECK: d59: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d6b: nop
+# CHECK: d79: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d8c: nop
+# CHECK: d99: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: dad: nop
+# CHECK: db9: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: dce: nop
+# CHECK: dd9: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: def: nop
+# CHECK: df9: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e00: nop
+# CHECK: e08: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e21: nop
+# CHECK: e28: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e42: nop
+# CHECK: e48: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e63: nop
+# CHECK: e68: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e84: nop
+# CHECK: e88: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ea5: nop
+# CHECK: ea8: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ec6: nop
+# CHECK: ec8: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ee7: nop
+# CHECK: ee8: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f08: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f29: nop
+# CHECK: f38: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f4a: nop
+# CHECK: f58: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f6b: nop
+# CHECK: f78: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f8c: nop
+# CHECK: f98: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: fad: nop
+# CHECK: fb8: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: fce: nop
+# CHECK: fd8: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: fef: nop
+# CHECK: ff8: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1000: nop
+# CHECK: 1007: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1021: nop
+# CHECK: 1027: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1042: nop
+# CHECK: 1047: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1063: nop
+# CHECK: 1067: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1084: nop
+# CHECK: 1087: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 10a5: nop
+# CHECK: 10a7: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 10c6: nop
+# CHECK: 10c7: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 10e7: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1108: nop
+# CHECK: 1117: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1129: nop
+# CHECK: 1137: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 114a: nop
+# CHECK: 1157: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 116b: nop
+# CHECK: 1177: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 118c: nop
+# CHECK: 1197: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 11ad: nop
+# CHECK: 11b7: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 11ce: nop
+# CHECK: 11d7: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 11ef: nop
+# CHECK: 11f7: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1200: nop
+# CHECK: 1206: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1221: nop
+# CHECK: 1226: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1242: nop
+# CHECK: 1246: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1263: nop
+# CHECK: 1266: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1284: nop
+# CHECK: 1286: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 12a5: nop
+# CHECK: 12a6: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 12c6: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 12e7: nop
+# CHECK: 12f6: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1308: nop
+# CHECK: 1316: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1329: nop
+# CHECK: 1336: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 134a: nop
+# CHECK: 1356: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 136b: nop
+# CHECK: 1376: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 138c: nop
+# CHECK: 1396: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 13ad: nop
+# CHECK: 13b6: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 13ce: nop
+# CHECK: 13d6: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 13ef: nop
+# CHECK: 13f6: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1400: nop
+# CHECK: 1405: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1421: nop
+# CHECK: 1425: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1442: nop
+# CHECK: 1445: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1463: nop
+# CHECK: 1465: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1484: nop
+# CHECK: 1485: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 14a5: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 14c6: nop
+# CHECK: 14d5: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 14e7: nop
+# CHECK: 14f5: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1508: nop
+# CHECK: 1515: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1529: nop
+# CHECK: 1535: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 154a: nop
+# CHECK: 1555: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 156b: nop
+# CHECK: 1575: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 158c: nop
+# CHECK: 1595: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 15ad: nop
+# CHECK: 15b5: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 15ce: nop
+# CHECK: 15d5: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 15ef: nop
+# CHECK: 15f5: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1600: nop
+# CHECK: 1604: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1621: nop
+# CHECK: 1624: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1642: nop
+# CHECK: 1644: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1663: nop
+# CHECK: 1664: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1684: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 16a5: nop
+# CHECK: 16b4: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 16c6: nop
+# CHECK: 16d4: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 16e7: nop
+# CHECK: 16f4: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1708: nop
+# CHECK: 1714: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1729: nop
+# CHECK: 1734: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 174a: nop
+# CHECK: 1754: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 176b: nop
+# CHECK: 1774: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 178c: nop
+# CHECK: 1794: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 17ad: nop
+# CHECK: 17b4: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 17ce: nop
+# CHECK: 17d4: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 17ef: nop
+# CHECK: 17f4: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1800: nop
+# CHECK: 1803: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1821: nop
+# CHECK: 1823: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1842: nop
+# CHECK: 1843: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1863: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1884: nop
+# CHECK: 1893: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 18a5: nop
+# CHECK: 18b3: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 18c6: nop
+# CHECK: 18d3: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 18e7: nop
+# CHECK: 18f3: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1908: nop
+# CHECK: 1913: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1929: nop
+# CHECK: 1933: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 194a: nop
+# CHECK: 1953: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 196b: nop
+# CHECK: 1973: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 198c: nop
+# CHECK: 1993: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 19ad: nop
+# CHECK: 19b3: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 19ce: nop
+# CHECK: 19d3: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 19ef: nop
+# CHECK: 19f3: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a00: nop
+# CHECK: 1a02: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a21: nop
+# CHECK: 1a22: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a42: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a63: nop
+# CHECK: 1a72: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a84: nop
+# CHECK: 1a92: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1aa5: nop
+# CHECK: 1ab2: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ac6: nop
+# CHECK: 1ad2: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ae7: nop
+# CHECK: 1af2: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b08: nop
+# CHECK: 1b12: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b29: nop
+# CHECK: 1b32: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b4a: nop
+# CHECK: 1b52: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b6b: nop
+# CHECK: 1b72: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b8c: nop
+# CHECK: 1b92: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1bad: nop
+# CHECK: 1bb2: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1bce: nop
+# CHECK: 1bd2: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1bef: nop
+# CHECK: 1bf2: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c00: nop
+# CHECK: 1c01: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c21: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c42: nop
+# CHECK: 1c51: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c63: nop
+# CHECK: 1c71: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c84: nop
+# CHECK: 1c91: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ca5: nop
+# CHECK: 1cb1: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1cc6: nop
+# CHECK: 1cd1: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ce7: nop
+# CHECK: 1cf1: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d08: nop
+# CHECK: 1d11: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d29: nop
+# CHECK: 1d31: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d4a: nop
+# CHECK: 1d51: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d6b: nop
+# CHECK: 1d71: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d8c: nop
+# CHECK: 1d91: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1dad: nop
+# CHECK: 1db1: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1dce: nop
+# CHECK: 1dd1: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1def: nop
+# CHECK: 1df1: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_0:
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e00: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e21: nop
+# CHECK: 1e30: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e42: nop
+# CHECK: 1e50: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e63: nop
+# CHECK: 1e70: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e84: nop
+# CHECK: 1e90: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ea5: nop
+# CHECK: 1eb0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ec6: nop
+# CHECK: 1ed0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ee7: nop
+# CHECK: 1ef0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f08: nop
+# CHECK: 1f10: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f29: nop
+# CHECK: 1f30: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f4a: nop
+# CHECK: 1f50: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f6b: nop
+# CHECK: 1f70: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f8c: nop
+# CHECK: 1f90: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1fad: nop
+# CHECK: 1fb0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1fce: nop
+# CHECK: 1fd0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock align_to_end
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1fef: nop
+# CHECK: 1ff0: incl
+
diff --git a/test/MC/X86/AlignedBundling/autogen-inst-offset-padding.s b/test/MC/X86/AlignedBundling/autogen-inst-offset-padding.s
new file mode 100644
index 0000000000..12786b34af
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/autogen-inst-offset-padding.s
@@ -0,0 +1,2674 @@
+# RUN: llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -triple i386 -disassemble -no-show-raw-insn - | FileCheck %s
+
+# !!! This test is auto-generated from utils/testgen/mc-bundling-x86-gen.py !!!
+#     It tests that bundle-aligned grouping works correctly in MC. Read the
+#     source of the script for more details.
+
+  .text
+  .bundle_align_mode 4
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_0:
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 0: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 21: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 42: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 63: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 84: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a5: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c6: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e7: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 108: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 129: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 14a: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 16b: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 18c: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ad: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ce: incl
+
+  .align 32, 0x90
+INSTRLEN_1_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 1
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ef: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_0:
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 200: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 221: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 242: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 263: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 284: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 2a5: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 2c6: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 2e7: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 308: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 329: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 34a: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 36b: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 38c: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 3ad: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 3ce: incl
+
+  .align 32, 0x90
+INSTRLEN_2_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 2
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 3ef: nop
+# CHECK: 3f0: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_0:
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 400: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 421: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 442: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 463: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 484: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 4a5: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 4c6: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 4e7: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 508: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 529: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 54a: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 56b: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 58c: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 5ad: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 5ce: nop
+# CHECK: 5d0: incl
+
+  .align 32, 0x90
+INSTRLEN_3_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 3
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 5ef: nop
+# CHECK: 5f0: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_0:
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 600: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 621: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 642: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 663: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 684: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 6a5: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 6c6: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 6e7: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 708: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 729: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 74a: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 76b: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 78c: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 7ad: nop
+# CHECK: 7b0: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 7ce: nop
+# CHECK: 7d0: incl
+
+  .align 32, 0x90
+INSTRLEN_4_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 4
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 7ef: nop
+# CHECK: 7f0: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_0:
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 800: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 821: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 842: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 863: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 884: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 8a5: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 8c6: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 8e7: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 908: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 929: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 94a: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 96b: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 98c: nop
+# CHECK: 990: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 9ad: nop
+# CHECK: 9b0: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 9ce: nop
+# CHECK: 9d0: incl
+
+  .align 32, 0x90
+INSTRLEN_5_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 5
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 9ef: nop
+# CHECK: 9f0: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_0:
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a00: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a21: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a42: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a63: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: a84: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: aa5: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ac6: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ae7: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b08: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b29: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b4a: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b6b: nop
+# CHECK: b70: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: b8c: nop
+# CHECK: b90: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: bad: nop
+# CHECK: bb0: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: bce: nop
+# CHECK: bd0: incl
+
+  .align 32, 0x90
+INSTRLEN_6_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 6
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: bef: nop
+# CHECK: bf0: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_0:
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c00: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c21: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c42: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c63: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: c84: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ca5: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: cc6: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ce7: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d08: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d29: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d4a: nop
+# CHECK: d50: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d6b: nop
+# CHECK: d70: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: d8c: nop
+# CHECK: d90: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: dad: nop
+# CHECK: db0: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: dce: nop
+# CHECK: dd0: incl
+
+  .align 32, 0x90
+INSTRLEN_7_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 7
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: def: nop
+# CHECK: df0: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_0:
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e00: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e21: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e42: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e63: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: e84: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ea5: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ec6: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: ee7: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f08: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f29: nop
+# CHECK: f30: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f4a: nop
+# CHECK: f50: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f6b: nop
+# CHECK: f70: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: f8c: nop
+# CHECK: f90: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: fad: nop
+# CHECK: fb0: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: fce: nop
+# CHECK: fd0: incl
+
+  .align 32, 0x90
+INSTRLEN_8_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 8
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: fef: nop
+# CHECK: ff0: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_0:
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1000: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1021: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1042: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1063: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1084: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 10a5: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 10c6: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 10e7: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1108: nop
+# CHECK: 1110: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1129: nop
+# CHECK: 1130: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 114a: nop
+# CHECK: 1150: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 116b: nop
+# CHECK: 1170: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 118c: nop
+# CHECK: 1190: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 11ad: nop
+# CHECK: 11b0: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 11ce: nop
+# CHECK: 11d0: incl
+
+  .align 32, 0x90
+INSTRLEN_9_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 9
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 11ef: nop
+# CHECK: 11f0: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_0:
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1200: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1221: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1242: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1263: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1284: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 12a5: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 12c6: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 12e7: nop
+# CHECK: 12f0: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1308: nop
+# CHECK: 1310: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1329: nop
+# CHECK: 1330: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 134a: nop
+# CHECK: 1350: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 136b: nop
+# CHECK: 1370: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 138c: nop
+# CHECK: 1390: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 13ad: nop
+# CHECK: 13b0: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 13ce: nop
+# CHECK: 13d0: incl
+
+  .align 32, 0x90
+INSTRLEN_10_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 10
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 13ef: nop
+# CHECK: 13f0: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_0:
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1400: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1421: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1442: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1463: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1484: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 14a5: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 14c6: nop
+# CHECK: 14d0: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 14e7: nop
+# CHECK: 14f0: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1508: nop
+# CHECK: 1510: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1529: nop
+# CHECK: 1530: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 154a: nop
+# CHECK: 1550: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 156b: nop
+# CHECK: 1570: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 158c: nop
+# CHECK: 1590: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 15ad: nop
+# CHECK: 15b0: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 15ce: nop
+# CHECK: 15d0: incl
+
+  .align 32, 0x90
+INSTRLEN_11_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 11
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 15ef: nop
+# CHECK: 15f0: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_0:
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1600: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1621: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1642: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1663: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1684: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 16a5: nop
+# CHECK: 16b0: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 16c6: nop
+# CHECK: 16d0: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 16e7: nop
+# CHECK: 16f0: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1708: nop
+# CHECK: 1710: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1729: nop
+# CHECK: 1730: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 174a: nop
+# CHECK: 1750: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 176b: nop
+# CHECK: 1770: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 178c: nop
+# CHECK: 1790: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 17ad: nop
+# CHECK: 17b0: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 17ce: nop
+# CHECK: 17d0: incl
+
+  .align 32, 0x90
+INSTRLEN_12_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 12
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 17ef: nop
+# CHECK: 17f0: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_0:
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1800: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1821: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1842: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1863: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1884: nop
+# CHECK: 1890: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 18a5: nop
+# CHECK: 18b0: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 18c6: nop
+# CHECK: 18d0: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 18e7: nop
+# CHECK: 18f0: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1908: nop
+# CHECK: 1910: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1929: nop
+# CHECK: 1930: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 194a: nop
+# CHECK: 1950: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 196b: nop
+# CHECK: 1970: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 198c: nop
+# CHECK: 1990: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 19ad: nop
+# CHECK: 19b0: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 19ce: nop
+# CHECK: 19d0: incl
+
+  .align 32, 0x90
+INSTRLEN_13_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 13
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 19ef: nop
+# CHECK: 19f0: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_0:
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a00: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a21: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a42: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a63: nop
+# CHECK: 1a70: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1a84: nop
+# CHECK: 1a90: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1aa5: nop
+# CHECK: 1ab0: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ac6: nop
+# CHECK: 1ad0: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ae7: nop
+# CHECK: 1af0: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b08: nop
+# CHECK: 1b10: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b29: nop
+# CHECK: 1b30: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b4a: nop
+# CHECK: 1b50: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b6b: nop
+# CHECK: 1b70: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1b8c: nop
+# CHECK: 1b90: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1bad: nop
+# CHECK: 1bb0: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1bce: nop
+# CHECK: 1bd0: incl
+
+  .align 32, 0x90
+INSTRLEN_14_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 14
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1bef: nop
+# CHECK: 1bf0: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_0:
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c00: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c21: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c42: nop
+# CHECK: 1c50: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c63: nop
+# CHECK: 1c70: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1c84: nop
+# CHECK: 1c90: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ca5: nop
+# CHECK: 1cb0: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1cc6: nop
+# CHECK: 1cd0: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ce7: nop
+# CHECK: 1cf0: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d08: nop
+# CHECK: 1d10: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d29: nop
+# CHECK: 1d30: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d4a: nop
+# CHECK: 1d50: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d6b: nop
+# CHECK: 1d70: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1d8c: nop
+# CHECK: 1d90: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1dad: nop
+# CHECK: 1db0: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1dce: nop
+# CHECK: 1dd0: incl
+
+  .align 32, 0x90
+INSTRLEN_15_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 15
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1def: nop
+# CHECK: 1df0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_0:
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e00: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_1:
+  .fill 1, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e21: nop
+# CHECK: 1e30: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_2:
+  .fill 2, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e42: nop
+# CHECK: 1e50: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_3:
+  .fill 3, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e63: nop
+# CHECK: 1e70: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_4:
+  .fill 4, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1e84: nop
+# CHECK: 1e90: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_5:
+  .fill 5, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ea5: nop
+# CHECK: 1eb0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_6:
+  .fill 6, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ec6: nop
+# CHECK: 1ed0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_7:
+  .fill 7, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1ee7: nop
+# CHECK: 1ef0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_8:
+  .fill 8, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f08: nop
+# CHECK: 1f10: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_9:
+  .fill 9, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f29: nop
+# CHECK: 1f30: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_10:
+  .fill 10, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f4a: nop
+# CHECK: 1f50: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_11:
+  .fill 11, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f6b: nop
+# CHECK: 1f70: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_12:
+  .fill 12, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1f8c: nop
+# CHECK: 1f90: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_13:
+  .fill 13, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1fad: nop
+# CHECK: 1fb0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_14:
+  .fill 14, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1fce: nop
+# CHECK: 1fd0: incl
+
+  .align 32, 0x90
+INSTRLEN_16_OFFSET_15:
+  .fill 15, 1, 0x90
+  .bundle_lock
+  .rept 16
+  inc %eax
+  .endr
+  .bundle_unlock
+# CHECK: 1fef: nop
+# CHECK: 1ff0: incl
+
diff --git a/test/MC/X86/AlignedBundling/bundle-group-too-large-error.s b/test/MC/X86/AlignedBundling/bundle-group-too-large-error.s
new file mode 100644
index 0000000000..722bf7b922
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/bundle-group-too-large-error.s
@@ -0,0 +1,17 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+# CHECK: ERROR: Fragment can't be larger than a bundle size
+
+  .text
+foo:
+  .bundle_align_mode 4
+  pushq   %rbp
+
+  .bundle_lock
+  pushq   %r14
+  callq   bar
+  callq   bar
+  callq   bar
+  callq   bar
+  .bundle_unlock
+
diff --git a/test/MC/X86/AlignedBundling/bundle-lock-option-error.s b/test/MC/X86/AlignedBundling/bundle-lock-option-error.s
new file mode 100644
index 0000000000..82c5d7cf0e
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/bundle-lock-option-error.s
@@ -0,0 +1,11 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+# Invalid option passed to .bundle_lock
+# CHECK: error: invalid option
+
+  .bundle_align_mode 4
+  .bundle_lock 5
+  imull $17, %ebx, %ebp
+  .bundle_unlock
+
+
diff --git a/test/MC/X86/AlignedBundling/different-sections.s b/test/MC/X86/AlignedBundling/different-sections.s
new file mode 100644
index 0000000000..3e9fcf376d
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/different-sections.s
@@ -0,0 +1,25 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
+
+# Test two different executable sections with bundling.
+
+  .bundle_align_mode 3
+  .section text1, "x"
+# CHECK: section text1
+  imull $17, %ebx, %ebp
+  imull $17, %ebx, %ebp
+
+  imull $17, %ebx, %ebp
+# CHECK:      6: nop
+# CHECK-NEXT: 8: imull
+
+  .section text2, "x"
+# CHECK: section text2
+  imull $17, %ebx, %ebp
+  imull $17, %ebx, %ebp
+
+  imull $17, %ebx, %ebp
+# CHECK:      6: nop
+# CHECK-NEXT: 8: imull
+
+
diff --git a/test/MC/X86/AlignedBundling/lit.local.cfg b/test/MC/X86/AlignedBundling/lit.local.cfg
new file mode 100644
index 0000000000..6c49f08b74
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.s']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+    config.unsupported = True
+
diff --git a/test/MC/X86/AlignedBundling/lock-without-bundle-mode-error.s b/test/MC/X86/AlignedBundling/lock-without-bundle-mode-error.s
new file mode 100644
index 0000000000..d45a9b4a5d
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/lock-without-bundle-mode-error.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+# .bundle_lock can't come without a .bundle_align_mode before it
+
+# CHECK: ERROR: .bundle_lock forbidden when bundling is disabled
+
+  imull $17, %ebx, %ebp
+  .bundle_lock
+
+
diff --git a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
new file mode 100644
index 0000000000..3d58d7c14e
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
@@ -0,0 +1,33 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
+
+# Test some variations of padding to the end of a bundle.
+
+  .text
+foo:
+  .bundle_align_mode 4
+
+# Each of these callq instructions is 5 bytes long
+  callq   bar
+  callq   bar
+  .bundle_lock align_to_end
+  callq   bar
+  .bundle_unlock
+# To align this group to a bundle end, we need a 1-byte NOP.
+# CHECK:        a:  nop
+# CHECK-NEXT:   b: callq
+
+  callq   bar
+  callq   bar
+  .bundle_lock align_to_end
+  callq   bar
+  callq   bar
+  .bundle_unlock
+# Here we have to pad until the end of the *next* boundary because
+# otherwise the group crosses a boundary.
+# CHECK:      1a: nop
+# The nop sequence may be implemented as one instruction or many, but if
+# it's one instruction, that instruction cannot itself cross the boundary.
+# CHECK:      20: nop
+# CHECK-NEXT: 26: callq
+# CHECK-NEXT: 2b: callq
diff --git a/test/MC/X86/AlignedBundling/pad-bundle-groups.s b/test/MC/X86/AlignedBundling/pad-bundle-groups.s
new file mode 100644
index 0000000000..b65ee7a5cc
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/pad-bundle-groups.s
@@ -0,0 +1,46 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
+
+# Test some variations of padding for bundle-locked groups.
+
+  .text
+foo:
+  .bundle_align_mode 4
+
+# Each of these callq instructions is 5 bytes long
+  callq   bar
+  callq   bar
+
+  .bundle_lock
+  callq   bar
+  callq   bar
+  .bundle_unlock
+# We'll need a 6-byte NOP before this group
+# CHECK:        a:  nop
+# CHECK-NEXT:   10: callq
+# CHECK-NEXT:   15: callq
+
+  .bundle_lock
+  callq   bar
+  callq   bar
+  .bundle_unlock
+# Same here
+# CHECK:        1a:  nop
+# CHECK-NEXT:   20: callq
+# CHECK-NEXT:   25: callq
+
+  .align 16, 0x90
+  callq   bar
+  .bundle_lock
+  callq   bar
+  callq   bar
+  callq   bar
+  .bundle_unlock
+# And here we'll need an 11-byte NOP
+# CHECK:        30: callq
+# CHECK:        35: nop
+# CHECK-NEXT:   40: callq
+# CHECK-NEXT:   45: callq
+
+
+
diff --git a/test/MC/X86/AlignedBundling/relax-in-bundle-group.s b/test/MC/X86/AlignedBundling/relax-in-bundle-group.s
new file mode 100644
index 0000000000..0a99bb5ce5
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/relax-in-bundle-group.s
@@ -0,0 +1,42 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -disassemble - | FileCheck %s
+
+# Test that instructions inside bundle-locked groups are relaxed even if their
+# fixup is short enough not to warrant relaxation on its own.
+
+  .text
+foo:
+  .bundle_align_mode 4
+  pushq   %rbp
+
+  movl    %edi, %ebx
+  callq   bar
+  movl    %eax, %r14d
+  imull   $17, %ebx, %ebp
+  movl    %ebx, %edi
+  callq   bar
+  cmpl    %r14d, %ebp
+  .bundle_lock
+
+  jle     .L_ELSE
+# This group would've started at 0x18 and is too long, so a chunky NOP padding
+# is inserted to push it to 0x20.
+# CHECK: 18: {{[a-f0-9 ]+}} nopl
+
+# The long encoding for JLE should be used here even though its target is close
+# CHECK-NEXT: 20: 0f 8e
+
+  addl    %ebp, %eax
+
+  jmp     .L_RET
+# Same for the JMP
+# CHECK: 28: e9
+
+  .bundle_unlock
+
+.L_ELSE:
+  imull   %ebx, %eax
+.L_RET:
+
+  popq    %rbx
+
diff --git a/test/MC/X86/AlignedBundling/single-inst-bundling.s b/test/MC/X86/AlignedBundling/single-inst-bundling.s
new file mode 100644
index 0000000000..c0275f4d1e
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/single-inst-bundling.s
@@ -0,0 +1,47 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
+
+# Test simple NOP insertion for single instructions.
+
+  .text
+foo:
+  # Will be bundle-aligning to 16 byte boundaries
+  .bundle_align_mode 4
+  pushq   %rbp
+  pushq   %r14
+  pushq   %rbx
+
+  movl    %edi, %ebx
+  callq   bar
+  movl    %eax, %r14d
+
+  imull   $17, %ebx, %ebp
+# This imull is 3 bytes long and should have started at 0xe, so two bytes
+# of nop padding are inserted instead and it starts at 0x10
+# CHECK:          nop
+# CHECK-NEXT:     10: imull
+
+  movl    %ebx, %edi
+  callq   bar
+  cmpl    %r14d, %ebp
+  jle     .L_ELSE
+# Due to the padding that's inserted before the addl, the jump target
+# becomes farther by one byte.
+# CHECK:         jle 5
+
+  addl    %ebp, %eax
+# CHECK:          nop
+# CHECK-NEXT:     20: addl
+
+  jmp     .L_RET
+.L_ELSE:
+  imull   %ebx, %eax
+.L_RET:
+  ret
+
+# Just sanity checking that data fills don't drive bundling crazy
+  .data
+  .byte 40
+  .byte 98
+
+
diff --git a/test/MC/X86/AlignedBundling/switch-section-locked-error.s b/test/MC/X86/AlignedBundling/switch-section-locked-error.s
new file mode 100644
index 0000000000..af41e19212
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/switch-section-locked-error.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+# This test invokes .bundle_lock and then switches to a different section
+# w/o the appropriate unlock.
+
+# CHECK: ERROR: Unterminated .bundle_lock
+
+  .bundle_align_mode 3
+  .section text1, "x"
+  imull $17, %ebx, %ebp
+  .bundle_lock
+  imull $17, %ebx, %ebp
+
+  .section text2, "x"
+  imull $17, %ebx, %ebp
+
diff --git a/test/MC/X86/AlignedBundling/unlock-without-lock-error.s b/test/MC/X86/AlignedBundling/unlock-without-lock-error.s
new file mode 100644
index 0000000000..699511d4e6
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/unlock-without-lock-error.s
@@ -0,0 +1,11 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+# .bundle_unlock can't come without a .bundle_lock before it
+
+# CHECK: ERROR: .bundle_unlock without matching lock
+
+  .bundle_align_mode 3
+  imull $17, %ebx, %ebp
+  .bundle_unlock
+
+
diff --git a/test/MC/X86/lit.local.cfg b/test/MC/X86/lit.local.cfg
index eee568e8fd..ad280c7cf7 100644
--- a/test/MC/X86/lit.local.cfg
+++ b/test/MC/X86/lit.local.cfg
@@ -1,12 +1,5 @@
 config.suffixes = ['.ll', '.c', '.cpp', '.s']
 
-def getRoot(config):
-    if not config.parent:
-        return config
-    return getRoot(config.parent)
-
-root = getRoot(config)
-
-targets = set(root.targets_to_build.split())
+targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/X86/x86-32-avx.s b/test/MC/X86/x86-32-avx.s
index 586f3fe73c..ec4abdbb2a 100644
--- a/test/MC/X86/x86-32-avx.s
+++ b/test/MC/X86/x86-32-avx.s
@@ -655,14 +655,22 @@
 // CHECK: encoding: [0xc5,0xfa,0x2c,0x01]
           vcvttss2si  (%ecx), %eax
 
-// CHECK: vcvtsi2ss  (%eax), %xmm1, %xmm2
+// CHECK: vcvtsi2ssl  (%eax), %xmm1, %xmm2
 // CHECK: encoding: [0xc5,0xf2,0x2a,0x10]
           vcvtsi2ss  (%eax), %xmm1, %xmm2
 
-// CHECK: vcvtsi2ss  (%eax), %xmm1, %xmm2
+// CHECK: vcvtsi2ssl  (%eax), %xmm1, %xmm2
 // CHECK: encoding: [0xc5,0xf2,0x2a,0x10]
           vcvtsi2ss  (%eax), %xmm1, %xmm2
 
+// CHECK: vcvtsi2ssl  (%eax), %xmm1, %xmm2
+// CHECK: encoding: [0xc5,0xf2,0x2a,0x10]
+          vcvtsi2ssl  (%eax), %xmm1, %xmm2
+
+// CHECK: vcvtsi2ssl  (%eax), %xmm1, %xmm2
+// CHECK: encoding: [0xc5,0xf2,0x2a,0x10]
+          vcvtsi2ssl  (%eax), %xmm1, %xmm2
+
 // CHECK: vcvttsd2si  %xmm1, %eax
 // CHECK: encoding: [0xc5,0xfb,0x2c,0xc1]
           vcvttsd2si  %xmm1, %eax
@@ -671,14 +679,22 @@
 // CHECK: encoding: [0xc5,0xfb,0x2c,0x01]
           vcvttsd2si  (%ecx), %eax
 
-// CHECK: vcvtsi2sd  (%eax), %xmm1, %xmm2
+// CHECK: vcvtsi2sdl  (%eax), %xmm1, %xmm2
 // CHECK: encoding: [0xc5,0xf3,0x2a,0x10]
           vcvtsi2sd  (%eax), %xmm1, %xmm2
 
-// CHECK: vcvtsi2sd  (%eax), %xmm1, %xmm2
+// CHECK: vcvtsi2sdl  (%eax), %xmm1, %xmm2
 // CHECK: encoding: [0xc5,0xf3,0x2a,0x10]
           vcvtsi2sd  (%eax), %xmm1, %xmm2
 
+// CHECK: vcvtsi2sdl  (%eax), %xmm1, %xmm2
+// CHECK: encoding: [0xc5,0xf3,0x2a,0x10]
+          vcvtsi2sdl  (%eax), %xmm1, %xmm2
+
+// CHECK: vcvtsi2sdl  (%eax), %xmm1, %xmm2
+// CHECK: encoding: [0xc5,0xf3,0x2a,0x10]
+          vcvtsi2sdl  (%eax), %xmm1, %xmm2
+
 // CHECK: vmovaps  (%eax), %xmm2
 // CHECK: encoding: [0xc5,0xf8,0x28,0x10]
           vmovaps  (%eax), %xmm2
@@ -767,14 +783,22 @@
 // CHECK: encoding: [0xc5,0xe8,0x12,0xd9]
           vmovhlps  %xmm1, %xmm2, %xmm3
 
-// CHECK: vcvtss2sil  %xmm1, %eax
+// CHECK: vcvtss2si  %xmm1, %eax
 // CHECK: encoding: [0xc5,0xfa,0x2d,0xc1]
           vcvtss2si  %xmm1, %eax
 
-// CHECK: vcvtss2sil  (%eax), %ebx
+// CHECK: vcvtss2si  (%eax), %ebx
 // CHECK: encoding: [0xc5,0xfa,0x2d,0x18]
           vcvtss2si  (%eax), %ebx
 
+// CHECK: vcvtss2si  %xmm1, %eax
+// CHECK: encoding: [0xc5,0xfa,0x2d,0xc1]
+          vcvtss2sil  %xmm1, %eax
+
+// CHECK: vcvtss2si  (%eax), %ebx
+// CHECK: encoding: [0xc5,0xfa,0x2d,0x18]
+          vcvtss2sil  (%eax), %ebx
+
 // CHECK: vcvtdq2ps  %xmm5, %xmm6
 // CHECK: encoding: [0xc5,0xf8,0x5b,0xf5]
           vcvtdq2ps  %xmm5, %xmm6
@@ -3103,19 +3127,35 @@
 // CHECK: encoding: [0xc5,0xf8,0x77]
           vzeroupper
 
-// CHECK: vcvtsd2sil  %xmm4, %ecx
+// CHECK: vcvtsd2si  %xmm4, %ecx
 // CHECK: encoding: [0xc5,0xfb,0x2d,0xcc]
           vcvtsd2sil  %xmm4, %ecx
 
-// CHECK: vcvtsd2sil  (%ecx), %ecx
+// CHECK: vcvtsd2si  (%ecx), %ecx
 // CHECK: encoding: [0xc5,0xfb,0x2d,0x09]
           vcvtsd2sil  (%ecx), %ecx
 
-// CHECK: vcvtsi2sd  (%ebp), %xmm0, %xmm7
+// CHECK: vcvtsd2si  %xmm4, %ecx
+// CHECK: encoding: [0xc5,0xfb,0x2d,0xcc]
+          vcvtsd2si  %xmm4, %ecx
+
+// CHECK: vcvtsd2si  (%ecx), %ecx
+// CHECK: encoding: [0xc5,0xfb,0x2d,0x09]
+          vcvtsd2si  (%ecx), %ecx
+
+// CHECK: vcvtsi2sdl  (%ebp), %xmm0, %xmm7
+// CHECK: encoding: [0xc5,0xfb,0x2a,0x7d,0x00]
+          vcvtsi2sdl  (%ebp), %xmm0, %xmm7
+
+// CHECK: vcvtsi2sdl  (%esp), %xmm0, %xmm7
+// CHECK: encoding: [0xc5,0xfb,0x2a,0x3c,0x24]
+          vcvtsi2sdl  (%esp), %xmm0, %xmm7
+
+// CHECK: vcvtsi2sdl  (%ebp), %xmm0, %xmm7
 // CHECK: encoding: [0xc5,0xfb,0x2a,0x7d,0x00]
           vcvtsi2sd  (%ebp), %xmm0, %xmm7
 
-// CHECK: vcvtsi2sd  (%esp), %xmm0, %xmm7
+// CHECK: vcvtsi2sdl  (%esp), %xmm0, %xmm7
 // CHECK: encoding: [0xc5,0xfb,0x2a,0x3c,0x24]
           vcvtsi2sd  (%esp), %xmm0, %xmm7
 
diff --git a/test/MC/X86/x86-32-coverage.s b/test/MC/X86/x86-32-coverage.s
index 0824916519..c348915d23 100644
--- a/test/MC/X86/x86-32-coverage.s
+++ b/test/MC/X86/x86-32-coverage.s
@@ -896,11 +896,11 @@
 // CHECK: 	cvtps2pi	%xmm5, %mm3
         	cvtps2pi	%xmm5,%mm3
 
-// CHECK: 	cvtsi2ss	%ecx, %xmm5
-        	cvtsi2ss	%ecx,%xmm5
+// CHECK: 	cvtsi2ssl	%ecx, %xmm5
+        	cvtsi2ssl	%ecx,%xmm5
 
-// CHECK: 	cvtsi2ss	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsi2ss	0xdeadbeef(%ebx,%ecx,8),%xmm5
+// CHECK: 	cvtsi2ssl	3735928559(%ebx,%ecx,8), %xmm5
+        	cvtsi2ssl	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
 // CHECK: 	cvttps2pi	3735928559(%ebx,%ecx,8), %mm3
         	cvttps2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
@@ -1157,11 +1157,11 @@
 // CHECK: 	cvtpi2pd	%mm3, %xmm5
         	cvtpi2pd	%mm3,%xmm5
 
-// CHECK: 	cvtsi2sd	%ecx, %xmm5
-        	cvtsi2sd	%ecx,%xmm5
+// CHECK: 	cvtsi2sdl	%ecx, %xmm5
+        	cvtsi2sdl	%ecx,%xmm5
 
-// CHECK: 	cvtsi2sd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsi2sd	0xdeadbeef(%ebx,%ecx,8),%xmm5
+// CHECK: 	cvtsi2sdl	3735928559(%ebx,%ecx,8), %xmm5
+        	cvtsi2sdl	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
 // CHECK: 	divpd	%xmm5, %xmm5
         	divpd	%xmm5,%xmm5
@@ -3948,6 +3948,10 @@
 // CHECK:  encoding: [0xd9,0xca]
         	fxch	%st(2)
 
+// CHECK: fcom
+// CHECK:  encoding: [0xd8,0xd1]
+        	fcom
+
 // CHECK: fcom	%st(2)
 // CHECK:  encoding: [0xd8,0xd2]
         	fcom	%st(2)
@@ -3968,6 +3972,10 @@
 // CHECK:  encoding: [0xda,0x15,0x78,0x56,0x34,0x12]
         	ficoml	0x12345678
 
+// CHECK: fcomp
+// CHECK:  encoding: [0xd8,0xd9]
+        	fcomp
+
 // CHECK: fcomp	%st(2)
 // CHECK:  encoding: [0xd8,0xda]
         	fcomp	%st(2)
@@ -7144,29 +7152,29 @@
 // CHECK:  encoding: [0x0f,0x2d,0xdd]
         	cvtps2pi	%xmm5,%mm3
 
-// CHECK: cvtsi2ss	%ecx, %xmm5
+// CHECK: cvtsi2ssl	%ecx, %xmm5
 // CHECK:  encoding: [0xf3,0x0f,0x2a,0xe9]
-        	cvtsi2ss	%ecx,%xmm5
+        	cvtsi2ssl	%ecx,%xmm5
 
-// CHECK: cvtsi2ss	3735928559(%ebx,%ecx,8), %xmm5
+// CHECK: cvtsi2ssl	3735928559(%ebx,%ecx,8), %xmm5
 // CHECK:  encoding: [0xf3,0x0f,0x2a,0xac,0xcb,0xef,0xbe,0xad,0xde]
-        	cvtsi2ss	0xdeadbeef(%ebx,%ecx,8),%xmm5
+        	cvtsi2ssl	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: cvtsi2ss	69, %xmm5
+// CHECK: cvtsi2ssl	69, %xmm5
 // CHECK:  encoding: [0xf3,0x0f,0x2a,0x2d,0x45,0x00,0x00,0x00]
-        	cvtsi2ss	0x45,%xmm5
+        	cvtsi2ssl	0x45,%xmm5
 
-// CHECK: cvtsi2ss	32493, %xmm5
+// CHECK: cvtsi2ssl	32493, %xmm5
 // CHECK:  encoding: [0xf3,0x0f,0x2a,0x2d,0xed,0x7e,0x00,0x00]
-        	cvtsi2ss	0x7eed,%xmm5
+        	cvtsi2ssl	0x7eed,%xmm5
 
-// CHECK: cvtsi2ss	3133065982, %xmm5
+// CHECK: cvtsi2ssl	3133065982, %xmm5
 // CHECK:  encoding: [0xf3,0x0f,0x2a,0x2d,0xfe,0xca,0xbe,0xba]
-        	cvtsi2ss	0xbabecafe,%xmm5
+        	cvtsi2ssl	0xbabecafe,%xmm5
 
-// CHECK: cvtsi2ss	305419896, %xmm5
+// CHECK: cvtsi2ssl	305419896, %xmm5
 // CHECK:  encoding: [0xf3,0x0f,0x2a,0x2d,0x78,0x56,0x34,0x12]
-        	cvtsi2ss	0x12345678,%xmm5
+        	cvtsi2ssl	0x12345678,%xmm5
 
 // CHECK: cvttps2pi	3735928559(%ebx,%ecx,8), %mm3
 // CHECK:  encoding: [0x0f,0x2c,0x9c,0xcb,0xef,0xbe,0xad,0xde]
@@ -8652,29 +8660,29 @@
 // CHECK:  encoding: [0x66,0x0f,0x2a,0xeb]
         	cvtpi2pd	%mm3,%xmm5
 
-// CHECK: cvtsi2sd	%ecx, %xmm5
+// CHECK: cvtsi2sdl	%ecx, %xmm5
 // CHECK:  encoding: [0xf2,0x0f,0x2a,0xe9]
-        	cvtsi2sd	%ecx,%xmm5
+        	cvtsi2sdl	%ecx,%xmm5
 
-// CHECK: cvtsi2sd	3735928559(%ebx,%ecx,8), %xmm5
+// CHECK: cvtsi2sdl	3735928559(%ebx,%ecx,8), %xmm5
 // CHECK:  encoding: [0xf2,0x0f,0x2a,0xac,0xcb,0xef,0xbe,0xad,0xde]
-        	cvtsi2sd	0xdeadbeef(%ebx,%ecx,8),%xmm5
+        	cvtsi2sdl	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: cvtsi2sd	69, %xmm5
+// CHECK: cvtsi2sdl	69, %xmm5
 // CHECK:  encoding: [0xf2,0x0f,0x2a,0x2d,0x45,0x00,0x00,0x00]
-        	cvtsi2sd	0x45,%xmm5
+        	cvtsi2sdl	0x45,%xmm5
 
-// CHECK: cvtsi2sd	32493, %xmm5
+// CHECK: cvtsi2sdl	32493, %xmm5
 // CHECK:  encoding: [0xf2,0x0f,0x2a,0x2d,0xed,0x7e,0x00,0x00]
-        	cvtsi2sd	0x7eed,%xmm5
+        	cvtsi2sdl	0x7eed,%xmm5
 
-// CHECK: cvtsi2sd	3133065982, %xmm5
+// CHECK: cvtsi2sdl	3133065982, %xmm5
 // CHECK:  encoding: [0xf2,0x0f,0x2a,0x2d,0xfe,0xca,0xbe,0xba]
-        	cvtsi2sd	0xbabecafe,%xmm5
+        	cvtsi2sdl	0xbabecafe,%xmm5
 
-// CHECK: cvtsi2sd	305419896, %xmm5
+// CHECK: cvtsi2sdl	305419896, %xmm5
 // CHECK:  encoding: [0xf2,0x0f,0x2a,0x2d,0x78,0x56,0x34,0x12]
-        	cvtsi2sd	0x12345678,%xmm5
+        	cvtsi2sdl	0x12345678,%xmm5
 
 // CHECK: divpd	3735928559(%ebx,%ecx,8), %xmm5
 // CHECK:  encoding: [0x66,0x0f,0x5e,0xac,0xcb,0xef,0xbe,0xad,0xde]
@@ -16200,23 +16208,23 @@
 // CHECK: 	cvtps2pi	%xmm5, %mm3
         	cvtps2pi	%xmm5,%mm3
 
-// CHECK: 	cvtsi2ss	%ecx, %xmm5
-        	cvtsi2ss	%ecx,%xmm5
+// CHECK: 	cvtsi2ssl	%ecx, %xmm5
+        	cvtsi2ssl	%ecx,%xmm5
 
-// CHECK: 	cvtsi2ss	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsi2ss	0xdeadbeef(%ebx,%ecx,8),%xmm5
+// CHECK: 	cvtsi2ssl	3735928559(%ebx,%ecx,8), %xmm5
+        	cvtsi2ssl	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: 	cvtsi2ss	69, %xmm5
-        	cvtsi2ss	0x45,%xmm5
+// CHECK: 	cvtsi2ssl	69, %xmm5
+        	cvtsi2ssl	0x45,%xmm5
 
-// CHECK: 	cvtsi2ss	32493, %xmm5
-        	cvtsi2ss	0x7eed,%xmm5
+// CHECK: 	cvtsi2ssl	32493, %xmm5
+        	cvtsi2ssl	0x7eed,%xmm5
 
-// CHECK: 	cvtsi2ss	3133065982, %xmm5
-        	cvtsi2ss	0xbabecafe,%xmm5
+// CHECK: 	cvtsi2ssl	3133065982, %xmm5
+        	cvtsi2ssl	0xbabecafe,%xmm5
 
-// CHECK: 	cvtsi2ss	305419896, %xmm5
-        	cvtsi2ss	0x12345678,%xmm5
+// CHECK: 	cvtsi2ssl	305419896, %xmm5
+        	cvtsi2ssl	0x12345678,%xmm5
 
 // CHECK: 	cvttps2pi	3735928559(%ebx,%ecx,8), %mm3
         	cvttps2pi	0xdeadbeef(%ebx,%ecx,8),%mm3
@@ -17334,23 +17342,23 @@
 // CHECK: 	cvtpi2pd	%mm3, %xmm5
         	cvtpi2pd	%mm3,%xmm5
 
-// CHECK: 	cvtsi2sd	%ecx, %xmm5
-        	cvtsi2sd	%ecx,%xmm5
+// CHECK: 	cvtsi2sdl	%ecx, %xmm5
+        	cvtsi2sdl	%ecx,%xmm5
 
-// CHECK: 	cvtsi2sd	3735928559(%ebx,%ecx,8), %xmm5
-        	cvtsi2sd	0xdeadbeef(%ebx,%ecx,8),%xmm5
+// CHECK: 	cvtsi2sdl	3735928559(%ebx,%ecx,8), %xmm5
+        	cvtsi2sdl	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
-// CHECK: 	cvtsi2sd	69, %xmm5
-        	cvtsi2sd	0x45,%xmm5
+// CHECK: 	cvtsi2sdl	69, %xmm5
+        	cvtsi2sdl	0x45,%xmm5
 
-// CHECK: 	cvtsi2sd	32493, %xmm5
-        	cvtsi2sd	0x7eed,%xmm5
+// CHECK: 	cvtsi2sdl	32493, %xmm5
+        	cvtsi2sdl	0x7eed,%xmm5
 
-// CHECK: 	cvtsi2sd	3133065982, %xmm5
-        	cvtsi2sd	0xbabecafe,%xmm5
+// CHECK: 	cvtsi2sdl	3133065982, %xmm5
+        	cvtsi2sdl	0xbabecafe,%xmm5
 
-// CHECK: 	cvtsi2sd	305419896, %xmm5
-        	cvtsi2sd	0x12345678,%xmm5
+// CHECK: 	cvtsi2sdl	305419896, %xmm5
+        	cvtsi2sdl	0x12345678,%xmm5
 
 // CHECK: 	divpd	3735928559(%ebx,%ecx,8), %xmm5
         	divpd	0xdeadbeef(%ebx,%ecx,8),%xmm5
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 03cb62e7cb..c5f1d15f8f 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -507,15 +507,15 @@ fsave	32493
 
 // rdar://8456382 - cvtsd2si support.
 cvtsd2si	%xmm1, %rax
-// CHECK: cvtsd2siq	%xmm1, %rax
+// CHECK: cvtsd2si	%xmm1, %rax
 // CHECK: encoding: [0xf2,0x48,0x0f,0x2d,0xc1]
 cvtsd2si	%xmm1, %eax
-// CHECK: cvtsd2sil	%xmm1, %eax
+// CHECK: cvtsd2si	%xmm1, %eax
 // CHECK: encoding: [0xf2,0x0f,0x2d,0xc1]
 
-cvtsd2siq %xmm0, %rax // CHECK: cvtsd2siq	%xmm0, %rax
-cvtsd2sil %xmm0, %eax // CHECK: cvtsd2sil	%xmm0, %eax
-cvtsd2si %xmm0, %rax  // CHECK: cvtsd2siq	%xmm0, %rax
+cvtsd2siq %xmm0, %rax // CHECK: cvtsd2si	%xmm0, %rax
+cvtsd2sil %xmm0, %eax // CHECK: cvtsd2si	%xmm0, %eax
+cvtsd2si %xmm0, %rax  // CHECK: cvtsd2si	%xmm0, %rax
 
 
 cvttpd2dq %xmm1, %xmm0  // CHECK: cvttpd2dq %xmm1, %xmm0
diff --git a/test/MC/X86/x86_64-avx-encoding.s b/test/MC/X86/x86_64-avx-encoding.s
index 46ff9ead39..6da9e21fef 100644
--- a/test/MC/X86/x86_64-avx-encoding.s
+++ b/test/MC/X86/x86_64-avx-encoding.s
@@ -1404,25 +1404,25 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc5,0xfa,0x2c,0x01]
           vcvttss2si  (%rcx), %eax
 
-// CHECK: vcvtsi2ss  (%rax), %xmm11, %xmm12
+// CHECK: vcvtsi2ssl  (%rax), %xmm11, %xmm12
 // CHECK: encoding: [0xc5,0x22,0x2a,0x20]
-          vcvtsi2ss  (%rax), %xmm11, %xmm12
+          vcvtsi2ssl  (%rax), %xmm11, %xmm12
 
-// CHECK: vcvtsi2ss  (%rax), %xmm11, %xmm12
+// CHECK: vcvtsi2ssl  (%rax), %xmm11, %xmm12
 // CHECK: encoding: [0xc5,0x22,0x2a,0x20]
-          vcvtsi2ss  (%rax), %xmm11, %xmm12
+          vcvtsi2ssl  (%rax), %xmm11, %xmm12
 
 // CHECK: vcvttsd2si  (%rcx), %eax
 // CHECK: encoding: [0xc5,0xfb,0x2c,0x01]
           vcvttsd2si  (%rcx), %eax
 
-// CHECK: vcvtsi2sd  (%rax), %xmm11, %xmm12
+// CHECK: vcvtsi2sdl  (%rax), %xmm11, %xmm12
 // CHECK: encoding: [0xc5,0x23,0x2a,0x20]
-          vcvtsi2sd  (%rax), %xmm11, %xmm12
+          vcvtsi2sdl  (%rax), %xmm11, %xmm12
 
-// CHECK: vcvtsi2sd  (%rax), %xmm11, %xmm12
+// CHECK: vcvtsi2sdl  (%rax), %xmm11, %xmm12
 // CHECK: encoding: [0xc5,0x23,0x2a,0x20]
-          vcvtsi2sd  (%rax), %xmm11, %xmm12
+          vcvtsi2sdl  (%rax), %xmm11, %xmm12
 
 // CHECK: vmovaps  (%rax), %xmm12
 // CHECK: encoding: [0xc5,0x78,0x28,0x20]
@@ -1512,11 +1512,11 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc4,0x41,0x18,0x12,0xeb]
           vmovhlps  %xmm11, %xmm12, %xmm13
 
-// CHECK: vcvtss2sil  %xmm11, %eax
+// CHECK: vcvtss2si  %xmm11, %eax
 // CHECK: encoding: [0xc4,0xc1,0x7a,0x2d,0xc3]
           vcvtss2si  %xmm11, %eax
 
-// CHECK: vcvtss2sil  (%rax), %ebx
+// CHECK: vcvtss2si  (%rax), %ebx
 // CHECK: encoding: [0xc5,0xfa,0x2d,0x18]
           vcvtss2si  (%rax), %ebx
 
@@ -3860,29 +3860,29 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc4,0x63,0x2d,0x06,0x18,0x07]
           vperm2f128  $7, (%rax), %ymm10, %ymm11
 
-// CHECK: vcvtsd2sil  %xmm8, %r8d
+// CHECK: vcvtsd2si  %xmm8, %r8d
 // CHECK: encoding: [0xc4,0x41,0x7b,0x2d,0xc0]
-          vcvtsd2sil  %xmm8, %r8d
+          vcvtsd2si  %xmm8, %r8d
 
-// CHECK: vcvtsd2sil  (%rcx), %ecx
+// CHECK: vcvtsd2si  (%rcx), %ecx
 // CHECK: encoding: [0xc5,0xfb,0x2d,0x09]
-          vcvtsd2sil  (%rcx), %ecx
+          vcvtsd2si  (%rcx), %ecx
 
-// CHECK: vcvtss2siq  %xmm4, %rcx
+// CHECK: vcvtss2si  %xmm4, %rcx
 // CHECK: encoding: [0xc4,0xe1,0xfa,0x2d,0xcc]
-          vcvtss2siq  %xmm4, %rcx
+          vcvtss2si  %xmm4, %rcx
 
-// CHECK: vcvtss2siq  (%rcx), %r8
+// CHECK: vcvtss2si  (%rcx), %r8
 // CHECK: encoding: [0xc4,0x61,0xfa,0x2d,0x01]
-          vcvtss2siq  (%rcx), %r8
+          vcvtss2si  (%rcx), %r8
 
-// CHECK: vcvtsi2sd  %r8d, %xmm8, %xmm15
+// CHECK: vcvtsi2sdl  %r8d, %xmm8, %xmm15
 // CHECK: encoding: [0xc4,0x41,0x3b,0x2a,0xf8]
-          vcvtsi2sd  %r8d, %xmm8, %xmm15
+          vcvtsi2sdl  %r8d, %xmm8, %xmm15
 
-// CHECK: vcvtsi2sd  (%rbp), %xmm8, %xmm15
+// CHECK: vcvtsi2sdl  (%rbp), %xmm8, %xmm15
 // CHECK: encoding: [0xc5,0x3b,0x2a,0x7d,0x00]
-          vcvtsi2sd  (%rbp), %xmm8, %xmm15
+          vcvtsi2sdl  (%rbp), %xmm8, %xmm15
 
 // CHECK: vcvtsi2sdq  %rcx, %xmm4, %xmm6
 // CHECK: encoding: [0xc4,0xe1,0xdb,0x2a,0xf1]
@@ -3900,21 +3900,21 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc4,0xe1,0xda,0x2a,0x31]
           vcvtsi2ssq  (%rcx), %xmm4, %xmm6
 
-// CHECK: vcvttsd2siq  %xmm4, %rcx
+// CHECK: vcvttsd2si  %xmm4, %rcx
 // CHECK: encoding: [0xc4,0xe1,0xfb,0x2c,0xcc]
-          vcvttsd2siq  %xmm4, %rcx
+          vcvttsd2si  %xmm4, %rcx
 
-// CHECK: vcvttsd2siq  (%rcx), %rcx
+// CHECK: vcvttsd2si  (%rcx), %rcx
 // CHECK: encoding: [0xc4,0xe1,0xfb,0x2c,0x09]
-          vcvttsd2siq  (%rcx), %rcx
+          vcvttsd2si  (%rcx), %rcx
 
-// CHECK: vcvttss2siq  %xmm4, %rcx
+// CHECK: vcvttss2si  %xmm4, %rcx
 // CHECK: encoding: [0xc4,0xe1,0xfa,0x2c,0xcc]
-          vcvttss2siq  %xmm4, %rcx
+          vcvttss2si  %xmm4, %rcx
 
-// CHECK: vcvttss2siq  (%rcx), %rcx
+// CHECK: vcvttss2si  (%rcx), %rcx
 // CHECK: encoding: [0xc4,0xe1,0xfa,0x2c,0x09]
-          vcvttss2siq  (%rcx), %rcx
+          vcvttss2si  (%rcx), %rcx
 
 // CHECK: vlddqu  (%rax), %ymm12
 // CHECK: encoding: [0xc5,0x7f,0xf0,0x20]
diff --git a/test/NaCl/PNaClABI/instructions.ll b/test/NaCl/PNaClABI/instructions.ll
index 0ec6cc7766..c9a21fddbe 100644
--- a/test/NaCl/PNaClABI/instructions.ll
+++ b/test/NaCl/PNaClABI/instructions.ll
@@ -93,7 +93,7 @@ define void @conversion() {
 define void @other() {
 entry:
   %a1 = icmp eq i32 undef, undef
-  %a2 = fcmp eq float undef, undef
+  %a2 = fcmp oeq float undef, undef
   br i1 undef, label %foo, label %bar
 foo:
 ; phi predecessor labels have to match to appease module verifier
diff --git a/test/Object/Inputs/macho-text-sections.macho-x86_64 b/test/Object/Inputs/macho-text-sections.macho-x86_64
new file mode 100644
index 0000000000..cce203ba0d
--- /dev/null
+++ b/test/Object/Inputs/macho-text-sections.macho-x86_64
diff --git a/test/Object/Inputs/program-headers.elf-i386 b/test/Object/Inputs/program-headers.elf-i386
new file mode 100644
index 0000000000..eb92c71cee
--- /dev/null
+++ b/test/Object/Inputs/program-headers.elf-i386
diff --git a/test/Object/Inputs/program-headers.elf-x86-64 b/test/Object/Inputs/program-headers.elf-x86-64
new file mode 100644
index 0000000000..037bf14866
--- /dev/null
+++ b/test/Object/Inputs/program-headers.elf-x86-64
diff --git a/test/Object/X86/macho-text-sections.test b/test/Object/X86/macho-text-sections.test
new file mode 100644
index 0000000000..1b697dcada
--- /dev/null
+++ b/test/Object/X86/macho-text-sections.test
@@ -0,0 +1,3 @@
+RUN: llvm-objdump -disassemble %p/../Inputs/macho-text-sections.macho-x86_64 | FileCheck %s
+
+CHECK: Disassembly of section __notext,__notext
diff --git a/test/Object/archive-long-index.test b/test/Object/archive-long-index.test
index d0fb19cd8d..bd530edbf4 100644
--- a/test/Object/archive-long-index.test
+++ b/test/Object/archive-long-index.test
@@ -1,5 +1,5 @@
 #
-# Check if the index is appearing properly in the output file 
+# Check if the index is appearing properly in the output file
 #
 RUN: llvm-nm -s %p/Inputs/liblong_filenames.a | FileCheck -check-prefix=CHECKIDX %s
 
diff --git a/test/Object/objdump-private-headers.test b/test/Object/objdump-private-headers.test
new file mode 100644
index 0000000000..c562044b3c
--- /dev/null
+++ b/test/Object/objdump-private-headers.test
@@ -0,0 +1,18 @@
+RUN: llvm-objdump -p %p/Inputs/program-headers.elf-i386 \
+RUN:              | FileCheck %s -check-prefix ELF-i386
+RUN: llvm-objdump -p %p/Inputs/program-headers.elf-x86-64 \
+RUN:              | FileCheck %s -check-prefix ELF-x86-64
+
+ELF-i386: Program Header:
+ELF-i386:     LOAD off    0x00000000 vaddr 0x08048000 paddr 0x08048000 align 2**12
+ELF-i386:          filesz 0x00000134 memsz 0x00000134 flags r-x
+ELF-i386:    STACK off    0x00000000 vaddr 0x00000000 paddr 0x00000000 align 2**2
+ELF-i386:          filesz 0x00000000 memsz 0x00000000 flags rw-
+
+ELF-x86-64: Program Header:
+ELF-x86-64:     LOAD off    0x0000000000000000 vaddr 0x0000000000400000 paddr 0x0000000000400000 align 2**21
+ELF-x86-64:          filesz 0x0000000000000138 memsz 0x0000000000000138 flags r-x
+ELF-x86-64: EH_FRAME off    0x00000000000000f4 vaddr 0x00000000004000f4 paddr 0x00000000004000f4 align 2**2
+ELF-x86-64:          filesz 0x0000000000000014 memsz 0x0000000000000014 flags r--
+ELF-x86-64:    STACK off    0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**3
+ELF-x86-64:          filesz 0x0000000000000000 memsz 0x0000000000000000 flags rw-
diff --git a/test/Object/objdump-sectionheaders.test b/test/Object/objdump-sectionheaders.test
index a417d07a81..bc2478cea2 100644
--- a/test/Object/objdump-sectionheaders.test
+++ b/test/Object/objdump-sectionheaders.test
@@ -6,11 +6,11 @@
 
 ; CHECK: Sections:
 ; CHECK: Idx Name          Size      Address          Type
-; CHECK:   0               000000000 00000000000000000 
-; CHECK:   1 .text         000000026 00000000000000000 TEXT DATA 
-; CHECK:   2 .rodata.str1.1 00000000d 00000000000000026 DATA 
-; CHECK:   3 .note.GNU-stack 000000000 00000000000000033 
-; CHECK:   4 .rela.text    000000048 00000000000000038 
-; CHECK:   5 .symtab       0000000c0 00000000000000080 
-; CHECK:   6 .strtab       000000033 00000000000000140 
-; CHECK:   7 .shstrtab     00000004b 00000000000000173 
+; CHECK:   0               00000000 0000000000000000
+; CHECK:   1 .text         00000026 0000000000000000 TEXT DATA
+; CHECK:   2 .rodata.str1.1 0000000d 0000000000000026 DATA
+; CHECK:   3 .note.GNU-stack 00000000 0000000000000033
+; CHECK:   4 .rela.text    00000048 0000000000000038
+; CHECK:   5 .symtab       000000c0 0000000000000080
+; CHECK:   6 .strtab       00000033 0000000000000140
+; CHECK:   7 .shstrtab     0000004b 0000000000000173
diff --git a/test/Object/readobj-shared-object.test b/test/Object/readobj-shared-object.test
index 3b5457ce07..3065c6f636 100644
--- a/test/Object/readobj-shared-object.test
+++ b/test/Object/readobj-shared-object.test
@@ -19,6 +19,7 @@ ELF32:Address Size: 32 bits
 ELF32:Load Name   : libfoo.so
 
 ELF:Symbols:
+ELF:  Name                   Type            Address        Size           FileOffset     Flags
 ELF:  .dynsym                DBG             {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  formatspecific
 ELF:  .dynstr                DBG             {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  formatspecific
 ELF:  .text                  DBG             {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  formatspecific
@@ -42,6 +43,7 @@ ELF:  _edata                 ?               {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-
 ELF:  Total: 21
 
 ELF:Dynamic Symbols:
+ELF:  Name                   Type            Address        Size           FileOffset     Flags
 ELF:  common_sym             DATA            {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  global
 ELF:  tls_sym                DATA            {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  global,threadlocal
 ELF:  defined_sym            DATA            {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  global
@@ -51,6 +53,24 @@ ELF:  global_func            FUNC            {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-
 ELF:  _edata                 ?               {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  global,absolute
 ELF:  Total: {{[0-9a-f]+}}
 
+ELF:Sections:
+ELF:  Name                        Address        Size           Align          Flags
+ELF:                              {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  rodata
+ELF:  .hash                       {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  required,rodata
+ELF:  .dynsym                     {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  required,rodata
+ELF:  .dynstr                     {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  required,rodata
+ELF:  .text                       {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  text,{{(data,)?}}required
+ELF:  .eh_frame                   {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  data,required,rodata
+ELF:  .tdata                      {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  data,required
+ELF:  .dynamic                    {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  required
+ELF:  .got.plt                    {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  data,required
+ELF:  .data                       {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  data,required
+ELF:  .bss                        {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  bss,required,virtual,zeroinit
+ELF:  .shstrtab                   {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  rodata
+ELF:  .symtab                     {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  rodata
+ELF:  .strtab                     {{[0-9a-f]+}}  {{[0-9a-f]+}}  {{[0-9a-f]+}}  rodata
+ELF:  Total: 14
+
 ELF:Libraries needed:
 ELF:  libc.so.6
 ELF:  libm.so.6
diff --git a/test/Object/simple-archive.test b/test/Object/simple-archive.test
index c313f3facd..3e6760ed97 100644
--- a/test/Object/simple-archive.test
+++ b/test/Object/simple-archive.test
@@ -1,5 +1,5 @@
 #
-# Check if the index is appearing properly in the output file 
+# Check if the index is appearing properly in the output file
 #
 RUN: llvm-nm -s %p/Inputs/libsimple_archive.a | FileCheck -check-prefix=CHECKIDX %s
 
diff --git a/test/TableGen/2006-09-18-LargeInt.td b/test/TableGen/2006-09-18-LargeInt.td
index f7ae4eecce..94cd1ec307 100644
--- a/test/TableGen/2006-09-18-LargeInt.td
+++ b/test/TableGen/2006-09-18-LargeInt.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep -- 4294901760
-// XFAIL: vg_leak
 
 def X {
   int Y = 0xFFFF0000;
diff --git a/test/TableGen/2010-03-24-PrematureDefaults.td b/test/TableGen/2010-03-24-PrematureDefaults.td
index 24f6c93b3e..716a1d5900 100644
--- a/test/TableGen/2010-03-24-PrematureDefaults.td
+++ b/test/TableGen/2010-03-24-PrematureDefaults.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class A<int k, bits<2> x = 1> {
   int K = k;
diff --git a/test/TableGen/Dag.td b/test/TableGen/Dag.td
index 7ceb4e74b2..40399a48ee 100644
--- a/test/TableGen/Dag.td
+++ b/test/TableGen/Dag.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 //===----------------------------------------------------------------------===//
 // Substitution of an int.
diff --git a/test/TableGen/DefmInherit.td b/test/TableGen/DefmInherit.td
index 46d3f62c6d..b52a709731 100644
--- a/test/TableGen/DefmInherit.td
+++ b/test/TableGen/DefmInherit.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep "zing = 4" | count 4
-// XFAIL: vg_leak
 
 class C1<int A, string B> { 
   int bar = A;
diff --git a/test/TableGen/DefmInsideMultiClass.td b/test/TableGen/DefmInsideMultiClass.td
index e6fc019b1e..0aea21280d 100644
--- a/test/TableGen/DefmInsideMultiClass.td
+++ b/test/TableGen/DefmInsideMultiClass.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep ADDPSrr | count 1
-// XFAIL: vg_leak
 
 class Instruction<bits<4> opc, string Name> {
   bits<4> opcode = opc;
diff --git a/test/TableGen/ForeachList.td b/test/TableGen/ForeachList.td
index 99b7e14c2d..9bc76e0f0c 100644
--- a/test/TableGen/ForeachList.td
+++ b/test/TableGen/ForeachList.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Register<string name, int idx> {
   string Name = name;
diff --git a/test/TableGen/ForeachLoop.td b/test/TableGen/ForeachLoop.td
index 4aacc74d8a..a49a60bf26 100644
--- a/test/TableGen/ForeachLoop.td
+++ b/test/TableGen/ForeachLoop.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Register<string name, int idx> {
   string Name = name;
diff --git a/test/TableGen/LazyChange.td b/test/TableGen/LazyChange.td
index 306959ebb6..919a1a7e9a 100644
--- a/test/TableGen/LazyChange.td
+++ b/test/TableGen/LazyChange.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep "int Y = 3"
-// XFAIL: vg_leak
 
 class C {
   int X = 4;
diff --git a/test/TableGen/LetInsideMultiClasses.td b/test/TableGen/LetInsideMultiClasses.td
index cb13508e51..72f48b6d80 100644
--- a/test/TableGen/LetInsideMultiClasses.td
+++ b/test/TableGen/LetInsideMultiClasses.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep "bit IsDouble = 1;" | count 3
-// XFAIL: vg_leak
 
 class Instruction<bits<4> opc, string Name> {
   bits<4> opcode = opc;
diff --git a/test/TableGen/ListOfList.td b/test/TableGen/ListOfList.td
index 864401ec3c..adf9fe483e 100644
--- a/test/TableGen/ListOfList.td
+++ b/test/TableGen/ListOfList.td
@@ -1,7 +1,6 @@
 // RUN llvm-tblgen %s | FileCheck %s
 
 // RUN: llvm-tblgen %s | grep "foo" | count 1
-// XFAIL: vg_leak
 
 class Base<string t> {
   string text = t;
diff --git a/test/TableGen/LoLoL.td b/test/TableGen/LoLoL.td
index 778c9609d1..f758e1b604 100644
--- a/test/TableGen/LoLoL.td
+++ b/test/TableGen/LoLoL.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Base<list<int> v> {
   list<int> values = v;
diff --git a/test/TableGen/MultiClass.td b/test/TableGen/MultiClass.td
index 449c5d6c04..ef320cf79f 100644
--- a/test/TableGen/MultiClass.td
+++ b/test/TableGen/MultiClass.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep "zing = 4" | count 2
-// XFAIL: vg_leak
 
 class C1<int A, string B> { 
   int bar = A;
diff --git a/test/TableGen/MultiClassDefName.td b/test/TableGen/MultiClassDefName.td
index 296e30c7c7..75d6af5b42 100644
--- a/test/TableGen/MultiClassDefName.td
+++ b/test/TableGen/MultiClassDefName.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep WorldHelloCC | count 1
-// XFAIL: vg_leak
 
 class C<string n> {
   string name = n;
diff --git a/test/TableGen/MultiClassInherit.td b/test/TableGen/MultiClassInherit.td
index c768fff0b6..9d1470a661 100644
--- a/test/TableGen/MultiClassInherit.td
+++ b/test/TableGen/MultiClassInherit.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep "zing = 4" | count 28
-// XFAIL: vg_leak
 
 class C1<int A, string B> { 
   int bar = A;
diff --git a/test/TableGen/MultiPat.td b/test/TableGen/MultiPat.td
index b3792777b6..b49b06c24c 100644
--- a/test/TableGen/MultiPat.td
+++ b/test/TableGen/MultiPat.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class ValueType<int size, int value> {
   int Size = size;
diff --git a/test/TableGen/NestedForeach.td b/test/TableGen/NestedForeach.td
index e8c16f720d..5b63175b19 100644
--- a/test/TableGen/NestedForeach.td
+++ b/test/TableGen/NestedForeach.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Droid<string series, int release, string model, int patchlevel> {
   string Series = series;
diff --git a/test/TableGen/Paste.td b/test/TableGen/Paste.td
index a7e2a5b318..33d61ccde1 100644
--- a/test/TableGen/Paste.td
+++ b/test/TableGen/Paste.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Instr<int i> {
   int index = i;
diff --git a/test/TableGen/SetTheory.td b/test/TableGen/SetTheory.td
index 761332312b..f26b9e65ac 100644
--- a/test/TableGen/SetTheory.td
+++ b/test/TableGen/SetTheory.td
@@ -1,6 +1,5 @@
 // Test evaluation of set operations in dags.
 // RUN: llvm-tblgen -print-sets %s | FileCheck %s
-// XFAIL: vg_leak
 //
 // The -print-sets driver configures a primitive SetTheory instance that
 // understands these sets:
diff --git a/test/TableGen/SiblingForeach.td b/test/TableGen/SiblingForeach.td
index a11f6f87b4..e4c4704a5e 100644
--- a/test/TableGen/SiblingForeach.td
+++ b/test/TableGen/SiblingForeach.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Set<int i = 0, int j = 0, int k = 0> {
   int I = i;
diff --git a/test/TableGen/Slice.td b/test/TableGen/Slice.td
index 6d051d77c8..cec9fb65ca 100644
--- a/test/TableGen/Slice.td
+++ b/test/TableGen/Slice.td
@@ -1,6 +1,5 @@
 // RUN: llvm-tblgen %s | grep "\[(set" | count 2
 // RUN: llvm-tblgen %s | grep "\[\]" | count 2
-// XFAIL: vg_leak
 
 class ValueType<int size, int value> {
   int Size = size;
diff --git a/test/TableGen/TargetInstrSpec.td b/test/TableGen/TargetInstrSpec.td
index 64b706dc6a..bf2d257c5d 100644
--- a/test/TableGen/TargetInstrSpec.td
+++ b/test/TableGen/TargetInstrSpec.td
@@ -1,6 +1,5 @@
 // RUN: llvm-tblgen %s | grep '\[(set VR128:$dst, (int_x86_sse2_add_pd VR128:$src1, VR128:$src2))\]' | count 1
 // RUN: llvm-tblgen %s | grep '\[(set VR128:$dst, (int_x86_sse2_add_ps VR128:$src1, VR128:$src2))\]' | count 1
-// XFAIL: vg_leak
 
 class ValueType<int size, int value> {
   int Size = size;
diff --git a/test/TableGen/TwoLevelName.td b/test/TableGen/TwoLevelName.td
index 9c502f4755..e88696217f 100644
--- a/test/TableGen/TwoLevelName.td
+++ b/test/TableGen/TwoLevelName.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Type<string name, int length, int width> {
   string Name = name;
diff --git a/test/TableGen/cast.td b/test/TableGen/cast.td
index 7948aff795..b9e4b37535 100644
--- a/test/TableGen/cast.td
+++ b/test/TableGen/cast.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep "add_ps" | count 3
-// XFAIL: vg_leak
 
 class ValueType<int size, int value> {
   int Size = size;
diff --git a/test/TableGen/defmclass.td b/test/TableGen/defmclass.td
index 80f03b3194..6198c000fd 100644
--- a/test/TableGen/defmclass.td
+++ b/test/TableGen/defmclass.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class XD { bits<4> Prefix = 11; }
 // CHECK: Prefix = { 1, 1, 0, 0 };
diff --git a/test/TableGen/eq.td b/test/TableGen/eq.td
index f8daf880b9..fc3ad424e2 100644
--- a/test/TableGen/eq.td
+++ b/test/TableGen/eq.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 // CHECK: Value = 0
 // CHECK: Value = 1
 
diff --git a/test/TableGen/eqbit.td b/test/TableGen/eqbit.td
index 1d58fa0c19..b77b1a26df 100644
--- a/test/TableGen/eqbit.td
+++ b/test/TableGen/eqbit.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 // CHECK: a = 6
 // CHECK: a = 5
 
diff --git a/test/TableGen/foreach.td b/test/TableGen/foreach.td
index 902af25237..7b7c199728 100644
--- a/test/TableGen/foreach.td
+++ b/test/TableGen/foreach.td
@@ -1,7 +1,6 @@
 // RUN: llvm-tblgen %s | grep 'Jr' | count 2
 // RUN: llvm-tblgen %s | grep 'Sr' | count 2
 // RUN: llvm-tblgen %s | grep '"NAME"' | count 1
-// XFAIL: vg_leak
 
 // Variables for foreach
 class decls {
diff --git a/test/TableGen/if.td b/test/TableGen/if.td
index 1d8d62329a..e4df74f368 100644
--- a/test/TableGen/if.td
+++ b/test/TableGen/if.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 // Support for an `!if' operator as part of a `let' statement.
 // CHECK:      class C
diff --git a/test/TableGen/ifbit.td b/test/TableGen/ifbit.td
index 88f575e9ac..e3341219ff 100644
--- a/test/TableGen/ifbit.td
+++ b/test/TableGen/ifbit.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 // CHECK: a = 6
 // CHECK: a = 5
 
diff --git a/test/TableGen/lisp.td b/test/TableGen/lisp.td
index dd85ddc67c..efe00022f5 100644
--- a/test/TableGen/lisp.td
+++ b/test/TableGen/lisp.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep ""
-// XFAIL: vg_leak
 
 class List<list<string> n> {
   list<string> names = n;
diff --git a/test/TableGen/list-element-bitref.td b/test/TableGen/list-element-bitref.td
index 5f3e3dabf4..7db3d31167 100644
--- a/test/TableGen/list-element-bitref.td
+++ b/test/TableGen/list-element-bitref.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class C<list<bits<8>> L> {
   bits<2> V0 = L[0]{1-0};
@@ -10,6 +9,6 @@ class C<list<bits<8>> L> {
 def c0 : C<[0b0101, 0b1010]>;
 
 // CHECK: def c0
-// CHECk-NEXT: bits<2> V0 = { 0, 1 };
-// CHECk-NEXT: bits<2> V1 = { 1, 0 };
-// CHECk-NEXT: string V2 = "Odd";
+// CHECK-NEXT: bits<2> V0 = { 0, 1 };
+// CHECK-NEXT: bits<2> V1 = { 1, 0 };
+// CHECK-NEXT: string V2 = "Odd";
diff --git a/test/TableGen/pr8330.td b/test/TableGen/pr8330.td
index 7779b635e3..e672014789 100644
--- a/test/TableGen/pr8330.td
+++ b/test/TableGen/pr8330.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Or4<bits<8> Val> {
   bits<8> V = {Val{7}, Val{6}, Val{5}, Val{4}, Val{3}, 1, Val{1}, Val{0} };
diff --git a/test/TableGen/strconcat.td b/test/TableGen/strconcat.td
index 85ee831b4d..0173c49365 100644
--- a/test/TableGen/strconcat.td
+++ b/test/TableGen/strconcat.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | grep fufoo
-// XFAIL: vg_leak
 
 class Y<string S> {
   string T = !strconcat(S, "foo");
diff --git a/test/TableGen/subst.td b/test/TableGen/subst.td
index 850ac38465..e265b44cf3 100644
--- a/test/TableGen/subst.td
+++ b/test/TableGen/subst.td
@@ -4,7 +4,6 @@
 // RUN: llvm-tblgen %s | grep "LAST" | count 1
 // RUN: llvm-tblgen %s | grep "TVAR" | count 2
 // RUN: llvm-tblgen %s | grep "Bogus" | count 1
-// XFAIL: vg_leak
 
 class Honorific<string t> {
   string honorific = t;
diff --git a/test/TableGen/subst2.td b/test/TableGen/subst2.td
index 7c007f7db1..ce7307703d 100644
--- a/test/TableGen/subst2.td
+++ b/test/TableGen/subst2.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 // CHECK: No subst
 // CHECK: No foo
 // CHECK: RECURSE foo
diff --git a/test/TableGen/usevalname.td b/test/TableGen/usevalname.td
index d85b98ac33..a80ba12869 100644
--- a/test/TableGen/usevalname.td
+++ b/test/TableGen/usevalname.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Instr<list<dag> pat> {
   list<dag> Pattern = pat;
diff --git a/test/Transforms/ArgumentPromotion/crash.ll b/test/Transforms/ArgumentPromotion/crash.ll
index fed002aa98..f70d8de60e 100644
--- a/test/Transforms/ArgumentPromotion/crash.ll
+++ b/test/Transforms/ArgumentPromotion/crash.ll
@@ -1,5 +1,5 @@
 ; rdar://7879828
-; RUN: opt -inline -argpromotion %s
+; RUN: opt -inline -argpromotion < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
 
diff --git a/test/Transforms/BBVectorize/X86/simple-int.ll b/test/Transforms/BBVectorize/X86/simple-int.ll
new file mode 100644
index 0000000000..f5dbe46b14
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/simple-int.ll
@@ -0,0 +1,79 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+declare double @llvm.fma.f64(double, double, double)
+declare double @llvm.fmuladd.f64(double, double, double)
+declare double @llvm.cos.f64(double)
+declare double @llvm.powi.f64(double, i32)
+
+; Basic depth-3 chain with fma
+define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1)
+	%Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test1
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain with fmuladd
+define double @test1a(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.fmuladd.f64(double %X1, double %A1, double %C1)
+	%Y2 = call double @llvm.fmuladd.f64(double %X2, double %A2, double %C2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test1a
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain with cos
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.cos.f64(double %X1)
+	%Y2 = call double @llvm.cos.f64(double %X2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain with powi
+define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.powi.f64(double %X1, i32 %P)
+	%Y2 = call double @llvm.powi.f64(double %X2, i32 %P)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test3
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain with powi (different powers: should not vectorize)
+define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+        %P2 = add i32 %P, 1
+	%Y1 = call double @llvm.powi.f64(double %X1, i32 %P)
+	%Y2 = call double @llvm.powi.f64(double %X2, i32 %P2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test4
+; CHECK: ret double %R
+}
+
diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll
index e8e82ce024..bdcb30da88 100644
--- a/test/Transforms/BBVectorize/cycle.ll
+++ b/test/Transforms/BBVectorize/cycle.ll
@@ -1,5 +1,5 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
 
 ; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would otherwise
 ; want to select the pairs:
diff --git a/test/Transforms/BBVectorize/ld1.ll b/test/Transforms/BBVectorize/ld1.ll
index cea225d076..ea5cb5dd93 100644
--- a/test/Transforms/BBVectorize/ld1.ll
+++ b/test/Transforms/BBVectorize/ld1.ll
@@ -1,5 +1,5 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
 
 define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
 entry:
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
index c22ea5852a..e592edb44a 100644
--- a/test/Transforms/BBVectorize/loop1.ll
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -1,7 +1,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
 ; The second check covers the use of alias analysis (with loop unrolling).
 
 define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
diff --git a/test/Transforms/BBVectorize/req-depth.ll b/test/Transforms/BBVectorize/req-depth.ll
index 8c9cc3c188..e0120059b9 100644
--- a/test/Transforms/BBVectorize/req-depth.ll
+++ b/test/Transforms/BBVectorize/req-depth.ll
@@ -1,6 +1,6 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -S | FileCheck %s -check-prefix=CHECK-RD3
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -S | FileCheck %s -check-prefix=CHECK-RD2
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -bb-vectorize-ignore-target-info -S | FileCheck %s -check-prefix=CHECK-RD3
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -bb-vectorize-ignore-target-info -S | FileCheck %s -check-prefix=CHECK-RD2
 
 define double @test1(double %A1, double %A2, double %B1, double %B2) {
 	%X1 = fsub double %A1, %B1
diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll
index aeaf98865b..a694e45bc1 100644
--- a/test/Transforms/BBVectorize/search-limit.ll
+++ b/test/Transforms/BBVectorize/search-limit.ll
@@ -1,6 +1,6 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4
 
 define double @test1(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test1
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
index ae1d63bfd8..d7b7d6b8fd 100644
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -1,7 +1,8 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
 
 declare double @llvm.fma.f64(double, double, double)
+declare double @llvm.fmuladd.f64(double, double, double)
 declare double @llvm.cos.f64(double)
 declare double @llvm.powi.f64(double, i32)
 
@@ -31,6 +32,32 @@ define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1,
 ; CHECK: ret double %R
 }
 
+; Basic depth-3 chain with fmuladd
+define double @test1a(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.fmuladd.f64(double %X1, double %A1, double %C1)
+	%Y2 = call double @llvm.fmuladd.f64(double %X2, double %A2, double %C2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test1a
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
+; CHECK: %Y1.v.i2.2 = insertelement <2 x double> %Y1.v.i2.1, double %C2, i32 1
+; CHECK: %Y1 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %X1, <2 x double> %X1.v.i0.2, <2 x double> %Y1.v.i2.2)
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: ret double %R
+}
+
 ; Basic depth-3 chain with cos
 define double @test2(double %A1, double %A2, double %B1, double %B2) {
 	%X1 = fsub double %A1, %B1
@@ -98,6 +125,7 @@ define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) {
 }
 
 ; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+; CHECK: declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
 ; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly
 ; CHECK: declare <2 x double> @llvm.powi.v2f64(<2 x double>, i32) nounwind readonly
 
diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll
index 7dd77c933f..8e51d297e8 100644
--- a/test/Transforms/BBVectorize/simple-ldstr.ll
+++ b/test/Transforms/BBVectorize/simple-ldstr.ll
@@ -1,6 +1,6 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
 
 ; Simple 3-pair chain with loads and stores
 define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll
index 15ecb59702..8caccfd32c 100644
--- a/test/Transforms/BBVectorize/simple-sel.ll
+++ b/test/Transforms/BBVectorize/simple-sel.ll
@@ -1,6 +1,6 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-no-bools -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-NB
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-no-bools -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-NB
 
 ; Basic depth-3 chain with select
 define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) {
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll
index 3527ae75b4..a447908d16 100644
--- a/test/Transforms/BBVectorize/simple.ll
+++ b/test/Transforms/BBVectorize/simple.ll
@@ -1,5 +1,5 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
 
 ; Basic depth-3 chain
 define double @test1(double %A1, double %A2, double %B1, double %B2) {
diff --git a/test/Transforms/BBVectorize/simple3.ll b/test/Transforms/BBVectorize/simple3.ll
index 153be73f83..78bcc9f830 100644
--- a/test/Transforms/BBVectorize/simple3.ll
+++ b/test/Transforms/BBVectorize/simple3.ll
@@ -1,5 +1,5 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=192 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=192 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
 
 ; Basic depth-3 chain
 define double @test1(double %A1, double %A2, double %A3, double %B1, double %B2, double %B3) {
diff --git a/test/Transforms/CodeGenPrepare/basic.ll b/test/Transforms/CodeGenPrepare/basic.ll
index c68e77eb55..d617e43be8 100644
--- a/test/Transforms/CodeGenPrepare/basic.ll
+++ b/test/Transforms/CodeGenPrepare/basic.ll
@@ -1,4 +1,4 @@
-; RUN: opt -codegenprepare %s -S -o - | FileCheck %s
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/ConstantMerge/2003-10-28-MergeExternalConstants.ll b/test/Transforms/ConstantMerge/2003-10-28-MergeExternalConstants.ll
index ce79e3b296..a415995070 100644
--- a/test/Transforms/ConstantMerge/2003-10-28-MergeExternalConstants.ll
+++ b/test/Transforms/ConstantMerge/2003-10-28-MergeExternalConstants.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -constmerge %s | FileCheck %s
+; RUN: opt -S -constmerge < %s | FileCheck %s
 
 ; CHECK: @foo = constant i32 6
 ; CHECK: @bar = constant i32 6
diff --git a/test/Transforms/ConstantMerge/2011-01-15-EitherOrder.ll b/test/Transforms/ConstantMerge/2011-01-15-EitherOrder.ll
index f561daf667..5aafcfe3d4 100644
--- a/test/Transforms/ConstantMerge/2011-01-15-EitherOrder.ll
+++ b/test/Transforms/ConstantMerge/2011-01-15-EitherOrder.ll
@@ -1,4 +1,4 @@
-; RUN: opt -constmerge %s -S -o - | FileCheck %s
+; RUN: opt -constmerge -S < %s | FileCheck %s
 ; PR8978
 
 declare i32 @zed(%struct.foobar*, %struct.foobar*)
diff --git a/test/Transforms/ConstantMerge/merge-both.ll b/test/Transforms/ConstantMerge/merge-both.ll
index b71eb437db..b00345557c 100644
--- a/test/Transforms/ConstantMerge/merge-both.ll
+++ b/test/Transforms/ConstantMerge/merge-both.ll
@@ -1,4 +1,4 @@
-; RUN: opt -constmerge %s -S -o - | FileCheck %s
+; RUN: opt -constmerge -S < %s | FileCheck %s
 ; Test that in one run var3 is merged into var2 and var1 into var4.
 ; Test that we merge @var5 and @var6 into one with the higher alignment, and
 ; don't merge var7/var8 into var5/var6.
diff --git a/test/Transforms/ConstantMerge/unnamed-addr.ll b/test/Transforms/ConstantMerge/unnamed-addr.ll
index 24100837aa..aff8540f2c 100644
--- a/test/Transforms/ConstantMerge/unnamed-addr.ll
+++ b/test/Transforms/ConstantMerge/unnamed-addr.ll
@@ -1,4 +1,4 @@
-; RUN: opt -constmerge %s -S -o - | FileCheck %s
+; RUN: opt -constmerge -S < %s | FileCheck %s
 ; Test which corresponding x and y are merged and that unnamed_addr
 ; is correctly set.
 
diff --git a/test/Transforms/DeadArgElim/dbginfo.ll b/test/Transforms/DeadArgElim/dbginfo.ll
index dcbfaaa3d7..b07b60d948 100644
--- a/test/Transforms/DeadArgElim/dbginfo.ll
+++ b/test/Transforms/DeadArgElim/dbginfo.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -deadargelim -S | FileCheck %s
+; RUN: opt -deadargelim -S < %s | FileCheck %s
 ; PR14016
 
 ; Check that debug info metadata for subprograms stores pointers to
diff --git a/test/Transforms/DeadArgElim/deadexternal.ll b/test/Transforms/DeadArgElim/deadexternal.ll
index e3fe1bbb54..cca58721e5 100644
--- a/test/Transforms/DeadArgElim/deadexternal.ll
+++ b/test/Transforms/DeadArgElim/deadexternal.ll
@@ -1,4 +1,4 @@
-; RUN: opt -deadargelim -S %s | FileCheck %s
+; RUN: opt -deadargelim -S < %s | FileCheck %s
 
 define void @test(i32) {
   ret void
diff --git a/test/Transforms/DeadStoreElimination/const-pointers.ll b/test/Transforms/DeadStoreElimination/const-pointers.ll
index 7d57804631..15976f9f10 100644
--- a/test/Transforms/DeadStoreElimination/const-pointers.ll
+++ b/test/Transforms/DeadStoreElimination/const-pointers.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -basicaa -dse -S | FileCheck %s
+; RUN: opt -basicaa -dse -S < %s | FileCheck %s
 
 %t = type { i32 }
 
diff --git a/test/Transforms/DeadStoreElimination/dominate.ll b/test/Transforms/DeadStoreElimination/dominate.ll
index 284fea4234..38cf1a066d 100644
--- a/test/Transforms/DeadStoreElimination/dominate.ll
+++ b/test/Transforms/DeadStoreElimination/dominate.ll
@@ -1,4 +1,4 @@
-; RUN: opt  %s -dse -disable-output
+; RUN: opt -dse -disable-output < %s
 ; test that we don't crash
 declare void @bar()
 
diff --git a/test/Transforms/DeadStoreElimination/no-targetdata.ll b/test/Transforms/DeadStoreElimination/no-targetdata.ll
index 6c7f940316..4022d76dcb 100644
--- a/test/Transforms/DeadStoreElimination/no-targetdata.ll
+++ b/test/Transforms/DeadStoreElimination/no-targetdata.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -basicaa -dse -S | FileCheck %s
+; RUN: opt -basicaa -dse -S < %s | FileCheck %s
 
 declare void @test1f()
 
diff --git a/test/Transforms/DeadStoreElimination/pr11390.ll b/test/Transforms/DeadStoreElimination/pr11390.ll
index 2ce6eea365..f63aa1eb8a 100644
--- a/test/Transforms/DeadStoreElimination/pr11390.ll
+++ b/test/Transforms/DeadStoreElimination/pr11390.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -dse -S -o - %s | FileCheck %s
+; RUN: opt -basicaa -dse -S < %s | FileCheck %s
 ; PR11390
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/EarlyCSE/commute.ll b/test/Transforms/EarlyCSE/commute.ll
index f84a7dd1aa..8cf04d1765 100644
--- a/test/Transforms/EarlyCSE/commute.ll
+++ b/test/Transforms/EarlyCSE/commute.ll
@@ -19,9 +19,9 @@ define void @test2(float %A, float %B, i1* %PA, i1* %PB) {
   ; CHECK-NEXT: store
   ; CHECK-NEXT: store
   ; CHECK-NEXT: ret
-  %C = fcmp eq float %A, %B
+  %C = fcmp oeq float %A, %B
   store i1 %C, i1* %PA
-  %D = fcmp eq float %B, %A
+  %D = fcmp oeq float %B, %A
   store i1 %D, i1* %PB
   ret void
 }
diff --git a/test/Transforms/GVN/2011-04-27-phioperands.ll b/test/Transforms/GVN/2011-04-27-phioperands.ll
index 6e5075db7c..42c46500c4 100644
--- a/test/Transforms/GVN/2011-04-27-phioperands.ll
+++ b/test/Transforms/GVN/2011-04-27-phioperands.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -gvn -disable-output
+; RUN: opt -gvn -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
 
diff --git a/test/Transforms/GVN/MemdepMiscompile.ll b/test/Transforms/GVN/MemdepMiscompile.ll
new file mode 100644
index 0000000000..d420169615
--- /dev/null
+++ b/test/Transforms/GVN/MemdepMiscompile.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; rdar://12801584
+; Value of %shouldExit can be changed by RunInMode.
+; Make sure we do not replace load %shouldExit in while.cond.backedge
+; with a phi node where the value from while.body is 0.
+define i32 @test() nounwind ssp {
+entry:
+; CHECK: test()
+; CHECK: while.body:
+; CHECK: call void @RunInMode
+; CHECK: br i1 %tobool, label %while.cond.backedge, label %if.then
+; CHECK: while.cond.backedge:
+; CHECK: load i32* %shouldExit
+; CHECK: br i1 %cmp, label %while.body
+  %shouldExit = alloca i32, align 4
+  %tasksIdle = alloca i32, align 4
+  store i32 0, i32* %shouldExit, align 4
+  store i32 0, i32* %tasksIdle, align 4
+  call void @CTestInitialize(i32* %tasksIdle) nounwind
+  %0 = load i32* %shouldExit, align 4
+  %cmp1 = icmp eq i32 %0, 0
+  br i1 %cmp1, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph:
+  br label %while.body
+
+while.body:
+  call void @RunInMode(i32 100) nounwind
+  %1 = load i32* %tasksIdle, align 4
+  %tobool = icmp eq i32 %1, 0
+  br i1 %tobool, label %while.cond.backedge, label %if.then
+
+if.then:
+  store i32 0, i32* %tasksIdle, align 4
+  call void @TimerCreate(i32* %shouldExit) nounwind
+  br label %while.cond.backedge
+
+while.cond.backedge:
+  %2 = load i32* %shouldExit, align 4
+  %cmp = icmp eq i32 %2, 0
+  br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
+
+while.cond.while.end_crit_edge:
+  br label %while.end
+
+while.end:
+  ret i32 0
+}
+declare void @CTestInitialize(i32*)
+declare void @RunInMode(i32)
+declare void @TimerCreate(i32*)
diff --git a/test/Transforms/GVN/crash-no-aa.ll b/test/Transforms/GVN/crash-no-aa.ll
index c87a9c6576..9ad63a7350 100644
--- a/test/Transforms/GVN/crash-no-aa.ll
+++ b/test/Transforms/GVN/crash-no-aa.ll
@@ -1,4 +1,4 @@
-; RUN: opt -no-aa -gvn -S %s
+; RUN: opt -no-aa -gvn -S < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-freebsd8.0"
diff --git a/test/Transforms/GVN/crash.ll b/test/Transforms/GVN/crash.ll
index 4a8c8e4589..9fb612fcae 100644
--- a/test/Transforms/GVN/crash.ll
+++ b/test/Transforms/GVN/crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -gvn %s -disable-output
+; RUN: opt -gvn -disable-output < %s
 
 ; PR5631
 
diff --git a/test/Transforms/GVN/edge.ll b/test/Transforms/GVN/edge.ll
index 32392f3ab0..3a102b6c35 100644
--- a/test/Transforms/GVN/edge.ll
+++ b/test/Transforms/GVN/edge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -gvn -S -o - | FileCheck %s
+; RUN: opt -gvn -S < %s | FileCheck %s
 
 define i32 @f1(i32 %x) {
   ; CHECK: define i32 @f1(
diff --git a/test/Transforms/GVN/fpmath.ll b/test/Transforms/GVN/fpmath.ll
index 8ab285448f..403df5c900 100644
--- a/test/Transforms/GVN/fpmath.ll
+++ b/test/Transforms/GVN/fpmath.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -gvn -S -o - | FileCheck %s
+; RUN: opt -gvn -S < %s | FileCheck %s
 
 define double @test1(double %x, double %y) {
 ; CHECK: @test1(double %x, double %y)
diff --git a/test/Transforms/GVN/lpre-call-wrap-2.ll b/test/Transforms/GVN/lpre-call-wrap-2.ll
index e39f3ed87d..35e3534a9c 100644
--- a/test/Transforms/GVN/lpre-call-wrap-2.ll
+++ b/test/Transforms/GVN/lpre-call-wrap-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -basicaa -gvn -enable-load-pre %s | FileCheck %s
+; RUN: opt -S -basicaa -gvn -enable-load-pre < %s | FileCheck %s
 ;
 ; The partially redundant load in bb1 should be hoisted to "bb".  This comes
 ; from this C code (GCC PR 23455):
diff --git a/test/Transforms/GVN/lpre-call-wrap.ll b/test/Transforms/GVN/lpre-call-wrap.ll
index 40462798b5..0646f3fe0a 100644
--- a/test/Transforms/GVN/lpre-call-wrap.ll
+++ b/test/Transforms/GVN/lpre-call-wrap.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -gvn -enable-load-pre %s | FileCheck %s
+; RUN: opt -S -gvn -enable-load-pre < %s | FileCheck %s
 ;
 ; Make sure the load in bb3.backedge is removed and moved into bb1 after the 
 ; call.  This makes the non-call case faster. 
diff --git a/test/Transforms/GVN/null-aliases-nothing.ll b/test/Transforms/GVN/null-aliases-nothing.ll
index 9e4ae18c71..37bf09d7f3 100644
--- a/test/Transforms/GVN/null-aliases-nothing.ll
+++ b/test/Transforms/GVN/null-aliases-nothing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -basicaa -gvn -S | FileCheck %s
+; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
 
 %t = type { i32 }
 declare void @test1f(i8*)
diff --git a/test/Transforms/GVN/pr12979.ll b/test/Transforms/GVN/pr12979.ll
index 669da9127d..0198a56513 100644
--- a/test/Transforms/GVN/pr12979.ll
+++ b/test/Transforms/GVN/pr12979.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -gvn -S -o - | FileCheck %s
+; RUN: opt -gvn -S < %s | FileCheck %s
 
 define i32 @test1(i32 %x, i32 %y) {
 ; CHECK: @test1(i32 %x, i32 %y)
diff --git a/test/Transforms/GVN/range.ll b/test/Transforms/GVN/range.ll
index 3759c415da..2115fe8566 100644
--- a/test/Transforms/GVN/range.ll
+++ b/test/Transforms/GVN/range.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -basicaa -gvn -S -o - | FileCheck %s
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
 
 define i32 @test1(i32* %p) {
 ; CHECK: @test1(i32* %p)
diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index 72fa819d1c..f470ed88bb 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll
@@ -254,14 +254,11 @@ Cont:
   %A = load i8* %P3
   ret i8 %A
 
-;; FIXME: This is disabled because this caused a miscompile in the llvm-gcc
-;; bootstrap, see r82411
-;
-; HECK: @coerce_mustalias_nonlocal1
-; HECK: Cont:
-; HECK:   %A = phi i8 [
-; HECK-NOT: load
-; HECK: ret i8 %A
+; CHECK: @coerce_mustalias_nonlocal1
+; CHECK: Cont:
+; CHECK:   %A = phi i8 [
+; CHECK-NOT: load
+; CHECK: ret i8 %A
 }
 
 
diff --git a/test/Transforms/GVN/tbaa.ll b/test/Transforms/GVN/tbaa.ll
index 90661c6250..85fe39a93b 100644
--- a/test/Transforms/GVN/tbaa.ll
+++ b/test/Transforms/GVN/tbaa.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -basicaa -gvn -S -o - | FileCheck %s
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
 
 define i32 @test1(i8* %p, i8* %q) {
 ; CHECK: @test1(i8* %p, i8* %q)
diff --git a/test/Transforms/GlobalOpt/2010-02-25-MallocPromote.ll b/test/Transforms/GlobalOpt/2010-02-25-MallocPromote.ll
index 27352fa290..629d57c884 100644
--- a/test/Transforms/GlobalOpt/2010-02-25-MallocPromote.ll
+++ b/test/Transforms/GlobalOpt/2010-02-25-MallocPromote.ll
@@ -1,5 +1,5 @@
 ; PR6422
-; RUN: opt -globalopt -S %s
+; RUN: opt -globalopt -S < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/Transforms/GlobalOpt/2010-02-26-MallocSROA.ll b/test/Transforms/GlobalOpt/2010-02-26-MallocSROA.ll
index 6f1996a867..ab7721fd97 100644
--- a/test/Transforms/GlobalOpt/2010-02-26-MallocSROA.ll
+++ b/test/Transforms/GlobalOpt/2010-02-26-MallocSROA.ll
@@ -1,4 +1,4 @@
-; RUN: opt -globalopt -S %s
+; RUN: opt -globalopt -S < %s
 ; PR6435
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/GlobalOpt/crash.ll b/test/Transforms/GlobalOpt/crash.ll
index 366a874f73..80c777ccab 100644
--- a/test/Transforms/GlobalOpt/crash.ll
+++ b/test/Transforms/GlobalOpt/crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -globalopt -disable-output %s
+; RUN: opt -globalopt -disable-output < %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
 target triple = "i386-apple-darwin9.8"
 
diff --git a/test/Transforms/GlobalOpt/ctor-list-opt-constexpr.ll b/test/Transforms/GlobalOpt/ctor-list-opt-constexpr.ll
index e3bc473f52..c907610944 100644
--- a/test/Transforms/GlobalOpt/ctor-list-opt-constexpr.ll
+++ b/test/Transforms/GlobalOpt/ctor-list-opt-constexpr.ll
@@ -1,4 +1,4 @@
-; RUN: opt -globalopt %s -S | FileCheck %s
+; RUN: opt -globalopt -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
 
diff --git a/test/Transforms/GlobalOpt/integer-bool.ll b/test/Transforms/GlobalOpt/integer-bool.ll
index 5a34a9c4da..cf025ec614 100644
--- a/test/Transforms/GlobalOpt/integer-bool.ll
+++ b/test/Transforms/GlobalOpt/integer-bool.ll
@@ -1,23 +1,28 @@
-; RUN: opt < %s -globalopt -instcombine | \
-; RUN:    llvm-dis | grep "ret i1 true"
-
+; RUN: opt < %s -S -globalopt -instcombine | FileCheck %s
 ;; check that global opt turns integers that only hold 0 or 1 into bools.
 
-@G = internal global i32 0              ; <i32*> [#uses=3]
+@G = internal addrspace(1) global i32 0
+; CHECK: @G.b
+; CHECK: addrspace(1)
+; CHECK: global i1 false
 
 define void @set1() {
-        store i32 0, i32* @G
-        ret void
+  store i32 0, i32 addrspace(1)* @G
+; CHECK: store i1 false
+  ret void
 }
 
 define void @set2() {
-        store i32 1, i32* @G
-        ret void
+  store i32 1, i32 addrspace(1)* @G
+; CHECK: store i1 true
+  ret void
 }
 
 define i1 @get() {
-        %A = load i32* @G               ; <i32> [#uses=1]
-        %C = icmp slt i32 %A, 2         ; <i1> [#uses=1]
-        ret i1 %C
+; CHECK: @get
+  %A = load i32 addrspace(1) * @G
+  %C = icmp slt i32 %A, 2
+  ret i1 %C
+; CHECK: ret i1 true
 }
 
diff --git a/test/Transforms/GlobalOpt/memset-null.ll b/test/Transforms/GlobalOpt/memset-null.ll
index 01534025fa..53ec755113 100644
--- a/test/Transforms/GlobalOpt/memset-null.ll
+++ b/test/Transforms/GlobalOpt/memset-null.ll
@@ -1,4 +1,4 @@
-; RUN: opt -globalopt %s -S -o - | FileCheck %s
+; RUN: opt -globalopt -S < %s | FileCheck %s
 ; PR10047
 
 %0 = type { i32, void ()* }
diff --git a/test/Transforms/GlobalOpt/unnamed-addr.ll b/test/Transforms/GlobalOpt/unnamed-addr.ll
index ee75058731..2ca91e50da 100644
--- a/test/Transforms/GlobalOpt/unnamed-addr.ll
+++ b/test/Transforms/GlobalOpt/unnamed-addr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -globalopt -S | FileCheck %s
+; RUN: opt -globalopt -S < %s | FileCheck %s
 
 @a = internal global i32 0, align 4
 @b = internal global i32 0, align 4
diff --git a/test/Transforms/IndVarSimplify/2003-09-23-NotAtTop.ll b/test/Transforms/IndVarSimplify/2003-09-23-NotAtTop.ll
index 150ae70a82..e3de75e36f 100644
--- a/test/Transforms/IndVarSimplify/2003-09-23-NotAtTop.ll
+++ b/test/Transforms/IndVarSimplify/2003-09-23-NotAtTop.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -indvars %s | FileCheck %s
+; RUN: opt -S -indvars < %s | FileCheck %s
 
 ; The indvar simplification code should ensure that the first PHI in the block 
 ; is the canonical one!
diff --git a/test/Transforms/IndVarSimplify/crash.ll b/test/Transforms/IndVarSimplify/crash.ll
index 1b702a3b1a..aa6a2ee165 100644
--- a/test/Transforms/IndVarSimplify/crash.ll
+++ b/test/Transforms/IndVarSimplify/crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -indvars %s -disable-output
+; RUN: opt -indvars -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 declare i32 @putchar(i8) nounwind
diff --git a/test/Transforms/Inline/2010-05-12-ValueMap.ll b/test/Transforms/Inline/2010-05-12-ValueMap.ll
index f9cc13f499..f452907efd 100644
--- a/test/Transforms/Inline/2010-05-12-ValueMap.ll
+++ b/test/Transforms/Inline/2010-05-12-ValueMap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -inline -mergefunc -disable-output
+; RUN: opt -inline -mergefunc -disable-output < %s
 
 ; This tests for a bug where the inliner kept the functions in a ValueMap after
 ; it had completed and a ModulePass started to run. LLVM would crash deleting
diff --git a/test/Transforms/Inline/alloca_test.ll b/test/Transforms/Inline/alloca_test.ll
index e5791d5d25..8464259ce1 100644
--- a/test/Transforms/Inline/alloca_test.ll
+++ b/test/Transforms/Inline/alloca_test.ll
@@ -1,7 +1,7 @@
 ; This test ensures that alloca instructions in the entry block for an inlined
 ; function are moved to the top of the function they are inlined into.
 ;
-; RUN: opt -S -inline %s | FileCheck %s
+; RUN: opt -S -inline < %s | FileCheck %s
 
 define i32 @func(i32 %i) {
         %X = alloca i32         ; <i32*> [#uses=1]
diff --git a/test/Transforms/Inline/basictest.ll b/test/Transforms/Inline/basictest.ll
index 609a3d4e15..39e25cb5d6 100644
--- a/test/Transforms/Inline/basictest.ll
+++ b/test/Transforms/Inline/basictest.ll
@@ -45,3 +45,48 @@ define i32 @test2(i1 %cond) {
 ; CHECK-NOT: = alloca
 ; CHECK: ret i32
 }
+
+declare void @barrier() noduplicate
+
+define internal i32 @f() {
+  call void @barrier() noduplicate
+  ret i32 1
+}
+
+define i32 @g() {
+  call void @barrier() noduplicate
+  ret i32 2
+}
+
+define internal i32 @h() {
+  call void @barrier() noduplicate
+  ret i32 3
+}
+
+define i32 @test3() {
+  %b = call i32 @f()
+  ret i32 %b
+}
+
+; The call to @f cannot be inlined as there is another callsite
+; calling @f, and @f contains a noduplicate call.
+;
+; The call to @g cannot be inlined as it has external linkage.
+;
+; The call to @h *can* be inlined.
+
+; CHECK: @test()
+define i32 @test() {
+; CHECK: call i32 @f()
+  %a = call i32 @f()
+; CHECK: call i32 @g()
+  %b = call i32 @g()
+; CHECK-NOT: call i32 @h()
+  %c = call i32 @h()
+
+  %d = add i32 %a, %b
+  %e = add i32 %d, %c
+
+  ret i32 %e
+; CHECK: }
+}
diff --git a/test/Transforms/Inline/crash2.ll b/test/Transforms/Inline/crash2.ll
index cb1f44d5cc..be634f6256 100644
--- a/test/Transforms/Inline/crash2.ll
+++ b/test/Transforms/Inline/crash2.ll
@@ -1,4 +1,4 @@
-; RUN: opt  -inline -scalarrepl -max-cg-scc-iterations=1  %s -disable-output
+; RUN: opt  -inline -scalarrepl -max-cg-scc-iterations=1 -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.3"
 
diff --git a/test/Transforms/Inline/delete-call.ll b/test/Transforms/Inline/delete-call.ll
index 7716d6a47b..0afd2ee4c2 100644
--- a/test/Transforms/Inline/delete-call.ll
+++ b/test/Transforms/Inline/delete-call.ll
@@ -1,5 +1,8 @@
-; RUN: opt %s -S  -inline -functionattrs -stats 2>&1 | grep "Number of call sites deleted, not inlined"
-; RUN: opt %s -S  -inline -stats 2>&1 | grep "Number of functions inlined"
+; RUN: opt -S -inline -stats < %s 2>&1 | FileCheck %s
+; CHECK: Number of functions inlined
+
+; RUN: opt -S -inline -functionattrs -stats < %s 2>&1 | FileCheck -check-prefix=FUNCTIONATTRS %s
+; FUNCTIONATTRS: Number of call sites deleted, not inlined
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
 target triple = "i386-apple-darwin9.8"
diff --git a/test/Transforms/Inline/devirtualize-3.ll b/test/Transforms/Inline/devirtualize-3.ll
index c32be4e024..3f019676e4 100644
--- a/test/Transforms/Inline/devirtualize-3.ll
+++ b/test/Transforms/Inline/devirtualize-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -inline -S -scalarrepl -gvn -instcombine %s | FileCheck %s
+; RUN: opt -basicaa -inline -S -scalarrepl -gvn -instcombine < %s | FileCheck %s
 ; PR5009
 
 ; CHECK: define i32 @main() 
diff --git a/test/Transforms/Inline/devirtualize.ll b/test/Transforms/Inline/devirtualize.ll
index 51ea4baa38..18bbf7a6f6 100644
--- a/test/Transforms/Inline/devirtualize.ll
+++ b/test/Transforms/Inline/devirtualize.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -basicaa -inline -scalarrepl -instcombine -simplifycfg -instcombine -gvn -globaldce %s | FileCheck %s
+; RUN: opt -S -basicaa -inline -scalarrepl -instcombine -simplifycfg -instcombine -gvn -globaldce < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
 
diff --git a/test/Transforms/Inline/gvn-inline-iteration.ll b/test/Transforms/Inline/gvn-inline-iteration.ll
index e502fd5777..526ed79e7b 100644
--- a/test/Transforms/Inline/gvn-inline-iteration.ll
+++ b/test/Transforms/Inline/gvn-inline-iteration.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -inline -gvn %s -S -max-cg-scc-iterations=1 | FileCheck %s
+; RUN: opt -basicaa -inline -gvn -S -max-cg-scc-iterations=1 < %s | FileCheck %s
 ; rdar://6295824 and PR6724
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Transforms/Inline/inline-optsize.ll b/test/Transforms/Inline/inline-optsize.ll
index 20d7426abd..3ad573a04e 100644
--- a/test/Transforms/Inline/inline-optsize.ll
+++ b/test/Transforms/Inline/inline-optsize.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -Oz %s | FileCheck %s -check-prefix=OZ
-; RUN: opt -S -O2 %s | FileCheck %s -check-prefix=O2
+; RUN: opt -S -Oz < %s | FileCheck %s -check-prefix=OZ
+; RUN: opt -S -O2 < %s | FileCheck %s -check-prefix=O2
 
 ; The inline threshold for a function with the optsize attribute is currently
 ; the same as the global inline threshold for -Os. Check that the optsize
diff --git a/test/Transforms/Inline/inline_constprop.ll b/test/Transforms/Inline/inline_constprop.ll
index 0b48a7282f..77bc3784ac 100644
--- a/test/Transforms/Inline/inline_constprop.ll
+++ b/test/Transforms/Inline/inline_constprop.ll
@@ -111,6 +111,82 @@ bb.false:
   ret i32 %sub
 }
 
+declare {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
+
+define i8 @caller4(i8 %z) {
+; Check that we can constant fold through intrinsics such as the
+; overflow-detecting arithmetic intrinsics. These are particularly important
+; as they are used heavily in standard library code and generic C++ code where
+; the arguments are often constant but complete generality is required.
+;
+; CHECK: @caller4
+; CHECK-NOT: call
+; CHECK: ret i8 -1
+
+entry:
+  %x = call i8 @callee4(i8 254, i8 14, i8 %z)
+  ret i8 %x
+}
+
+define i8 @callee4(i8 %x, i8 %y, i8 %z) {
+  %uadd = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %x, i8 %y)
+  %o = extractvalue {i8, i1} %uadd, 1
+  br i1 %o, label %bb.true, label %bb.false
+
+bb.true:
+  ret i8 -1
+
+bb.false:
+  ; This block mustn't be counted in the inline cost.
+  %z1 = add i8 %z, 1
+  %z2 = add i8 %z1, 1
+  %z3 = add i8 %z2, 1
+  %z4 = add i8 %z3, 1
+  %z5 = add i8 %z4, 1
+  %z6 = add i8 %z5, 1
+  %z7 = add i8 %z6, 1
+  %z8 = add i8 %z7, 1
+  ret i8 %z8
+}
+
+define i64 @caller5(i64 %y) {
+; Check that we can round trip constants through various kinds of casts etc w/o
+; losing track of the constant prop in the inline cost analysis.
+;
+; CHECK: @caller5
+; CHECK-NOT: call
+; CHECK: ret i64 -1
+
+entry:
+  %x = call i64 @callee5(i64 42, i64 %y)
+  ret i64 %x
+}
+
+define i64 @callee5(i64 %x, i64 %y) {
+  %inttoptr = inttoptr i64 %x to i8*
+  %bitcast = bitcast i8* %inttoptr to i32*
+  %ptrtoint = ptrtoint i32* %bitcast to i64
+  %trunc = trunc i64 %ptrtoint to i32
+  %zext = zext i32 %trunc to i64
+  %cmp = icmp eq i64 %zext, 42
+  br i1 %cmp, label %bb.true, label %bb.false
+
+bb.true:
+  ret i64 -1
+
+bb.false:
+  ; This block mustn't be counted in the inline cost.
+  %y1 = add i64 %y, 1
+  %y2 = add i64 %y1, 1
+  %y3 = add i64 %y2, 1
+  %y4 = add i64 %y3, 1
+  %y5 = add i64 %y4, 1
+  %y6 = add i64 %y5, 1
+  %y7 = add i64 %y6, 1
+  %y8 = add i64 %y7, 1
+  ret i64 %y8
+}
+
 
 define i32 @PR13412.main() {
 ; This is a somewhat complicated three layer subprogram that was reported to
diff --git a/test/Transforms/Inline/inline_minisize.ll b/test/Transforms/Inline/inline_minisize.ll
new file mode 100644
index 0000000000..3dddbcf330
--- /dev/null
+++ b/test/Transforms/Inline/inline_minisize.ll
@@ -0,0 +1,232 @@
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+@data = common global i32* null, align 8
+
+define i32 @fct1(i32 %a) nounwind uwtable ssp {
+entry:
+  %a.addr = alloca i32, align 4
+  %res = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %tmp = load i32* %a.addr, align 4
+  %idxprom = sext i32 %tmp to i64
+  %tmp1 = load i32** @data, align 8
+  %arrayidx = getelementptr inbounds i32* %tmp1, i64 %idxprom
+  %tmp2 = load i32* %arrayidx, align 4
+  %tmp3 = load i32* %a.addr, align 4
+  %add = add nsw i32 %tmp3, 1
+  %idxprom1 = sext i32 %add to i64
+  %tmp4 = load i32** @data, align 8
+  %arrayidx2 = getelementptr inbounds i32* %tmp4, i64 %idxprom1
+  %tmp5 = load i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %tmp2, %tmp5
+  store i32 %mul, i32* %res, align 4
+  store i32 0, i32* %i, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %tmp6 = load i32* %i, align 4
+  %tmp7 = load i32* %res, align 4
+  %cmp = icmp slt i32 %tmp6, %tmp7
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tmp8 = load i32* %i, align 4
+  %idxprom3 = sext i32 %tmp8 to i64
+  %tmp9 = load i32** @data, align 8
+  %arrayidx4 = getelementptr inbounds i32* %tmp9, i64 %idxprom3
+  call void @fct0(i32* %arrayidx4)
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %tmp10 = load i32* %i, align 4
+  %inc = add nsw i32 %tmp10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, i32* %i, align 4
+  br label %for.cond5
+
+for.cond5:                                        ; preds = %for.inc10, %for.end
+  %tmp11 = load i32* %i, align 4
+  %tmp12 = load i32* %res, align 4
+  %cmp6 = icmp slt i32 %tmp11, %tmp12
+  br i1 %cmp6, label %for.body7, label %for.end12
+
+for.body7:                                        ; preds = %for.cond5
+  %tmp13 = load i32* %i, align 4
+  %idxprom8 = sext i32 %tmp13 to i64
+  %tmp14 = load i32** @data, align 8
+  %arrayidx9 = getelementptr inbounds i32* %tmp14, i64 %idxprom8
+  call void @fct0(i32* %arrayidx9)
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.body7
+  %tmp15 = load i32* %i, align 4
+  %inc11 = add nsw i32 %tmp15, 1
+  store i32 %inc11, i32* %i, align 4
+  br label %for.cond5
+
+for.end12:                                        ; preds = %for.cond5
+  store i32 0, i32* %i, align 4
+  br label %for.cond13
+
+for.cond13:                                       ; preds = %for.inc18, %for.end12
+  %tmp16 = load i32* %i, align 4
+  %tmp17 = load i32* %res, align 4
+  %cmp14 = icmp slt i32 %tmp16, %tmp17
+  br i1 %cmp14, label %for.body15, label %for.end20
+
+for.body15:                                       ; preds = %for.cond13
+  %tmp18 = load i32* %i, align 4
+  %idxprom16 = sext i32 %tmp18 to i64
+  %tmp19 = load i32** @data, align 8
+  %arrayidx17 = getelementptr inbounds i32* %tmp19, i64 %idxprom16
+  call void @fct0(i32* %arrayidx17)
+  br label %for.inc18
+
+for.inc18:                                        ; preds = %for.body15
+  %tmp20 = load i32* %i, align 4
+  %inc19 = add nsw i32 %tmp20, 1
+  store i32 %inc19, i32* %i, align 4
+  br label %for.cond13
+
+for.end20:                                        ; preds = %for.cond13
+  %tmp21 = load i32* %res, align 4
+  ret i32 %tmp21
+}
+
+declare void @fct0(i32*)
+
+define i32 @fct2(i32 %a) nounwind uwtable inlinehint ssp {
+entry:
+  %a.addr = alloca i32, align 4
+  %res = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %tmp = load i32* %a.addr, align 4
+  %shl = shl i32 %tmp, 1
+  %idxprom = sext i32 %shl to i64
+  %tmp1 = load i32** @data, align 8
+  %arrayidx = getelementptr inbounds i32* %tmp1, i64 %idxprom
+  %tmp2 = load i32* %arrayidx, align 4
+  %tmp3 = load i32* %a.addr, align 4
+  %shl1 = shl i32 %tmp3, 1
+  %add = add nsw i32 %shl1, 13
+  %idxprom2 = sext i32 %add to i64
+  %tmp4 = load i32** @data, align 8
+  %arrayidx3 = getelementptr inbounds i32* %tmp4, i64 %idxprom2
+  %tmp5 = load i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %tmp2, %tmp5
+  store i32 %mul, i32* %res, align 4
+  store i32 0, i32* %i, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %tmp6 = load i32* %i, align 4
+  %tmp7 = load i32* %res, align 4
+  %cmp = icmp slt i32 %tmp6, %tmp7
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tmp8 = load i32* %i, align 4
+  %idxprom4 = sext i32 %tmp8 to i64
+  %tmp9 = load i32** @data, align 8
+  %arrayidx5 = getelementptr inbounds i32* %tmp9, i64 %idxprom4
+  call void @fct0(i32* %arrayidx5)
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %tmp10 = load i32* %i, align 4
+  %inc = add nsw i32 %tmp10, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, i32* %i, align 4
+  br label %for.cond6
+
+for.cond6:                                        ; preds = %for.inc11, %for.end
+  %tmp11 = load i32* %i, align 4
+  %tmp12 = load i32* %res, align 4
+  %cmp7 = icmp slt i32 %tmp11, %tmp12
+  br i1 %cmp7, label %for.body8, label %for.end13
+
+for.body8:                                        ; preds = %for.cond6
+  %tmp13 = load i32* %i, align 4
+  %idxprom9 = sext i32 %tmp13 to i64
+  %tmp14 = load i32** @data, align 8
+  %arrayidx10 = getelementptr inbounds i32* %tmp14, i64 %idxprom9
+  call void @fct0(i32* %arrayidx10)
+  br label %for.inc11
+
+for.inc11:                                        ; preds = %for.body8
+  %tmp15 = load i32* %i, align 4
+  %inc12 = add nsw i32 %tmp15, 1
+  store i32 %inc12, i32* %i, align 4
+  br label %for.cond6
+
+for.end13:                                        ; preds = %for.cond6
+  store i32 0, i32* %i, align 4
+  br label %for.cond14
+
+for.cond14:                                       ; preds = %for.inc19, %for.end13
+  %tmp16 = load i32* %i, align 4
+  %tmp17 = load i32* %res, align 4
+  %cmp15 = icmp slt i32 %tmp16, %tmp17
+  br i1 %cmp15, label %for.body16, label %for.end21
+
+for.body16:                                       ; preds = %for.cond14
+  %tmp18 = load i32* %i, align 4
+  %idxprom17 = sext i32 %tmp18 to i64
+  %tmp19 = load i32** @data, align 8
+  %arrayidx18 = getelementptr inbounds i32* %tmp19, i64 %idxprom17
+  call void @fct0(i32* %arrayidx18)
+  br label %for.inc19
+
+for.inc19:                                        ; preds = %for.body16
+  %tmp20 = load i32* %i, align 4
+  %inc20 = add nsw i32 %tmp20, 1
+  store i32 %inc20, i32* %i, align 4
+  br label %for.cond14
+
+for.end21:                                        ; preds = %for.cond14
+  %tmp21 = load i32* %res, align 4
+  ret i32 %tmp21
+}
+
+define i32 @fct3(i32 %c) nounwind uwtable ssp {
+entry:
+  ;CHECK: @fct3
+  ;CHECK: call i32 @fct1
+  ; The inline keyword (inlinehint on fct2) gives sufficient benefit to inline fct2
+  ;CHECK-NOT: call i32 @fct2
+  %c.addr = alloca i32, align 4
+  store i32 %c, i32* %c.addr, align 4
+  %tmp = load i32* %c.addr, align 4
+  %call = call i32 @fct1(i32 %tmp)
+  %tmp1 = load i32* %c.addr, align 4
+  %call1 = call i32 @fct2(i32 %tmp1)
+  %add = add nsw i32 %call, %call1
+  ret i32 %add
+}
+
+define i32 @fct4(i32 %c) minsize nounwind uwtable ssp {
+entry:
+  ;CHECK: @fct4
+  ;CHECK: call i32 @fct1
+  ; With Oz (minsize attribute), the benefit of inlining fct2
+  ; is the same as fct1, thus no inlining for fct2
+  ;CHECK: call i32 @fct2
+  %c.addr = alloca i32, align 4
+  store i32 %c, i32* %c.addr, align 4
+  %tmp = load i32* %c.addr, align 4
+  %call = call i32 @fct1(i32 %tmp)
+  %tmp1 = load i32* %c.addr, align 4
+  %call1 = call i32 @fct2(i32 %tmp1)
+  %add = add nsw i32 %call, %call1
+  ret i32 %add
+}
diff --git a/test/Transforms/Inline/lifetime-no-datalayout.ll b/test/Transforms/Inline/lifetime-no-datalayout.ll
index 9ad14282f9..f4ffef3850 100644
--- a/test/Transforms/Inline/lifetime-no-datalayout.ll
+++ b/test/Transforms/Inline/lifetime-no-datalayout.ll
@@ -1,4 +1,4 @@
-; RUN: opt -inline %s -S -o - | FileCheck %s
+; RUN: opt -inline -S < %s | FileCheck %s
 
 declare void @use(i8* %a)
 
diff --git a/test/Transforms/Inline/lifetime.ll b/test/Transforms/Inline/lifetime.ll
index fb520498c4..fc73385295 100644
--- a/test/Transforms/Inline/lifetime.ll
+++ b/test/Transforms/Inline/lifetime.ll
@@ -1,4 +1,4 @@
-; RUN: opt -inline %s -S -o - | FileCheck %s
+; RUN: opt -inline -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 declare void @llvm.lifetime.start(i64, i8*)
diff --git a/test/Transforms/Inline/noinline-recursive-fn.ll b/test/Transforms/Inline/noinline-recursive-fn.ll
index 6cde0e27fd..5520093ee4 100644
--- a/test/Transforms/Inline/noinline-recursive-fn.ll
+++ b/test/Transforms/Inline/noinline-recursive-fn.ll
@@ -2,7 +2,7 @@
 ; This effectively is just peeling off the first iteration of a loop, and the
 ; inliner heuristics are not set up for this.
 
-; RUN: opt -inline %s -S | FileCheck %s
+; RUN: opt -inline -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.3"
diff --git a/test/Transforms/Inline/noinline.ll b/test/Transforms/Inline/noinline.ll
index dc3f6e0030..7667114b68 100644
--- a/test/Transforms/Inline/noinline.ll
+++ b/test/Transforms/Inline/noinline.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -inline -S | FileCheck %s
+; RUN: opt -inline -S < %s | FileCheck %s
 ; PR6682
 declare void @foo() nounwind
 
diff --git a/test/Transforms/Inline/recursive.ll b/test/Transforms/Inline/recursive.ll
index 5fe8d1639c..fe1c041af9 100644
--- a/test/Transforms/Inline/recursive.ll
+++ b/test/Transforms/Inline/recursive.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -inline -S | FileCheck %s
+; RUN: opt -inline -S < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin10.0"
diff --git a/test/Transforms/InstCombine/2008-05-08-StrLenSink.ll b/test/Transforms/InstCombine/2008-05-08-StrLenSink.ll
index 1da28562aa..d266164fd8 100644
--- a/test/Transforms/InstCombine/2008-05-08-StrLenSink.ll
+++ b/test/Transforms/InstCombine/2008-05-08-StrLenSink.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -instcombine %s | FileCheck %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
 ; PR2297
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
diff --git a/test/Transforms/InstCombine/2010-03-03-ExtElim.ll b/test/Transforms/InstCombine/2010-03-03-ExtElim.ll
index 2df12d670a..bb3159e1e6 100644
--- a/test/Transforms/InstCombine/2010-03-03-ExtElim.ll
+++ b/test/Transforms/InstCombine/2010-03-03-ExtElim.ll
@@ -1,4 +1,4 @@
-; RUN: opt -instcombine -S %s | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s
 ; PR6486
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
diff --git a/test/Transforms/InstCombine/2010-05-30-memcpy-Struct.ll b/test/Transforms/InstCombine/2010-05-30-memcpy-Struct.ll
index b75fa5ad40..09a96749f2 100644
--- a/test/Transforms/InstCombine/2010-05-30-memcpy-Struct.ll
+++ b/test/Transforms/InstCombine/2010-05-30-memcpy-Struct.ll
@@ -1,4 +1,4 @@
-; RUN: opt -instcombine %s -S -o - | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s
 ; PR7265
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll b/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll
new file mode 100644
index 0000000000..fc29b095e5
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+define <4 x i32> @foo(<4 x i32*>* %in) {
+  %t17 = load <4 x i32*>* %in, align 8
+  %t18 = icmp eq <4 x i32*> %t17, zeroinitializer
+  %t19 = zext <4 x i1> %t18 to <4 x i32>
+  ret <4 x i32> %t19
+}
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index b4eb69d436..de738bb7c0 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -473,14 +473,12 @@ define i64 @test51(i64 %A, i1 %cond) {
   %F = sext i32 %E to i64
   ret i64 %F
 ; CHECK: @test51
-
-; FIXME: disabled, see PR5997
-; HECK-NEXT: %C = and i64 %A, 4294967294
-; HECK-NEXT: %D = or i64 %A, 1
-; HECK-NEXT: %E = select i1 %cond, i64 %C, i64 %D
-; HECK-NEXT: %sext = shl i64 %E, 32
-; HECK-NEXT: %F = ashr i64 %sext, 32
-; HECK-NEXT: ret i64 %F
+; CHECK-NEXT: %C = and i64 %A, 4294967294
+; CHECK-NEXT: %D = or i64 %A, 1
+; CHECK-NEXT: %E = select i1 %cond, i64 %C, i64 %D
+; CHECK-NEXT: %sext = shl i64 %E, 32
+; CHECK-NEXT: %F = ashr exact i64 %sext, 32
+; CHECK-NEXT: ret i64 %F
 }
 
 define i32 @test52(i64 %A) {
diff --git a/test/Transforms/InstCombine/compare-signs.ll b/test/Transforms/InstCombine/compare-signs.ll
index f8e4911061..72db66e3ab 100644
--- a/test/Transforms/InstCombine/compare-signs.ll
+++ b/test/Transforms/InstCombine/compare-signs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -instcombine -S | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s
 ; PR5438
 
 ; TODO: This should also optimize down.
diff --git a/test/Transforms/InstCombine/devirt.ll b/test/Transforms/InstCombine/devirt.ll
index 6189dc2af4..9c7cf5d697 100644
--- a/test/Transforms/InstCombine/devirt.ll
+++ b/test/Transforms/InstCombine/devirt.ll
@@ -1,4 +1,4 @@
-; RUN: opt -instcombine -S -o - %s | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s
 
 ; CHECK-NOT: getelementptr
 ; CHECK-NOT: ptrtoint
diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll
index b6a15677bb..df0455a203 100644
--- a/test/Transforms/InstCombine/fast-math.ll
+++ b/test/Transforms/InstCombine/fast-math.ll
@@ -3,19 +3,17 @@
 ; testing-case "float fold(float a) { return 1.2f * a * 2.3f; }"
 ; 1.2f and 2.3f is supposed to be fold.
 define float @fold(float %a) {
-fold:
   %mul = fmul fast float %a, 0x3FF3333340000000
   %mul1 = fmul fast float %mul, 0x4002666660000000
   ret float %mul1
-; CHECK: fold
+; CHECK: @fold
 ; CHECK: fmul float %a, 0x4006147AE0000000
 }
 
 ; Same testing-case as the one used in fold() except that the operators have
 ; fixed FP mode.
 define float @notfold(float %a) {
-notfold:
-; CHECK: notfold
+; CHECK: @notfold
 ; CHECK: %mul = fmul fast float %a, 0x3FF3333340000000
   %mul = fmul fast float %a, 0x3FF3333340000000
   %mul1 = fmul float %mul, 0x4002666660000000
@@ -23,10 +21,238 @@ notfold:
 }
 
 define float @fold2(float %a) {
-fold2:
-; CHECK: fold2
+; CHECK: @fold2
 ; CHECK: fmul float %a, 0x4006147AE0000000
   %mul = fmul float %a, 0x3FF3333340000000
   %mul1 = fmul fast float %mul, 0x4002666660000000
   ret float %mul1
 }
+
+; C * f1 + f1 = (C+1) * f1
+define double @fold3(double %f1) {
+  %t1 = fmul fast double 2.000000e+00, %f1
+  %t2 = fadd fast double %f1, %t1
+  ret double %t2
+; CHECK: @fold3
+; CHECK: fmul fast double %f1, 3.000000e+00
+}
+
+; (C1 - X) + (C2 - Y) => (C1+C2) - (X + Y)
+define float @fold4(float %f1, float %f2) {
+  %sub = fsub float 4.000000e+00, %f1
+  %sub1 = fsub float 5.000000e+00, %f2
+  %add = fadd fast float %sub, %sub1
+  ret float %add
+; CHECK: @fold4
+; CHECK: %1 = fadd fast float %f1, %f2
+; CHECK: fsub fast float 9.000000e+00, %1
+}
+
+; (X + C1) + C2 => X + (C1 + C2)
+define float @fold5(float %f1, float %f2) {
+  %add = fadd float %f1, 4.000000e+00
+  %add1 = fadd fast float %add, 5.000000e+00
+  ret float %add1
+; CHECK: @fold5
+; CHECK: fadd float %f1, 9.000000e+00
+}
+
+; (X + X) + X => 3.0 * X
+define float @fold6(float %f1) {
+  %t1 = fadd fast float %f1, %f1
+  %t2 = fadd fast float %f1, %t1
+  ret float %t2
+; CHECK: @fold6
+; CHECK: fmul fast float %f1, 3.000000e+00
+}
+
+; C1 * X + (X + X) = (C1 + 2) * X
+define float @fold7(float %f1) {
+  %t1 = fmul fast float %f1, 5.000000e+00
+  %t2 = fadd fast float %f1, %f1
+  %t3 = fadd fast float %t1, %t2
+  ret float %t3
+; CHECK: @fold7
+; CHECK: fmul fast float %f1, 7.000000e+00
+}
+
+; (X + X) + (X + X) => 4.0 * X
+define float @fold8(float %f1) {
+  %t1 = fadd fast float %f1, %f1
+  %t2 = fadd fast float %f1, %f1
+  %t3 = fadd fast float %t1, %t2
+  ret float %t3
+; CHECK: @fold8
+; CHECK: fmul fast float %f1, 4.000000e+00
+}
+
+; X - (X + Y) => 0 - Y
+define float @fold9(float %f1, float %f2) {
+  %t1 = fadd float %f1, %f2
+  %t3 = fsub fast float %f1, %t1
+  ret float %t3
+
+; CHECK: @fold9
+; CHECK: fsub fast float 0.000000e+00, %f2
+}
+
+; Let C3 = C1 + C2. (f1 + C1) + (f2 + C2) => (f1 + f2) + C3 instead of
+; "(f1 + C3) + f2" or "(f2 + C3) + f1". Placing constant-addend at the 
+; top of resulting simplified expression tree may potentially reveal some
+; optimization opportunities in the super-expression trees.
+; 
+define float @fold10(float %f1, float %f2) {
+  %t1 = fadd fast float 2.000000e+00, %f1
+  %t2 = fsub fast float %f2, 3.000000e+00
+  %t3 = fadd fast float %t1, %t2
+  ret float %t3
+; CHECK: @fold10
+; CHECK: %t3 = fadd float %t2, -1.000000e+00
+; CHECK: ret float %t3
+}
+
+; These once caused a crash/miscompilation.
+define float @fail1(float %f1, float %f2) {
+  %conv3 = fadd fast float %f1, -1.000000e+00
+  %add = fadd fast float %conv3, %conv3
+  %add2 = fadd fast float %add, %conv3
+  ret float %add2
+; CHECK: @fail1
+; CHECK: ret
+}
+
+define double @fail2(double %f1, double %f2) {
+  %t1 = fsub fast double %f1, %f2
+  %t2 = fadd fast double %f1, %f2
+  %t3 = fsub fast double %t1, %t2
+  ret double %t3
+; CHECK: @fail2
+; CHECK: ret
+}
+
+; rdar://12753946:  x * (cond ? 1.0 : 0.0) => cond ? x : 0.0
+define double @select1(i32 %cond, double %x, double %y) {
+  %tobool = icmp ne i32 %cond, 0
+  %cond1 = select i1 %tobool, double 1.000000e+00, double 0.000000e+00
+  %mul = fmul nnan nsz double %cond1, %x
+  %add = fadd double %mul, %y
+  ret double %add
+; CHECK: @select1
+; CHECK: select i1 %tobool, double %x, double 0.000000e+00
+}
+
+define double @select2(i32 %cond, double %x, double %y) {
+  %tobool = icmp ne i32 %cond, 0
+  %cond1 = select i1 %tobool, double 0.000000e+00, double 1.000000e+00
+  %mul = fmul nnan nsz double %cond1, %x
+  %add = fadd double %mul, %y
+  ret double %add
+; CHECK: @select2
+; CHECK: select i1 %tobool, double 0.000000e+00, double %x
+}
+
+define double @select3(i32 %cond, double %x, double %y) {
+  %tobool = icmp ne i32 %cond, 0
+  %cond1 = select i1 %tobool, double 0.000000e+00, double 2.000000e+00
+  %mul = fmul nnan nsz double %cond1, %x
+  %add = fadd double %mul, %y
+  ret double %add
+; CHECK: @select3
+; CHECK: fmul nnan nsz double %cond1, %x
+}
+
+; =========================================================================
+;
+;   Testing-cases about fmul begin
+;
+; =========================================================================
+
+; ((X*C1) + C2) * C3 => (X * (C1*C3)) + (C2*C3) (i.e. distribution)
+define float @fmul_distribute1(float %f1) {
+  %t1 = fmul float %f1, 6.0e+3
+  %t2 = fadd float %t1, 2.0e+3
+  %t3 = fmul fast float %t2, 5.0e+3
+  ret float %t3 
+; CHECK: @fmul_distribute1
+; CHECK: %1 = fmul fast float %f1, 3.000000e+07
+; CHECK: %t3 = fadd fast float %1, 1.000000e+07
+}
+
+; (X/C1 + C2) * C3 => X/(C1/C3) + C2*C3
+define double @fmul_distribute2(double %f1, double %f2) {
+  %t1 = fdiv double %f1, 3.0e+0
+  %t2 = fadd double %t1, 5.0e+1
+  ; 0x10000000000000 = DBL_MIN
+  %t3 = fmul fast double %t2, 0x10000000000000
+  ret double %t3
+
+; CHECK: @fmul_distribute2
+; CHECK: %1 = fdiv fast double %f1, 0x7FE8000000000000
+; CHECK: fadd fast double %1, 0x69000000000000
+}
+
+; 5.0e-1 * DBL_MIN yields a denormal, so "(f1/3.0 + 5.0e-1) * DBL_MIN" cannot
+; be simplified into "f1 / (3.0/DBL_MIN) + (5.0e-1*DBL_MIN)".
+define double @fmul_distribute3(double %f1) {
+  %t1 = fdiv double %f1, 3.0e+0
+  %t2 = fadd double %t1, 5.0e-1
+  %t3 = fmul fast double %t2, 0x10000000000000
+  ret double %t3
+
+; CHECK: @fmul_distribute3
+; CHECK: fmul fast double %t2, 0x10000000000000
+}
+
+; C1/X * C2 => (C1*C2) / X
+define float @fmul2(float %f1) {
+  %t1 = fdiv float 2.0e+3, %f1 
+  %t3 = fmul fast float %t1, 6.0e+3
+  ret float %t3 
+; CHECK: @fmul2
+; CHECK: fdiv fast float 1.200000e+07, %f1
+}
+
+; X/C1 * C2 => X * (C2/C1) (if C2/C1 is normal Fp)
+define float @fmul3(float %f1, float %f2) {
+  %t1 = fdiv float %f1, 2.0e+3
+  %t3 = fmul fast float %t1, 6.0e+3
+  ret float %t3 
+; CHECK: @fmul3
+; CHECK: fmul fast float %f1, 3.000000e+00
+}
+
+; Rule "X/C1 * C2 => X * (C2/C1)" is not applicable if C2/C1 is either a special
+; value or a denormal. The constant 0x3810000000000000 used here is FLT_MIN.
+;
+define float @fmul4(float %f1, float %f2) {
+  %t1 = fdiv float %f1, 2.0e+3
+  %t3 = fmul fast float %t1, 0x3810000000000000
+  ret float %t3 
+; CHECK: @fmul4
+; CHECK: fmul fast float %t1, 0x3810000000000000
+}
+
+; X / C1 * C2 => X / (C1/C2) if C2/C1 is either a special value or a denormal,
+;  and C1/C2 is a normal value.
+; 
+define float @fmul5(float %f1, float %f2) {
+  %t1 = fdiv float %f1, 3.0e+0
+  %t3 = fmul fast float %t1, 0x3810000000000000
+  ret float %t3 
+; CHECK: @fmul5
+; CHECK: fdiv fast float %f1, 0x47E8000000000000
+}
+
+; =========================================================================
+;
+;   Testing-cases about negation
+;
+; =========================================================================
+define float @fneg1(float %f1, float %f2) {
+  %sub = fsub float -0.000000e+00, %f1
+  %sub1 = fsub nsz float 0.000000e+00, %f2
+  %mul = fmul float %sub, %sub1
+  ret float %mul
+; CHECK: @fneg1
+; CHECK: fmul float %f1, %f2
+}
diff --git a/test/Transforms/InstCombine/fold-phi.ll b/test/Transforms/InstCombine/fold-phi.ll
new file mode 100644
index 0000000000..bd01d58aa5
--- /dev/null
+++ b/test/Transforms/InstCombine/fold-phi.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; CHECK: no_crash
+define float @no_crash(float %a) nounwind {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.057 = phi float [ 0.000000e+00, %entry ], [ %add5, %bb0 ]
+  %add5 = fadd float %sum.057, %a    ; PR14592
+  br i1 undef, label %bb0, label %end
+
+bb0:
+  br label %for.body
+
+end:
+  ret float %add5
+}
+
+; CHECK: fold_phi
+define float @fold_phi(float %a) nounwind {
+entry:
+  br label %for.body
+
+for.body:
+; CHECK: phi float
+; CHECK-NEXT: br i1 undef
+  %sum.057 = phi float [ 0.000000e+00, %entry ], [ %add5, %bb0 ]
+  %add5 = fadd float %sum.057, 1.0 ;; Should be moved to the latch!
+  br i1 undef, label %bb0, label %end
+
+; CHECK: bb0:
+bb0:
+; CHECK: fadd float
+  br label %for.body
+
+end:
+  ret float %add5
+}
diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll
index bc6aa0a689..09f053289d 100644
--- a/test/Transforms/InstCombine/fpcast.ll
+++ b/test/Transforms/InstCombine/fpcast.ll
@@ -13,3 +13,22 @@ define i8 @test2() {
 ; CHECK: ret i8 -1
 }
 
+; CHECK: test3
+define half @test3(float %a) {
+; CHECK: fptrunc
+; CHECK: llvm.fabs.f16
+  %b = call float @llvm.fabs.f32(float %a)
+  %c = fptrunc float %b to half
+  ret half %c
+}
+
+; CHECK: test4
+define half @test4(float %a) {
+; CHECK: fptrunc
+; CHECK: fsub
+  %b = fsub float -0.0, %a
+  %c = fptrunc float %b to half
+  ret half %c
+}
+
+declare float @llvm.fabs.f32(float) nounwind readonly
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 8e064a4f2f..8fb6144c3f 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -677,3 +677,32 @@ define i1 @test66(i64 %A, i64 %B) {
 ; CHECK-NEXT: ret i1 true
   ret i1 %cmp
 }
+
+; CHECK: @test67
+; CHECK: %and = and i32 %x, 96
+; CHECK: %cmp = icmp ne i32 %and, 0
+define i1 @test67(i32 %x) nounwind uwtable {
+  %and = and i32 %x, 127
+  %cmp = icmp sgt i32 %and, 31
+  ret i1 %cmp
+}
+
+; CHECK: @test68
+; CHECK: %cmp = icmp ugt i32 %and, 30
+define i1 @test68(i32 %x) nounwind uwtable {
+  %and = and i32 %x, 127
+  %cmp = icmp sgt i32 %and, 30
+  ret i1 %cmp
+}
+
+; PR14708
+; CHECK: @test69
+; CHECK: %1 = and i32 %c, -33
+; CHECK: %2 = icmp eq i32 %1, 65
+; CHECK: ret i1 %2
+define i1 @test69(i32 %c) nounwind uwtable {
+  %1 = icmp eq i32 %c, 97
+  %2 = icmp eq i32 %c, 65
+  %3 = or i1 %1, %2
+  ret i1 %3
+}
diff --git a/test/Transforms/InstCombine/idioms.ll b/test/Transforms/InstCombine/idioms.ll
index 6b3567fc6e..1a211668c3 100644
--- a/test/Transforms/InstCombine/idioms.ll
+++ b/test/Transforms/InstCombine/idioms.ll
@@ -1,4 +1,4 @@
-; RUN: opt -instcombine %s -S | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s
 
 ; Check that code corresponding to the following C function is
 ; simplified into a single ASR operation:
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 382e6b3857..93f0a953fd 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -instcombine -S | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s
 
 %overflow.result = type {i8, i1}
 
diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll
index 4e3217dc2d..cd12b29b11 100644
--- a/test/Transforms/InstCombine/malloc-free-delete.ll
+++ b/test/Transforms/InstCombine/malloc-free-delete.ll
@@ -91,3 +91,32 @@ define void @test5(i8* %ptr, i8** %esc) {
   store volatile i8 4, i8* %g
   ret void
 }
+
+;; When a basic block contains only a call to free and this block is reached
+;; through a test of the argument of free against null, move the call into the
+;; predecessor block.
+;; Running simplifycfg will then remove the empty basic block and the branch.
+;; After that, dead code elimination will remove the comparison.
+;; This is what happens at -O1 and above.
+; CHECK: @test6
+define void @test6(i8* %foo) minsize {
+; CHECK:  %tobool = icmp eq i8* %foo, null
+;; Call to free moved
+; CHECK-NEXT: tail call void @free(i8* %foo)
+; CHECK-NEXT: br i1 %tobool, label %if.end, label %if.then
+; CHECK: if.then:
+;; Block is now empty and may be simplified by simplifycfg
+; CHECK-NEXT:   br label %if.end
+; CHECK: if.end:
+; CHECK-NEXT:  ret void
+entry:
+  %tobool = icmp eq i8* %foo, null
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void @free(i8* %foo)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
diff --git a/test/Transforms/InstCombine/obfuscated_splat.ll b/test/Transforms/InstCombine/obfuscated_splat.ll
index c25dade168..fa9cb423d0 100644
--- a/test/Transforms/InstCombine/obfuscated_splat.ll
+++ b/test/Transforms/InstCombine/obfuscated_splat.ll
@@ -1,4 +1,4 @@
-; RUN: opt -instcombine -S %s | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s
 
 define void @test(<4 x float> *%in_ptr, <4 x float> *%out_ptr) {
   %A = load <4 x float>* %in_ptr, align 16
diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll
index 31a3cb46e4..0ead9d1237 100644
--- a/test/Transforms/InstCombine/objsize.ll
+++ b/test/Transforms/InstCombine/objsize.ll
@@ -256,3 +256,131 @@ xpto:
 return:
   ret i32 7
 }
+
+declare noalias i8* @valloc(i32) nounwind
+
+; CHECK: @test14
+; CHECK: ret i32 6
+define i32 @test14(i32 %a) nounwind {
+  switch i32 %a, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb1
+  ]
+
+sw.bb:
+  %call = tail call noalias i8* @malloc(i32 6) nounwind
+  br label %sw.epilog
+
+sw.bb1:
+  %call2 = tail call noalias i8* @calloc(i32 3, i32 2) nounwind
+  br label %sw.epilog
+
+sw.default:
+  %call3 = tail call noalias i8* @valloc(i32 6) nounwind
+  br label %sw.epilog
+
+sw.epilog:
+  %b.0 = phi i8* [ %call3, %sw.default ], [ %call2, %sw.bb1 ], [ %call, %sw.bb ]
+  %1 = tail call i32 @llvm.objectsize.i32(i8* %b.0, i1 false)
+  ret i32 %1
+}
+
+; CHECK: @test15
+; CHECK: llvm.objectsize
+define i32 @test15(i32 %a) nounwind {
+  switch i32 %a, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb1
+  ]
+
+sw.bb:
+  %call = tail call noalias i8* @malloc(i32 3) nounwind
+  br label %sw.epilog
+
+sw.bb1:
+  %call2 = tail call noalias i8* @calloc(i32 2, i32 1) nounwind
+  br label %sw.epilog
+
+sw.default:
+  %call3 = tail call noalias i8* @valloc(i32 3) nounwind
+  br label %sw.epilog
+
+sw.epilog:
+  %b.0 = phi i8* [ %call3, %sw.default ], [ %call2, %sw.bb1 ], [ %call, %sw.bb ]
+  %1 = tail call i32 @llvm.objectsize.i32(i8* %b.0, i1 false)
+  ret i32 %1
+}
+
+; CHECK: @test16
+; CHECK: llvm.objectsize
+define i32 @test16(i8* %a, i32 %n) nounwind {
+  %b = alloca [5 x i8], align 1
+  %c = alloca [5 x i8], align 1
+  switch i32 %n, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb1
+  ]
+
+sw.bb:
+  %bp = bitcast [5 x i8]* %b to i8*
+  br label %sw.epilog
+
+sw.bb1:
+  %cp = bitcast [5 x i8]* %c to i8*
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %phi = phi i8* [ %a, %sw.default ], [ %cp, %sw.bb1 ], [ %bp, %sw.bb ]
+  %sz = call i32 @llvm.objectsize.i32(i8* %phi, i1 false)
+  ret i32 %sz
+}
+
+; CHECK: @test17
+; CHECK: ret i32 5
+define i32 @test17(i32 %n) nounwind {
+  %b = alloca [5 x i8], align 1
+  %c = alloca [5 x i8], align 1
+  %bp = bitcast [5 x i8]* %b to i8*
+  switch i32 %n, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb1
+  ]
+
+sw.bb:
+  br label %sw.epilog
+
+sw.bb1:
+  %cp = bitcast [5 x i8]* %c to i8*
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %phi = phi i8* [ %bp, %sw.default ], [ %cp, %sw.bb1 ], [ %bp, %sw.bb ]
+  %sz = call i32 @llvm.objectsize.i32(i8* %phi, i1 false)
+  ret i32 %sz
+}
+
+@globalalias = alias internal [60 x i8]* @a
+
+; CHECK: @test18
+; CHECK-NEXT: ret i32 60
+define i32 @test18() {
+  %bc = bitcast [60 x i8]* @globalalias to i8*
+  %1 = call i32 @llvm.objectsize.i32(i8* %bc, i1 false)
+  ret i32 %1
+}
+
+@globalalias2 = alias weak [60 x i8]* @a
+
+; CHECK: @test19
+; CHECK: llvm.objectsize
+define i32 @test19() {
+  %bc = bitcast [60 x i8]* @globalalias2 to i8*
+  %1 = call i32 @llvm.objectsize.i32(i8* %bc, i1 false)
+  ret i32 %1
+}
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index 32867761a3..41f8aa9ee8 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -735,3 +735,13 @@ define i32 @test61(i32 %x) {
 ; CHECK: @test61
 ; CHECK: ashr i32 %x, 4
 }
+
+; propagate "exact" trait
+define i32 @test62(i32 %x) {
+  %shr = ashr exact i32 %x, 4
+  %shl = shl i32 %shr, 1
+  %or = or i32 %shl, 1
+  ret i32 %or
+; CHECK: @test62
+; CHECK: ashr exact i32 %x, 3
+}
diff --git a/test/Transforms/InstCombine/sink_instruction.ll b/test/Transforms/InstCombine/sink_instruction.ll
index e521de208f..5c4019a98d 100644
--- a/test/Transforms/InstCombine/sink_instruction.ll
+++ b/test/Transforms/InstCombine/sink_instruction.ll
@@ -1,4 +1,4 @@
-; RUN: opt -instcombine %s -S | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck %s
 
 ;; This tests that the instructions in the entry blocks are sunk into each
 ;; arm of the 'if'.
diff --git a/test/Transforms/InstCombine/sqrt.ll b/test/Transforms/InstCombine/sqrt.ll
index cc78417ebb..440b974851 100644
--- a/test/Transforms/InstCombine/sqrt.ll
+++ b/test/Transforms/InstCombine/sqrt.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -instcombine %s | FileCheck %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
 
 define float @test1(float %x) nounwind readnone ssp {
 entry:
diff --git a/test/Transforms/InstCombine/store.ll b/test/Transforms/InstCombine/store.ll
index 64460d7a6d..164ba76326 100644
--- a/test/Transforms/InstCombine/store.ll
+++ b/test/Transforms/InstCombine/store.ll
@@ -83,3 +83,37 @@ Cont:
 ; CHECK-NEXT:  ret void
 }
 
+
+; PR14753 - merging two stores should preserve the TBAA tag.
+define void @test6(i32 %n, float* %a, i32* %gi) nounwind uwtable ssp {
+entry:
+  store i32 42, i32* %gi, align 4, !tbaa !0
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32* %gi, align 4, !tbaa !0
+  %cmp = icmp slt i32 %0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %0 to i64
+  %arrayidx = getelementptr inbounds float* %a, i64 %idxprom
+  store float 0.000000e+00, float* %arrayidx, align 4, !tbaa !3
+  %1 = load i32* %gi, align 4, !tbaa !0
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* %gi, align 4, !tbaa !0
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+; CHECK: @test6
+; CHECK: for.cond:
+; CHECK-NEXT: phi i32 [ 42
+; CHECK-NEXT: store i32 %storemerge, i32* %gi, align 4, !tbaa !0
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"float", metadata !1}
diff --git a/test/Transforms/InstCombine/vector_gep1.ll b/test/Transforms/InstCombine/vector_gep1.ll
index f4c75c8009..90ca26212f 100644
--- a/test/Transforms/InstCombine/vector_gep1.ll
+++ b/test/Transforms/InstCombine/vector_gep1.ll
@@ -1,5 +1,5 @@
-; RUN: opt -instcombine %s -disable-output
-; RUN: opt -instsimplify %s -disable-output
+; RUN: opt -instcombine -disable-output < %s
+; RUN: opt -instsimplify -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/Transforms/InstSimplify/call.ll b/test/Transforms/InstSimplify/call.ll
new file mode 100644
index 0000000000..1a8d0c25bd
--- /dev/null
+++ b/test/Transforms/InstSimplify/call.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+declare {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
+
+define i1 @test_uadd1() {
+; CHECK: @test_uadd1
+  %x = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 254, i8 3)
+  %overflow = extractvalue {i8, i1} %x, 1
+  ret i1 %overflow
+; CHECK-NEXT: ret i1 true
+}
+
+define i8 @test_uadd2() {
+; CHECK: @test_uadd2
+  %x = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 254, i8 44)
+  %result = extractvalue {i8, i1} %x, 0
+  ret i8 %result
+; CHECK-NEXT: ret i8 42
+}
+
+declare i256 @llvm.cttz.i256(i256 %src, i1 %is_zero_undef)
+
+define i256 @test_cttz() {
+; CHECK: @test_cttz
+  %x = call i256 @llvm.cttz.i256(i256 10, i1 false)
+  ret i256 %x
+; CHECK-NEXT: ret i256 1
+}
+
+declare i256 @llvm.ctpop.i256(i256 %src)
+
+define i256 @test_ctpop() {
+; CHECK: @test_ctpop
+  %x = call i256 @llvm.ctpop.i256(i256 10)
+  ret i256 %x
+; CHECK-NEXT: ret i256 2
+}
+
+; Test a non-intrinsic that we know about as a library call.
+declare float @fabs(float %x)
+
+define float @test_fabs_libcall() {
+; CHECK: @test_fabs_libcall
+
+  %x = call float @fabs(float -42.0)
+; This is still a real function call, so instsimplify won't nuke it -- other
+; passes have to do that.
+; CHECK-NEXT: call float @fabs
+
+  ret float %x
+; CHECK-NEXT: ret float 4.2{{0+}}e+01
+}
diff --git a/test/Transforms/InstSimplify/fast-math.ll b/test/Transforms/InstSimplify/fast-math.ll
index e4b3ea306a..154b967397 100644
--- a/test/Transforms/InstSimplify/fast-math.ll
+++ b/test/Transforms/InstSimplify/fast-math.ll
@@ -33,3 +33,75 @@ define float @no_mul_zero_3(float %a) {
 ; CHECK: ret float %b
   ret float %b
 }
+
+; fadd [nnan ninf] X, (fsub [nnan ninf] 0, X) ==> 0
+;   where nnan and ninf have to occur at least once somewhere in this
+;   expression
+; CHECK: fadd_fsub_0
+define float @fadd_fsub_0(float %a) {
+; X + -X ==> 0
+  %t1 = fsub nnan ninf float 0.0, %a
+  %zero1 = fadd nnan ninf float %t1, %a
+
+  %t2 = fsub nnan float 0.0, %a
+  %zero2 = fadd ninf float %t2, %a
+
+  %t3 = fsub nnan ninf float 0.0, %a
+  %zero3 = fadd float %t3, %a
+
+  %t4 = fsub float 0.0, %a
+  %zero4 = fadd nnan ninf float %t4, %a
+
+; Don't fold this
+; CHECK: %nofold = fsub float 0.0
+  %nofold = fsub float 0.0, %a
+; CHECK: %no_zero = fadd nnan float %nofold, %a
+  %no_zero = fadd nnan float %nofold, %a
+
+; Coalesce the folded zeros
+  %zero5 = fadd float %zero1, %zero2
+  %zero6 = fadd float %zero3, %zero4
+  %zero7 = fadd float %zero5, %zero6
+
+; Should get folded
+  %ret = fadd nsz float %no_zero, %zero7
+
+; CHECK: ret float %no_zero
+  ret float %ret
+}
+
+; fsub nnan ninf x, x ==> 0.0
+; CHECK: @fsub_x_x
+define float @fsub_x_x(float %a) {
+; X - X ==> 0
+  %zero1 = fsub nnan ninf float %a, %a
+
+; Don't fold
+; CHECK: %no_zero1 = fsub
+  %no_zero1 = fsub ninf float %a, %a
+; CHECK: %no_zero2 = fsub
+  %no_zero2 = fsub nnan float %a, %a
+; CHECK: %no_zero = fadd
+  %no_zero = fadd float %no_zero1, %no_zero2
+
+; Should get folded
+  %ret = fadd nsz float %no_zero, %zero1
+
+; CHECK: ret float %no_zero
+  ret float %ret
+}
+
+; fadd nsz X, 0 ==> X; without nsz (as below) the fadd must not be folded
+; CHECK: @nofold_fadd_x_0
+define float @nofold_fadd_x_0(float %a) {
+; Don't fold
+; CHECK: %no_zero1 = fadd
+  %no_zero1 = fadd ninf float %a, 0.0
+; CHECK: %no_zero2 = fadd
+  %no_zero2 = fadd nnan float %a, 0.0
+; CHECK: %no_zero = fadd
+  %no_zero = fadd float %no_zero1, %no_zero2
+
+; CHECK: ret float %no_zero
+  ret float %no_zero
+}
diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
new file mode 100644
index 0000000000..f9c364cade
--- /dev/null
+++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; fsub -0.0, (fsub -0.0, X) ==> X
+; CHECK: @fsub_0_0_x
+define float @fsub_0_0_x(float %a) {
+  %t1 = fsub float -0.0, %a
+  %ret = fsub float -0.0, %t1
+
+; CHECK: ret float %a
+  ret float %ret
+}
+
+; fsub X, 0 ==> X
+; CHECK: @fsub_x_0
+define float @fsub_x_0(float %a) {
+  %ret = fsub float %a, 0.0
+; CHECK: ret float %a
+  ret float %ret
+}
+
+; fadd X, -0 ==> X
+; CHECK: @fadd_x_n0
+define float @fadd_x_n0(float %a) {
+  %ret = fadd float %a, -0.0
+; CHECK: ret float %a
+  ret float %ret
+}
+
+; fmul X, 1.0 ==> X
+; CHECK: @fmul_X_1
+define double @fmul_X_1(double %a) {
+  %b = fmul double 1.000000e+00, %a                ; <double> [#uses=1]
+  ; CHECK: ret double %a
+  ret double %b
+}
diff --git a/test/Transforms/InstSimplify/vector_gep.ll b/test/Transforms/InstSimplify/vector_gep.ll
index f65260e00f..5ac1ddef64 100644
--- a/test/Transforms/InstSimplify/vector_gep.ll
+++ b/test/Transforms/InstSimplify/vector_gep.ll
@@ -1,4 +1,4 @@
-;RUN: opt -instsimplify %s -disable-output
+;RUN: opt -instsimplify -disable-output < %s
 declare void @helper(<2 x i8*>)
 define void @test(<2 x i8*> %a) {
   %A = getelementptr <2 x i8*> %a, <2 x i32> <i32 0, i32 0>
diff --git a/test/Transforms/JumpThreading/basic.ll b/test/Transforms/JumpThreading/basic.ll
index 46271379bd..93fa29b006 100644
--- a/test/Transforms/JumpThreading/basic.ll
+++ b/test/Transforms/JumpThreading/basic.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -jump-threading -S | FileCheck %s
+; RUN: opt -jump-threading -S < %s | FileCheck %s
 
 declare i32 @f1()
 declare i32 @f2()
@@ -476,3 +476,39 @@ exit1:
 ; CHECK: }
 }
 
+; In this test we check that block duplication is inhibited by the presence
+; of a call marked with the 'noduplicate' attribute.
+
+declare void @g()
+declare void @j()
+declare void @k()
+
+; CHECK: define void @h(i32 %p) {
+define void @h(i32 %p) {
+  %x = icmp ult i32 %p, 5
+  br i1 %x, label %l1, label %l2
+
+l1:
+  call void @j()
+  br label %l3
+
+l2:
+  call void @k()
+  br label %l3
+
+l3:
+; CHECK: call void @g() noduplicate
+; CHECK-NOT: call void @g() noduplicate
+  call void @g() noduplicate
+  %y = icmp ult i32 %p, 5
+  br i1 %y, label %l4, label %l5
+
+l4:
+  call void @j()
+  ret void
+
+l5:
+  call void @k()
+  ret void
+; CHECK: }
+}
diff --git a/test/Transforms/JumpThreading/degenerate-phi.ll b/test/Transforms/JumpThreading/degenerate-phi.ll
index 35d9fdec42..2905b43af7 100644
--- a/test/Transforms/JumpThreading/degenerate-phi.ll
+++ b/test/Transforms/JumpThreading/degenerate-phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt -jump-threading -disable-output %s
+; RUN: opt -jump-threading -disable-output < %s
 ; PR9112
 
 ; This is actually a test for value tracking. Jump threading produces
diff --git a/test/Transforms/JumpThreading/or-undef.ll b/test/Transforms/JumpThreading/or-undef.ll
index 6e359925b6..6311b6df43 100644
--- a/test/Transforms/JumpThreading/or-undef.ll
+++ b/test/Transforms/JumpThreading/or-undef.ll
@@ -1,4 +1,4 @@
-; RUN: opt -jump-threading -S %s | FileCheck %s
+; RUN: opt -jump-threading -S < %s | FileCheck %s
 ; rdar://7620633
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Transforms/LICM/2011-07-06-Alignment.ll b/test/Transforms/LICM/2011-07-06-Alignment.ll
index f97b7010bc..569231489f 100644
--- a/test/Transforms/LICM/2011-07-06-Alignment.ll
+++ b/test/Transforms/LICM/2011-07-06-Alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt -licm -S %s | FileCheck %s
+; RUN: opt -licm -S < %s | FileCheck %s
 
 @A = common global [1024 x float] zeroinitializer, align 4
 
diff --git a/test/Transforms/LICM/crash.ll b/test/Transforms/LICM/crash.ll
index de41d008a7..b43477a56d 100644
--- a/test/Transforms/LICM/crash.ll
+++ b/test/Transforms/LICM/crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -licm %s -disable-output
+; RUN: opt -licm -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll
index 98f93345e3..1ca377eb4a 100644
--- a/test/Transforms/LICM/hoisting.ll
+++ b/test/Transforms/LICM/hoisting.ll
@@ -90,3 +90,29 @@ for.end:                                          ; preds = %for.body
 
 declare void @foo_may_call_exit(i32)
 
+; PR14854
+; CHECK: @test5
+; CHECK: extractvalue
+; CHECK: br label %tailrecurse
+; CHECK: tailrecurse:
+; CHECK: ifend:
+; CHECK: insertvalue
+define { i32*, i32 } @test5(i32 %i, { i32*, i32 } %e) {
+entry:
+  br label %tailrecurse
+
+tailrecurse:                                      ; preds = %then, %entry
+  %i.tr = phi i32 [ %i, %entry ], [ %cmp2, %then ]
+  %out = extractvalue { i32*, i32 } %e, 1
+  %d = insertvalue { i32*, i32 } %e, i32* null, 0
+  %cmp1 = icmp sgt i32 %out, %i.tr
+  br i1 %cmp1, label %then, label %ifend
+
+then:                                             ; preds = %tailrecurse
+  call void @foo()
+  %cmp2 = add i32 %i.tr, 1
+  br label %tailrecurse
+
+ifend:                                            ; preds = %tailrecurse
+  ret { i32*, i32 } %d
+}
diff --git a/test/Transforms/LICM/scalar_promote.ll b/test/Transforms/LICM/scalar_promote.ll
index 05a64d6322..e7eab92aa8 100644
--- a/test/Transforms/LICM/scalar_promote.ll
+++ b/test/Transforms/LICM/scalar_promote.ll
@@ -1,28 +1,28 @@
-; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+; RUN: opt < %s -basicaa -tbaa -licm -S | FileCheck %s
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
-@X = global i32 7		; <i32*> [#uses=4]
+@X = global i32 7   ; <i32*> [#uses=4]
 
 define void @test1(i32 %i) {
 Entry:
-	br label %Loop
+  br label %Loop
 ; CHECK: @test1
 ; CHECK: Entry:
 ; CHECK-NEXT:   load i32* @X
 ; CHECK-NEXT:   br label %Loop
 
 
-Loop:		; preds = %Loop, %0
-	%j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]		; <i32> [#uses=1]
-	%x = load i32* @X		; <i32> [#uses=1]
-	%x2 = add i32 %x, 1		; <i32> [#uses=1]
-	store i32 %x2, i32* @X
-	%Next = add i32 %j, 1		; <i32> [#uses=2]
-	%cond = icmp eq i32 %Next, 0		; <i1> [#uses=1]
-	br i1 %cond, label %Out, label %Loop
+Loop:   ; preds = %Loop, %0
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]    ; <i32> [#uses=1]
+  %x = load i32* @X   ; <i32> [#uses=1]
+  %x2 = add i32 %x, 1   ; <i32> [#uses=1]
+  store i32 %x2, i32* @X
+  %Next = add i32 %j, 1   ; <i32> [#uses=2]
+  %cond = icmp eq i32 %Next, 0    ; <i1> [#uses=1]
+  br i1 %cond, label %Out, label %Loop
 
-Out:	
-	ret void
+Out:
+  ret void
 ; CHECK: Out:
 ; CHECK-NEXT:   store i32 %x2, i32* @X
 ; CHECK-NEXT:   ret void
@@ -31,22 +31,22 @@ Out:
 
 define void @test2(i32 %i) {
 Entry:
-	br label %Loop
+  br label %Loop
 ; CHECK: @test2
 ; CHECK: Entry:
 ; CHECK-NEXT:    %.promoted = load i32* getelementptr inbounds (i32* @X, i64 1)
 ; CHECK-NEXT:    br label %Loop
 
-Loop:		; preds = %Loop, %0
-	%X1 = getelementptr i32* @X, i64 1		; <i32*> [#uses=1]
-	%A = load i32* %X1		; <i32> [#uses=1]
-	%V = add i32 %A, 1		; <i32> [#uses=1]
-	%X2 = getelementptr i32* @X, i64 1		; <i32*> [#uses=1]
-	store i32 %V, i32* %X2
-	br i1 false, label %Loop, label %Exit
+Loop:   ; preds = %Loop, %0
+  %X1 = getelementptr i32* @X, i64 1    ; <i32*> [#uses=1]
+  %A = load i32* %X1    ; <i32> [#uses=1]
+  %V = add i32 %A, 1    ; <i32> [#uses=1]
+  %X2 = getelementptr i32* @X, i64 1    ; <i32*> [#uses=1]
+  store i32 %V, i32* %X2
+  br i1 false, label %Loop, label %Exit
 
-Exit:		; preds = %Loop
-	ret void
+Exit:   ; preds = %Loop
+  ret void
 ; CHECK: Exit:
 ; CHECK-NEXT:   store i32 %V, i32* getelementptr inbounds (i32* @X, i64 1)
 ; CHECK-NEXT:   ret void
@@ -56,19 +56,19 @@ Exit:		; preds = %Loop
 
 define void @test3(i32 %i) {
 ; CHECK: @test3
-	br label %Loop
+  br label %Loop
 Loop:
         ; Should not promote this to a register
-	%x = load volatile i32* @X
-	%x2 = add i32 %x, 1	
-	store i32 %x2, i32* @X
-	br i1 true, label %Out, label %Loop
-        
+  %x = load volatile i32* @X
+  %x2 = add i32 %x, 1
+  store i32 %x2, i32* @X
+  br i1 true, label %Out, label %Loop
+
 ; CHECK: Loop:
 ; CHECK-NEXT: load volatile
 
-Out:		; preds = %Loop
-	ret void
+Out:    ; preds = %Loop
+  ret void
 }
 
 ; PR8041
@@ -120,27 +120,27 @@ exit:
 
 define void @test5(i32 %i, i32** noalias %P2) {
 Entry:
-	br label %Loop
+  br label %Loop
 ; CHECK: @test5
 ; CHECK: Entry:
 ; CHECK-NEXT:   load i32* @X
 ; CHECK-NEXT:   br label %Loop
 
 
-Loop:		; preds = %Loop, %0
-	%j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]		; <i32> [#uses=1]
-	%x = load i32* @X		; <i32> [#uses=1]
-	%x2 = add i32 %x, 1		; <i32> [#uses=1]
-	store i32 %x2, i32* @X
-        
+Loop:   ; preds = %Loop, %0
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]    ; <i32> [#uses=1]
+  %x = load i32* @X   ; <i32> [#uses=1]
+  %x2 = add i32 %x, 1   ; <i32> [#uses=1]
+  store i32 %x2, i32* @X
+
         store volatile i32* @X, i32** %P2
-        
-	%Next = add i32 %j, 1		; <i32> [#uses=2]
-	%cond = icmp eq i32 %Next, 0		; <i1> [#uses=1]
-	br i1 %cond, label %Out, label %Loop
 
-Out:	
-	ret void
+  %Next = add i32 %j, 1   ; <i32> [#uses=2]
+  %cond = icmp eq i32 %Next, 0    ; <i1> [#uses=1]
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
 ; CHECK: Out:
 ; CHECK-NEXT:   store i32 %x2, i32* @X
 ; CHECK-NEXT:   ret void
@@ -148,3 +148,40 @@ Out:
 }
 
 
+; PR14753 - Preserve TBAA tags when promoting values in a loop.
+define void @test6(i32 %n, float* nocapture %a, i32* %gi) {
+entry:
+  store i32 0, i32* %gi, align 4, !tbaa !0
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %storemerge2 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %idxprom = sext i32 %storemerge2 to i64
+  %arrayidx = getelementptr inbounds float* %a, i64 %idxprom
+  store float 0.000000e+00, float* %arrayidx, align 4, !tbaa !3
+  %0 = load i32* %gi, align 4, !tbaa !0
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %gi, align 4, !tbaa !0
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT:  %gi.promoted = load i32* %gi, align 4, !tbaa !0
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT:  store i32 %inc, i32* %gi, align 4, !tbaa !0
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"float", metadata !1}
diff --git a/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll b/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll
index 40c6629e6f..cf9d8ce923 100644
--- a/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll
+++ b/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -loop-deletion -disable-output
+; RUN: opt -loop-deletion -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/test/Transforms/LoopIdiom/X86/popcnt.ll b/test/Transforms/LoopIdiom/X86/popcnt.ll
index 2f458fb2f1..25df93d3a0 100644
--- a/test/Transforms/LoopIdiom/X86/popcnt.ll
+++ b/test/Transforms/LoopIdiom/X86/popcnt.ll
@@ -118,3 +118,23 @@ while.end:                                        ; preds = %while.body, %entry
   %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
   ret i32 %c.0.lcssa
 }
+
+define i32 @PopCntCrash3(i64 %a, i32 %x) {
+entry:
+  %tobool3 = icmp eq i64 %a, 0
+  %cmp = icmp eq i32 %x, 0
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+  %inc = add nsw i32 %c.05, 1
+  %sub = add i64 %a.addr.04, -1
+  %and = and i64 %sub, %a.addr.04
+  %tobool = icmp eq i64 %and, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  ret i32 %c.0.lcssa
+}
diff --git a/test/Transforms/LoopRotate/basic.ll b/test/Transforms/LoopRotate/basic.ll
index b7bcb21d56..78878f9fa6 100644
--- a/test/Transforms/LoopRotate/basic.ll
+++ b/test/Transforms/LoopRotate/basic.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-rotate %s | FileCheck %s
+; RUN: opt -S -loop-rotate < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
 
@@ -33,3 +33,29 @@ for.end:                                          ; preds = %for.cond
 
 declare void @g(i32*)
 
+; CHECK: @test2
+define void @test2() nounwind ssp {
+entry:
+  %array = alloca [20 x i32], align 16
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, 100
+; CHECK: call void @f
+; CHECK-NOT: call void @f
+  call void @f() noduplicate 
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nsw i32 %i.0, 1
+  call void @h()
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+; CHECK: }
+}
+
+declare void @f() noduplicate
+declare void @h()
diff --git a/test/Transforms/LoopRotate/crash.ll b/test/Transforms/LoopRotate/crash.ll
index 954b834765..fd922cb556 100644
--- a/test/Transforms/LoopRotate/crash.ll
+++ b/test/Transforms/LoopRotate/crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-rotate %s -disable-output -verify-dom-info -verify-loop-info
+; RUN: opt -loop-rotate -disable-output -verify-dom-info -verify-loop-info < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/LoopRotate/dbgvalue.ll b/test/Transforms/LoopRotate/dbgvalue.ll
index b32ee82d3a..6a8d30820f 100644
--- a/test/Transforms/LoopRotate/dbgvalue.ll
+++ b/test/Transforms/LoopRotate/dbgvalue.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-rotate  %s  | FileCheck %s
+; RUN: opt -S -loop-rotate < %s | FileCheck %s
 
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
diff --git a/test/Transforms/LoopRotate/phi-duplicate.ll b/test/Transforms/LoopRotate/phi-duplicate.ll
index 7372830922..8ad2dce71a 100644
--- a/test/Transforms/LoopRotate/phi-duplicate.ll
+++ b/test/Transforms/LoopRotate/phi-duplicate.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %s -loop-rotate | FileCheck %s
+; RUN: opt -S -loop-rotate < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0"
 
diff --git a/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll b/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll
index 3793baccbb..53da462716 100644
--- a/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll
+++ b/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll
@@ -1,20 +1,21 @@
-; RUN: opt -loop-reduce -disable-output -debug-only=loop-reduce %s 2> %t
+; RUN: opt -loop-reduce -disable-output -debug-only=loop-reduce < %s 2> %t
 ; RUN: FileCheck %s < %t
 ; REQUIRES: asserts
 ;
 ; PR13361: LSR + SCEV "hangs" on reasonably sized test with sequence of loops
 ;
 ; Without limits on CollectSubexpr, we have thousands of formulae for
-; the use that crosses loops. With limits we have five.
+; the use that crosses loops. With limits we have six.
 ; CHECK: LSR on loop %bb221:
 ; CHECK: After generating reuse formulae:
 ; CHECK: LSR is examining the following uses:
 ; CHECK: LSR Use: Kind=Special
-; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
-; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
-; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
-; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
-; CHECK: {{.*reg\(\{\{\{\{\{\{\{\{\{}}
+; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}}
+; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}}
+; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}}
+; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}}
+; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}}
+; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}}
 ; CHECK-NOT:reg
 ; CHECK: Filtering for use
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/test/Transforms/LoopStrengthReduce/2013-01-05-IndBr.ll b/test/Transforms/LoopStrengthReduce/2013-01-05-IndBr.ll
new file mode 100644
index 0000000000..bce234cd40
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/2013-01-05-IndBr.ll
@@ -0,0 +1,44 @@
+; RUN: opt -loop-reduce -S < %s | FileCheck %s
+;
+; Indirect branch in the preheader crashes replaceCongruentIVs.
+; rdar://12910141
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+
+; CHECK: @test
+; CHECK: bb8:
+; CHECK-NEXT: phi i8
+; CHECK-NEXT: phi i8
+; CHECK: ret void
+define void @test() nounwind ssp {
+bb:
+  br label %bb190
+
+bb8:                                              ; preds = %bb190, %bb11
+  %tmp = phi i8 [ %tmp14, %bb11 ], [ 25, %bb190 ]
+  %tmp9 = phi i8 [ %tmp12, %bb11 ], [ 25, %bb190 ]
+  %tmp10 = add i8 %tmp, -5
+  indirectbr i8* undef, [label %bb11, label %bb15]
+
+bb11:                                             ; preds = %bb8
+  %tmp12 = add i8 %tmp9, 1
+  %tmp13 = add i8 %tmp9, -19
+  %tmp14 = add i8 %tmp, 1
+  indirectbr i8* undef, [label %bb8]
+
+bb15:                                             ; preds = %bb8
+  indirectbr i8* undef, [label %bb16]
+
+bb16:                                             ; preds = %bb15
+  indirectbr i8* undef, [label %bb37, label %bb190]
+
+
+bb37:                                             ; preds = %bb190, %bb16
+  indirectbr i8* undef, [label %bb38]
+
+bb38:                                             ; preds = %bb37
+  ret void
+
+bb190:                                            ; preds = %bb16, %bb
+  indirectbr i8* undef, [label %bb37, label %bb8]
+}
diff --git a/test/Transforms/LoopStrengthReduce/2008-08-14-ShadowIV.ll b/test/Transforms/LoopStrengthReduce/X86/2008-08-14-ShadowIV.ll
index c650d8cf76..9a7f4865c5 100644
--- a/test/Transforms/LoopStrengthReduce/2008-08-14-ShadowIV.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/2008-08-14-ShadowIV.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | grep "phi double" | count 1
+; RUN: opt < %s -loop-reduce -S -mtriple=x86_64-unknown-unknown | grep "phi double" | count 1
 
 define void @foobar(i32 %n) nounwind {
 entry:
diff --git a/test/Transforms/LoopStrengthReduce/2011-07-20-DoubleIV.ll b/test/Transforms/LoopStrengthReduce/X86/2011-07-20-DoubleIV.ll
index 5d9ed64ef4..a932b47925 100644
--- a/test/Transforms/LoopStrengthReduce/2011-07-20-DoubleIV.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/2011-07-20-DoubleIV.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | FileCheck %s
+; RUN: opt < %s -loop-reduce -S -mtriple=x86_64-unknown-unknown | FileCheck %s
 ;
 ; Test LSR's OptimizeShadowIV. Handle a floating-point IV with a
 ; nonzero initial value.
diff --git a/test/Transforms/LoopStrengthReduce/dominate-assert.ll b/test/Transforms/LoopStrengthReduce/dominate-assert.ll
index b87bf620de..ff8cab8313 100644
--- a/test/Transforms/LoopStrengthReduce/dominate-assert.ll
+++ b/test/Transforms/LoopStrengthReduce/dominate-assert.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-reduce %s
+; RUN: opt -loop-reduce < %s
 ; we used to crash on this one
 
 declare i8* @_Znwm()
diff --git a/test/Transforms/LoopStrengthReduce/exit_compare_live_range.ll b/test/Transforms/LoopStrengthReduce/exit_compare_live_range.ll
index ad4959be34..498be1a9a1 100644
--- a/test/Transforms/LoopStrengthReduce/exit_compare_live_range.ll
+++ b/test/Transforms/LoopStrengthReduce/exit_compare_live_range.ll
@@ -2,7 +2,7 @@
 ; having overlapping live ranges that result in copies.  We want the setcc 
 ; instruction immediately before the conditional branch.
 ;
-; RUN: opt -S -loop-reduce %s | FileCheck %s
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
 
 define void @foo(float* %D, i32 %E) {
 entry:
diff --git a/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
index 96904c66e6..9e02d92a6f 100644
--- a/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
+++ b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
@@ -4,12 +4,12 @@
 ; LSR should properly handle the post-inc offset when folding the
 ; non-IV operand of an icmp into the IV.
 
-; CHECK:   %4 = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
-; CHECK:   %5 = lshr i64 %4, 1
-; CHECK:   %6 = mul i64 %5, 2
+; CHECK:   %3 = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+; CHECK:   %4 = lshr i64 %3, 1
+; CHECK:   %5 = mul i64 %4, 2
 ; CHECK:   br label %for.body
 ; CHECK: for.body:
-; CHECK:   %lsr.iv2 = phi i64 [ %lsr.iv.next, %for.body ], [ %6, %for.body.lr.ph ]
+; CHECK:   %lsr.iv2 = phi i64 [ %lsr.iv.next, %for.body ], [ %5, %for.body.lr.ph ]
 ; CHECK:   %lsr.iv.next = add i64 %lsr.iv2, -2
 ; CHECK:   %lsr.iv.next3 = inttoptr i64 %lsr.iv.next to i16*
 ; CHECK:   %cmp27 = icmp eq i16* %lsr.iv.next3, null
diff --git a/test/Transforms/LoopUnroll/basic.ll b/test/Transforms/LoopUnroll/basic.ll
index eeb3e9a57b..ab5bc568ed 100644
--- a/test/Transforms/LoopUnroll/basic.ll
+++ b/test/Transforms/LoopUnroll/basic.ll
@@ -22,3 +22,26 @@ l1:                                               ; preds = %l1, %entry
 l2:                                               ; preds = %l1
   ret i32 0
 }
+
+; This should not unroll since the call is 'noduplicate'.
+
+; CHECK: @test2
+define i32 @test2(i8** %P) nounwind ssp {
+entry:
+  br label %l1
+
+l1:                                               ; preds = %l1, %entry
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l1 ]
+; CHECK: call void @f()
+; CHECK-NOT: call void @f()
+  call void @f() noduplicate
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 3
+  br i1 %exitcond, label %l2, label %l1
+
+l2:                                               ; preds = %l1
+  ret i32 0
+; CHECK: }
+}
+
+declare void @f()
diff --git a/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll b/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll
index c1fd588106..59a8236f13 100644
--- a/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll
+++ b/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -loop-unswitch -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s
-; RUN: opt -S -loop-unswitch -verify-loop-info -verify-dom-info %s | FileCheck %s
+; RUN: opt -S -loop-unswitch -verify-loop-info -verify-dom-info < %s | FileCheck %s
 
 ; STATS: 1 loop-simplify - Number of pre-header or exit blocks inserted
 ; STATS: 2 loop-unswitch - Number of switches unswitched
diff --git a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll
index f3db471199..67982feb7e 100644
--- a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll
+++ b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -loop-unswitch -loop-unswitch-threshold 13 -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s
-; RUN: opt -S -loop-unswitch -loop-unswitch-threshold 13 -verify-loop-info -verify-dom-info %s | FileCheck %s
+; RUN: opt -S -loop-unswitch -loop-unswitch-threshold 13 -verify-loop-info -verify-dom-info < %s | FileCheck %s
 
 ; STATS: 1 loop-simplify - Number of pre-header or exit blocks inserted
 ; STATS: 1 loop-unswitch - Number of switches unswitched
diff --git a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll
index 270899642f..36b7effb90 100644
--- a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll
+++ b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -loop-unswitch -loop-unswitch-threshold 1000 -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s
-; RUN: opt -S -loop-unswitch -loop-unswitch-threshold 1000 -verify-loop-info -verify-dom-info %s | FileCheck %s
+; RUN: opt -S -loop-unswitch -loop-unswitch-threshold 1000 -verify-loop-info -verify-dom-info < %s | FileCheck %s
 
 ; STATS: 1 loop-simplify - Number of pre-header or exit blocks inserted
 ; STATS: 3 loop-unswitch - Number of switches unswitched
diff --git a/test/Transforms/LoopUnswitch/basictest.ll b/test/Transforms/LoopUnswitch/basictest.ll
index 1e6f2cf15e..e98d82b652 100644
--- a/test/Transforms/LoopUnswitch/basictest.ll
+++ b/test/Transforms/LoopUnswitch/basictest.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-unswitch -disable-output
+; RUN: opt -loop-unswitch -verify-loop-info -S < %s 2>&1 | FileCheck %s
 
 define i32 @test(i32* %A, i1 %C) {
 entry:
@@ -29,3 +29,40 @@ return:		; preds = %endif, %then
 	ret i32 %tmp.13
 }
 
+; This simple test would normally unswitch, but should be inhibited by the presence of
+; the noduplicate call.
+
+; CHECK: @test2
+define i32 @test2(i32* %var) {
+  %mem = alloca i32
+  store i32 2, i32* %mem
+  %c = load i32* %mem
+
+  br label %loop_begin
+
+loop_begin:
+
+  %var_val = load i32* %var
+
+  switch i32 %c, label %default [
+      i32 1, label %inc
+      i32 2, label %dec
+  ]
+
+inc:
+  call void @incf() noreturn nounwind
+  br label %loop_begin
+dec:
+; CHECK: call void @decf()
+; CHECK-NOT: call void @decf()
+  call void @decf() noreturn nounwind noduplicate
+  br label %loop_begin
+default:
+  br label %loop_exit
+loop_exit:
+  ret i32 0
+; CHECK: }
+}
+
+declare void @incf() noreturn
+declare void @decf() noreturn
diff --git a/test/Transforms/LoopUnswitch/preserve-analyses.ll b/test/Transforms/LoopUnswitch/preserve-analyses.ll
index 668f8ecaf8..f79612bef5 100644
--- a/test/Transforms/LoopUnswitch/preserve-analyses.ll
+++ b/test/Transforms/LoopUnswitch/preserve-analyses.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-unswitch -verify-loop-info -verify-dom-info %s -disable-output
+; RUN: opt -loop-unswitch -verify-loop-info -verify-dom-info -disable-output < %s
 
 ; Loop unswitch should be able to unswitch these loops and
 ; preserve LCSSA and LoopSimplify forms.
diff --git a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
new file mode 100644
index 0000000000..2dd7fe34a7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -0,0 +1,44 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @foo
+;CHECK: icmp eq <4 x i32>
+;CHECK: select <4 x i1>
+;CHECK: ret i32
+define i32 @foo(i32 %x, i32 %t, i32* nocapture %A) nounwind uwtable ssp {
+entry:
+  %cmp10 = icmp sgt i32 %x, 0
+  br i1 %cmp10, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %if.end
+  %indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4, !tbaa !0
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %1 = add nsw i64 %indvars.iv, 45
+  %2 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %2, %t
+  %3 = trunc i64 %1 to i32
+  %add1 = add nsw i32 %3, %mul
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ]
+  store i32 %z.0, i32* %arrayidx, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %x
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %if.end, %entry
+  ret i32 undef
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
index 0176c9a189..aa7cc0ee32 100644
--- a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
+++ b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce
 
 ; Check that we don't fall into an infinite loop.
 define void @test() nounwind {
@@ -25,3 +25,47 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
  unreachable
 }
+
+;PR14701
+define void @start_model_rare() nounwind uwtable ssp {
+entry:
+  br i1 undef, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %cond.false, label %cond.true
+
+cond.true:                                        ; preds = %if.end
+  unreachable
+
+cond.false:                                       ; preds = %if.end
+  br i1 undef, label %cond.false28, label %cond.true20
+
+cond.true20:                                      ; preds = %cond.false
+  unreachable
+
+cond.false28:                                     ; preds = %cond.false
+  br label %for.body40
+
+for.body40:                                       ; preds = %for.inc50, %cond.false28
+  %indvars.iv123 = phi i64 [ 3, %cond.false28 ], [ %indvars.iv.next124, %for.inc50 ]
+  %step.0121 = phi i32 [ 1, %cond.false28 ], [ %step.1, %for.inc50 ]
+  br i1 undef, label %if.then46, label %for.inc50
+
+if.then46:                                        ; preds = %for.body40
+  %inc47 = add nsw i32 %step.0121, 1
+  br label %for.inc50
+
+for.inc50:                                        ; preds = %if.then46, %for.body40
+  %k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ]
+  %step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ]
+  %indvars.iv.next124 = add i64 %indvars.iv123, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next124 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %for.end52, label %for.body40
+
+for.end52:                                        ; preds = %for.inc50
+  unreachable
+
+return:                                           ; preds = %entry
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
index 2516e248bc..405582c408 100644
--- a/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
+++ b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -dce -force-vector-width=4 
+; RUN: opt < %s  -loop-vectorize -dce -force-vector-unroll=1 -force-vector-width=4 
 
 ; Check that we don't crash.
 
diff --git a/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
new file mode 100644
index 0000000000..c8d307f5d4
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK: @foo
+;CHECK: load <4 x i32>
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret
+;SWIFT: @foo
+;SWIFT: load <4 x i32>
+;SWIFT: load <4 x i32>
+;SWIFT: ret
+define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i32 %i.02
+  %3 = load i32* %2, align 4
+  %4 = add nsw i32 %3, %sum.01
+  %5 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %5, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
new file mode 100644
index 0000000000..6a68e81bca
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 4;
+;CHECK: @example1
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+;CHECK: @example10b
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+  %3 = load i16* %2, align 2
+  %4 = sext i16 %3 to i32
+  %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+  store i32 %4, i32* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %6, label %1
+
+; <label>:6                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
new file mode 100644
index 0000000000..cb77b09ef4
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'ARM' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/LoopVectorize/ARM/width-detect.ll b/test/Transforms/LoopVectorize/ARM/width-detect.ll
new file mode 100644
index 0000000000..c0795b6a79
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/width-detect.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK:foo_F64
+;CHECK: <2 x double>
+;CHECK:ret
+define double @foo_F64(double* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %prod.01 = phi double [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+  %2 = getelementptr inbounds double* %A, i64 %indvars.iv
+  %3 = load double* %2, align 8
+  %4 = fmul fast double %prod.01, %3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %prod.0.lcssa = phi double [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+  ret double %prod.0.lcssa
+}
+
+;CHECK:foo_I8
+;CHECK: xor <16 x i8>
+;CHECK:ret
+define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i8* %A, i64 %indvars.iv
+  %3 = load i8* %2, align 1
+  %4 = xor i8 %3, %red.01
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i8 %red.0.lcssa
+}
+
+
diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll
index a2d176a534..a85c6fe0d5 100644
--- a/test/Transforms/LoopVectorize/X86/avx1.ll
+++ b/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -27,7 +27,7 @@ define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwta
 
 
 ;CHECK: @read_mod_i64
-;CHECK: load <8 x i64>
+;CHECK: load <4 x i64>
 ;CHECK: ret i32
 define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 8f1bb545fa..23d9233544 100644
--- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 ;CHECK: @conversion_cost1
-;CHECK: store <2 x i8>
+;CHECK: store <32 x i8>
 ;CHECK: ret
 define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 3
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
index 628f9912c8..b7f479acf9 100644
--- a/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -8,8 +8,11 @@ target triple = "x86_64-apple-macosx10.8.0"
 @d = common global [2048 x i32] zeroinitializer, align 16
 @a = common global [2048 x i32] zeroinitializer, align 16
 
+; The program below gathers and scatters data. We had better not vectorize it.
 ;CHECK: cost_model_1
-;CHECK: <4 x i32>
+;CHECK-NOT: <2 x i32>
+;CHECK-NOT: <4 x i32>
+;CHECK-NOT: <8 x i32>
 ;CHECK: ret void
 define void @cost_model_1() nounwind uwtable noinline ssp {
 entry:
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index 574c529834..d2d0eac305 100644
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -9,10 +10,19 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ; Select VF = 8;
 ;CHECK: @example1
-;CHECK: load <8 x i32>
-;CHECK: add nsw <8 x i32>
-;CHECK: store <8 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
 ;CHECK: ret void
+
+;UNROLL: @example1
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example1() nounwind uwtable ssp {
   br label %1
 
@@ -34,13 +44,18 @@ define void @example1() nounwind uwtable ssp {
   ret void
 }
 
-
-; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. 
+; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
 ;CHECK: @example10b
 ;CHECK: load <4 x i16>
 ;CHECK: sext <4 x i16>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example10b
+;UNROLL: load <4 x i16>
+;UNROLL: load <4 x i16>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
   br label %1
 
diff --git a/test/Transforms/LoopVectorize/X86/no-vector.ll b/test/Transforms/LoopVectorize/X86/no-vector.ll
new file mode 100644
index 0000000000..692eec9895
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/no-vector.ll
@@ -0,0 +1,22 @@
+; RUN: opt -S -mtriple=i386-unknown-freebsd -mcpu=i486 -loop-vectorize < %s
+
+define i32 @PR14639(i8* nocapture %s, i32 %len) nounwind {
+entry:
+  %cmp4 = icmp sgt i32 %len, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %r.05 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8* %s, i32 %i.06
+  %0 = load i8* %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %xor = xor i32 %conv, %r.05
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %len
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %r.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
+  ret i32 %r.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/X86/struct-store.ll b/test/Transforms/LoopVectorize/X86/struct-store.ll
new file mode 100644
index 0000000000..a995e43a5a
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/struct-store.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S
+
+; Make sure we are not crashing on this one.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@glbl = external global [16 x { i64, i64 }], align 16
+
+declare void @fn()
+
+define void @test() {
+entry:
+  br label %loop
+
+loop:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
+  %tmp = getelementptr inbounds [16 x { i64, i64 }]* @glbl, i64 0, i64 %indvars.iv
+  store { i64, i64 } { i64 ptrtoint (void ()* @fn to i64), i64 0 }, { i64, i64 }* %tmp, align 16
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 16
+  br i1 %exitcond, label %loop, label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
new file mode 100644
index 0000000000..207598636c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK: @foo
+;CHECK: load <4 x i32>
+;CHECK-NOT: load <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK-NOT: store <4 x i32>
+;CHECK: ret
+define i32 @foo(i32* nocapture %A) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 100
+  br i1 %exitcond, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 undef
+}
+
+;CHECK: @bar
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/calloc.ll b/test/Transforms/LoopVectorize/calloc.ll
new file mode 100644
index 0000000000..08c84eff5d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/calloc.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+;CHECK: hexit
+;CHECK: zext <4 x i8>
+;CHECK: ret
+
+define noalias i8* @hexit(i8* nocapture %bytes, i64 %length) nounwind uwtable ssp {
+entry:
+  %shl = shl i64 %length, 1
+  %add28 = or i64 %shl, 1
+  %call = tail call i8* @calloc(i64 1, i64 %add28) nounwind
+  %cmp29 = icmp eq i64 %shl, 0
+  br i1 %cmp29, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = shl i64 %length, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %i.030 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %shr = lshr i64 %i.030, 1
+  %arrayidx = getelementptr inbounds i8* %bytes, i64 %shr
+  %1 = load i8* %arrayidx, align 1, !tbaa !0
+  %conv = zext i8 %1 to i32
+  %and = shl i64 %i.030, 2
+  %neg = and i64 %and, 4
+  %and3 = xor i64 %neg, 4
+  %sh_prom = trunc i64 %and3 to i32
+  %shl4 = shl i32 15, %sh_prom
+  %and5 = and i32 %conv, %shl4
+  %shr11 = lshr i32 %and5, %sh_prom
+  %conv13 = and i32 %shr11, 254
+  %cmp15 = icmp ugt i32 %conv13, 9
+  %cond = select i1 %cmp15, i32 87, i32 48
+  %add17 = add nsw i32 %cond, %shr11
+  %conv18 = trunc i32 %add17 to i8
+  %arrayidx19 = getelementptr inbounds i8* %call, i64 %i.030
+  store i8 %conv18, i8* %arrayidx19, align 1, !tbaa !0
+  %inc = add i64 %i.030, 1
+  %exitcond = icmp eq i64 %inc, %0
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i8* %call
+}
+
+declare noalias i8* @calloc(i64, i64) nounwind
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/cast-induction.ll b/test/Transforms/LoopVectorize/cast-induction.ll
new file mode 100644
index 0000000000..2aa29ed2c8
--- /dev/null
+++ b/test/Transforms/LoopVectorize/cast-induction.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; rdar://problem/12848162
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK: @example12
+;CHECK: trunc i64
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example12() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = trunc i64 %indvars.iv to i32
+  store i32 %3, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %4, label %1
+
+; <label>:4                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/cpp-new-array.ll b/test/Transforms/LoopVectorize/cpp-new-array.ll
index 26902eba9e..da0fb05fe8 100644
--- a/test/Transforms/LoopVectorize/cpp-new-array.ll
+++ b/test/Transforms/LoopVectorize/cpp-new-array.ll
@@ -1,10 +1,10 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
 ;CHECK: @cpp_new_arrays
-;CHECK: insertelement <4 x i32>
+;CHECK: sext i32
 ;CHECK: load <4 x float>
 ;CHECK: fadd <4 x float>
 ;CHECK: ret i32
diff --git a/test/Transforms/LoopVectorize/flags.ll b/test/Transforms/LoopVectorize/flags.ll
index 2f22a76457..656912e178 100644
--- a/test/Transforms/LoopVectorize/flags.ll
+++ b/test/Transforms/LoopVectorize/flags.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/float-reduction.ll b/test/Transforms/LoopVectorize/float-reduction.ll
new file mode 100644
index 0000000000..565684cccb
--- /dev/null
+++ b/test/Transforms/LoopVectorize/float-reduction.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK: @foo
+;CHECK: fadd <4 x float>
+;CHECK: ret
+define float @foo(float* nocapture %A, i32* nocapture %n) nounwind uwtable readonly ssp { ; returns the sum of A[0..199]; %n is never used
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.04 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] ; running sum, starts at 0.0
+  %arrayidx = getelementptr inbounds float* %A, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4, !tbaa !0
+  %add = fadd fast float %sum.04, %0 ; NOTE(review): presumably the 'fast' flag is what lets this FP sum be vectorized - confirm
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 200 ; fixed trip count of 200
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret float %add
+}
+
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll
index f1bf6cb6d8..f335557c00 100644
--- a/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -24,6 +25,20 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example1
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example1() nounwind uwtable ssp {
   br label %1
 
@@ -48,6 +63,12 @@ define void @example1() nounwind uwtable ssp {
 ;CHECK: @example2
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example2
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph5, label %.preheader
@@ -92,6 +113,12 @@ define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
 ;CHECK: @example3
 ;CHECK: <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example3
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: ret void
 define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
   %1 = icmp eq i32 %n, 0
   br i1 %1, label %._crit_edge, label %.lr.ph
@@ -115,6 +142,12 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
 ;CHECK: @example4
 ;CHECK: load <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example4
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: ret void
 define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
   %1 = add nsw i32 %n, -1
   %2 = icmp eq i32 %n, 0
@@ -175,6 +208,12 @@ define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
 ;CHECK: @example8
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
+;UNROLL: @example8
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
 define void @example8(i32 %x) nounwind uwtable ssp {
   br label %.preheader
 
@@ -329,7 +368,7 @@ define void @example11() nounwind uwtable ssp {
 }
 
 ;CHECK: @example12
-;CHECK: trunc <4 x i64>
+;CHECK: trunc i64
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
 define void @example12() nounwind uwtable ssp {
@@ -537,7 +576,8 @@ define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocaptu
 }
 
 ;CHECK: @example21
-;CHECK: <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: shufflevector {{.*}} <i32 3, i32 2, i32 1, i32 0>
 ;CHECK: ret i32
 define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp {
   %1 = icmp sgt i32 %n, 0
diff --git a/test/Transforms/LoopVectorize/i8-induction.ll b/test/Transforms/LoopVectorize/i8-induction.ll
new file mode 100644
index 0000000000..7759b7085a
--- /dev/null
+++ b/test/Transforms/LoopVectorize/i8-induction.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global i8 0, align 1
+@b = common global i8 0, align 1
+
+define void @f() nounwind uwtable ssp { ; this file's opt invocation has no FileCheck stage, so only successful completion is tested
+scalar.ph:
+  store i8 0, i8* inttoptr (i64 1 to i8*), align 1, !tbaa !0 ; store through the constant address 1
+  %0 = load i8* @a, align 1, !tbaa !0 ; loop-invariant value of @a, used by the select below
+  br label %for.body
+
+for.body:
+  %mul16 = phi i8 [ 0, %scalar.ph ], [ %mul, %for.body ]              ; <------- i8 induction var.
+  %c.015 = phi i8 [ undef, %scalar.ph ], [ %conv8, %for.body ] ; i8 counter: undef on entry, then trunc(sext(c) + 1)
+  %conv2 = sext i8 %c.015 to i32
+  %tobool = icmp ne i8 %c.015, 0
+  %.sink = select i1 %tobool, i8 %c.015, i8 %0 ; c when c != 0, otherwise the value loaded from @a
+  %mul = mul i8 %mul16, %.sink ; running i8 product; stored to @b after the loop
+  %add = add nsw i32 %conv2, 1
+  %conv8 = trunc i32 %add to i8
+  %sext = shl i32 %add, 24
+  %phitmp14 = icmp slt i32 %sext, 268435456 ; keep looping while (add << 24) is signed-less-than 268435456 (16 << 24)
+  br i1 %phitmp14, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  store i8 %mul, i8* @b, align 1, !tbaa !0
+  ret void
+}
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA"}
+
diff --git a/test/Transforms/LoopVectorize/if-conv-crash.ll b/test/Transforms/LoopVectorize/if-conv-crash.ll
new file mode 100644
index 0000000000..3283456aa3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/if-conv-crash.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define fastcc void @DD_dump() nounwind uwtable ssp { ; reduced reproducer: every branch/switch condition is undef; this file's opt invocation has no FileCheck stage
+entry:
+  br i1 undef, label %lor.lhs.false, label %if.end25
+
+lor.lhs.false:                                    ; preds = %entry
+  br i1 undef, label %if.end21, label %if.else
+
+if.else:                                          ; preds = %lor.lhs.false
+  br i1 undef, label %num_q.exit, label %while.body.i.preheader
+
+while.body.i.preheader:                           ; preds = %if.else
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %if.end.i, %while.body.i.preheader
+  switch i8 undef, label %if.end.i [ ; both cases go to if.then.i, so that block lists while.body.i twice as a predecessor
+    i8 39, label %if.then.i
+    i8 92, label %if.then.i
+  ]
+
+if.then.i:                                        ; preds = %while.body.i, %while.body.i
+  br label %if.end.i
+
+if.end.i:                                         ; preds = %if.then.i, %while.body.i
+  br i1 undef, label %num_q.exit, label %while.body.i ; loop latch: back-edge to while.body.i or exit to num_q.exit
+
+num_q.exit:                                       ; preds = %if.end.i, %if.else
+  unreachable
+
+if.end21:                                         ; preds = %lor.lhs.false
+  unreachable
+
+if.end25:                                         ; preds = %entry
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/if-conversion-reduction.ll b/test/Transforms/LoopVectorize/if-conversion-reduction.ll
index bacf9c00d0..3a2d82e15d 100644
--- a/test/Transforms/LoopVectorize/if-conversion-reduction.ll
+++ b/test/Transforms/LoopVectorize/if-conversion-reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -enable-if-conversion -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll
index b4701b9655..6e7c03a556 100644
--- a/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/test/Transforms/LoopVectorize/if-conversion.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -enable-if-conversion -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll
index 71ea7689fc..3fa6b19ca9 100644
--- a/test/Transforms/LoopVectorize/increment.ll
+++ b/test/Transforms/LoopVectorize/increment.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll
index b31bceb50d..96595cdc16 100644
--- a/test/Transforms/LoopVectorize/induction_plus.ll
+++ b/test/Transforms/LoopVectorize/induction_plus.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -6,8 +6,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 @array = common global [1024 x i32] zeroinitializer, align 16
 
 ;CHECK: @array_at_plus_one
-;CHECK: add <4 x i64>
-;CHECK: trunc <4 x i64>
+;CHECK: trunc i64
 ;CHECK: add i64 %index, 12
 ;CHECK: ret i32
 define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll
index 54e3c69fe1..7d5a5d706b 100644
--- a/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/test/Transforms/LoopVectorize/intrinsic.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -788,6 +788,66 @@ for.end:                                          ; preds = %for.body, %entry
 
 declare double @llvm.fma.f64(double, double, double) nounwind readnone
 
+;CHECK: @fmuladd_f32
+;CHECK: llvm.fmuladd.v4f32
+;CHECK: ret void
+define void @fmuladd_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z, float* noalias %w) nounwind uwtable { ; x[i] = fmuladd(y[i], z[i], w[i]) for i in [0, n)
+entry:
+  %cmp12 = icmp sgt i32 %n, 0 ; guard: the loop is entered only when n > 0
+  br i1 %cmp12, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float* %y, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4, !tbaa !0 ; y[i]
+  %arrayidx2 = getelementptr inbounds float* %w, i64 %indvars.iv
+  %1 = load float* %arrayidx2, align 4, !tbaa !0 ; w[i]
+  %arrayidx4 = getelementptr inbounds float* %z, i64 %indvars.iv
+  %2 = load float* %arrayidx4, align 4, !tbaa !0 ; z[i]
+  %3 = tail call float @llvm.fmuladd.f32(float %0, float %2, float %1) ; operand order is y[i], z[i], w[i]
+  %arrayidx6 = getelementptr inbounds float* %x, i64 %indvars.iv
+  store float %3, float* %arrayidx6, align 4, !tbaa !0 ; x[i] = result
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once (i32)(i + 1) == n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) nounwind readnone
+
+;CHECK: @fmuladd_f64
+;CHECK: llvm.fmuladd.v4f64
+;CHECK: ret void
+define void @fmuladd_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z, double* noalias %w) nounwind uwtable { ; x[i] = fmuladd(y[i], z[i], w[i]) for i in [0, n), double precision
+entry:
+  %cmp12 = icmp sgt i32 %n, 0 ; guard: the loop is entered only when n > 0
+  br i1 %cmp12, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8, !tbaa !3 ; y[i]
+  %arrayidx2 = getelementptr inbounds double* %w, i64 %indvars.iv
+  %1 = load double* %arrayidx2, align 8, !tbaa !3 ; w[i]
+  %arrayidx4 = getelementptr inbounds double* %z, i64 %indvars.iv
+  %2 = load double* %arrayidx4, align 8, !tbaa !3 ; z[i]
+  %3 = tail call double @llvm.fmuladd.f64(double %0, double %2, double %1) ; operand order is y[i], z[i], w[i]
+  %arrayidx6 = getelementptr inbounds double* %x, i64 %indvars.iv
+  store double %3, double* %arrayidx6, align 8, !tbaa !3 ; x[i] = result
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once (i32)(i + 1) == n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.fmuladd.f64(double, double, double) nounwind readnone
+
 ;CHECK: @pow_f32
 ;CHECK: llvm.pow.v4f32
 ;CHECK: ret void
diff --git a/test/Transforms/LoopVectorize/lcssa-crash.ll b/test/Transforms/LoopVectorize/lcssa-crash.ll
new file mode 100644
index 0000000000..06b3b08aa0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/lcssa-crash.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%type1 = type { %type2 }
+%type2 = type { [0 x i8*], i8**, i32, i32, i32 }
+
+define void @test() nounwind uwtable align 2 { ; reduced reproducer; this file's opt invocation has no FileCheck stage
+  br label %for.body.lr.ph.i.i.i
+
+for.body.lr.ph.i.i.i:
+  br label %for.body.i.i.i
+
+for.body.i.i.i:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc.i.i.i ], [ 0, %for.body.lr.ph.i.i.i ] ; i64 loop index, starts at 0
+  br label %for.inc.i.i.i
+
+for.inc.i.i.i:
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, undef ; the loop bound is undef
+  br i1 %exitcond, label %for.body.i.i.i, label %for.end.i.i.i
+
+for.end.i.i.i:
+  %lcssa = phi %type1* [ undef, %for.inc.i.i.i ] ; single-entry phi of pointer type in the exit block, fed undef from the latch
+  unreachable
+}
+
diff --git a/test/Transforms/LoopVectorize/no_int_induction.ll b/test/Transforms/LoopVectorize/no_int_induction.ll
index 516fd1de07..45aa8c7cd9 100644
--- a/test/Transforms/LoopVectorize/no_int_induction.ll
+++ b/test/Transforms/LoopVectorize/no_int_induction.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 ; int __attribute__((noinline)) sum_array(int *A, int n) {
 ;  return std::accumulate(A, A + n, 0);
diff --git a/test/Transforms/LoopVectorize/nofloat.ll b/test/Transforms/LoopVectorize/nofloat.ll
new file mode 100644
index 0000000000..de23bf02b6
--- /dev/null
+++ b/test/Transforms/LoopVectorize/nofloat.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; Make sure that we don't vectorize functions with 'noimplicitfloat' attributes.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK: @example12
+;CHECK-NOT: store <4 x i32>
+;CHECK: ret void
+define void @example12() noimplicitfloat { ;           <--------- "noimplicitfloat" attribute here!
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] ; i64 loop index, starts at 0
+  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv ; address of a[i]
+  %3 = trunc i64 %indvars.iv to i32
+  store i32 %3, i32* %2, align 4 ; a[i] = (i32) i; the directives before the function require that this store stays scalar
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024 ; leave the loop once (i32)(i + 1) == 1024
+  br i1 %exitcond, label %4, label %1
+
+; <label>:4                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll
index 1a6c15ed96..8262a18f18 100644
--- a/test/Transforms/LoopVectorize/non-const-n.ll
+++ b/test/Transforms/LoopVectorize/non-const-n.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/nsw-crash.ll b/test/Transforms/LoopVectorize/nsw-crash.ll
new file mode 100644
index 0000000000..e5fad14d0d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/nsw-crash.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4
+
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+define void @test() { ; reduced reproducer; this file's opt invocation has no FileCheck stage
+entry:
+  br i1 undef, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+  br label %while.body
+
+while.body:
+  %it.sroa.0.091 = phi i32* [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] ; pointer that advances by one i32 per iteration; start value is undef
+  %incdec.ptr.i = getelementptr inbounds i32* %it.sroa.0.091, i64 1
+  %inc32 = add i32 undef, 1                                        ; <------------- Make sure we don't try to set NSW flags on the undef.
+  %cmp.i11 = icmp eq i32* %incdec.ptr.i, undef ; leave the loop when the advanced pointer equals undef
+  br i1 %cmp.i11, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+
diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll
index b4d1bac132..bfaa6d452b 100644
--- a/test/Transforms/LoopVectorize/read-only.ll
+++ b/test/Transforms/LoopVectorize/read-only.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll
index c1848b35fc..08b7b27e42 100644
--- a/test/Transforms/LoopVectorize/reduction.ll
+++ b/test/Transforms/LoopVectorize/reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -7,6 +7,11 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
 ;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 ;CHECK: ret i32
 define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
   %1 = icmp sgt i32 %n, 0
@@ -37,6 +42,11 @@ define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
 ;CHECK: mul <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: mul <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: mul <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 ;CHECK: ret i32
 define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
   %1 = icmp sgt i32 %n, 0
@@ -67,6 +77,11 @@ define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocap
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
 ;CHECK: mul nsw <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 ;CHECK: ret i32
 define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
   %1 = icmp sgt i32 %n, 0
@@ -95,6 +110,11 @@ define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
 
 ;CHECK: @reduction_mul
 ;CHECK: mul <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: mul <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: mul <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 ;CHECK: ret i32
 define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
   %1 = icmp sgt i32 %n, 0
@@ -124,6 +144,11 @@ define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
 ;CHECK: @start_at_non_zero
 ;CHECK: phi <4 x i32>
 ;CHECK: <i32 120, i32 0, i32 0, i32 0>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 ;CHECK: ret i32
 define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
 entry:
@@ -152,6 +177,11 @@ for.end:                                          ; preds = %for.body, %entry
 ;CHECK: @reduction_and
 ;CHECK: and <4 x i32>
 ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: and <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: and <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 ;CHECK: ret i32
 define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
 entry:
@@ -179,6 +209,11 @@ for.end:                                          ; preds = %for.body, %entry
 
 ;CHECK: @reduction_or
 ;CHECK: or <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: or <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: or <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 ;CHECK: ret i32
 define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
 entry:
@@ -206,6 +241,11 @@ for.end:                                          ; preds = %for.body, %entry
 
 ;CHECK: @reduction_xor
 ;CHECK: xor <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: xor <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: xor <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
 ;CHECK: ret i32
 define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
 entry:
@@ -230,3 +270,56 @@ for.end:                                          ; preds = %for.body, %entry
   %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
   ret i32 %result.0.lcssa
 }
+
+; In this code the accumulator is on the RHS of the subtraction (x = A[i] - x), so this is not a reduction variable.
+;CHECK: @reduction_sub_rhs
+;CHECK-NOT: phi <4 x i32>
+;CHECK-NOT: sub nsw <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly { ; x = A[i] - x for i in [0, n); returns the final x, or 0 when n <= 0
+entry:
+  %cmp4 = icmp sgt i32 %n, 0 ; guard: the loop is entered only when n > 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ] ; accumulator x, starts at 0
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4 ; A[i]
+  %sub = sub nsw i32 %0, %x.05 ; the accumulator is the second (right-hand) operand here
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once (i32)(i + 1) == n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+  ret i32 %x.0.lcssa
+}
+
+
+; In this test the reduction variable is on the LHS and we can vectorize it.
+;CHECK: @reduction_sub_lhs
+;CHECK: phi <4 x i32>
+;CHECK: sub nsw <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly { ; x = x - A[i] for i in [0, n); returns the final x, or 0 when n <= 0
+entry:
+  %cmp4 = icmp sgt i32 %n, 0 ; guard: the loop is entered only when n > 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ] ; accumulator x, starts at 0
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4 ; A[i]
+  %sub = sub nsw i32 %x.05, %0 ; the accumulator is the first (left-hand) operand here
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave the loop once (i32)(i + 1) == n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+  ret i32 %x.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
index 23933cf7c7..574d74d113 100644
--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
diff --git a/test/Transforms/LoopVectorize/same-base-access.ll b/test/Transforms/LoopVectorize/same-base-access.ll
new file mode 100644
index 0000000000..1573893645
--- /dev/null
+++ b/test/Transforms/LoopVectorize/same-base-access.ll
@@ -0,0 +1,110 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; This is kernel11 from "LivermoreLoops". We can't vectorize it because we
+; access both x[k] and x[k-1].
+;
+; void kernel11(double *x, double *y, int n) {
+;   for ( int k=1 ; k<n ; k++ )
+;     x[k] = x[k-1] + y[k];
+; }
+
+; CHECK: @kernel11
+; CHECK-NOT: <4 x double>
+; CHECK: ret
+define i32 @kernel11(double* %x, double* %y, i32 %n) nounwind uwtable ssp {
+  %1 = alloca double*, align 8                    ; stack slot holding x
+  %2 = alloca double*, align 8                    ; stack slot holding y
+  %3 = alloca i32, align 4                        ; stack slot holding n
+  %k = alloca i32, align 4                        ; loop counter k lives in memory
+  store double* %x, double** %1, align 8
+  store double* %y, double** %2, align 8
+  store i32 %n, i32* %3, align 4
+  store i32 1, i32* %k, align 4                   ; k = 1
+  br label %4
+
+; <label>:4                                       ; preds = %25, %0
+  %5 = load i32* %k, align 4
+  %6 = load i32* %3, align 4
+  %7 = icmp slt i32 %5, %6                        ; k < n
+  br i1 %7, label %8, label %28
+
+; <label>:8                                       ; preds = %4
+  %9 = load i32* %k, align 4
+  %10 = sub nsw i32 %9, 1                         ; k - 1
+  %11 = sext i32 %10 to i64
+  %12 = load double** %1, align 8
+  %13 = getelementptr inbounds double* %12, i64 %11
+  %14 = load double* %13, align 8                 ; x[k-1]
+  %15 = load i32* %k, align 4
+  %16 = sext i32 %15 to i64
+  %17 = load double** %2, align 8
+  %18 = getelementptr inbounds double* %17, i64 %16
+  %19 = load double* %18, align 8                 ; y[k]
+  %20 = fadd double %14, %19                      ; x[k-1] + y[k]
+  %21 = load i32* %k, align 4
+  %22 = sext i32 %21 to i64
+  %23 = load double** %1, align 8
+  %24 = getelementptr inbounds double* %23, i64 %22
+  store double %20, double* %24, align 8          ; x[k] = x[k-1] + y[k]; same base pointer as the x[k-1] load
+  br label %25
+
+; <label>:25                                      ; preds = %8
+  %26 = load i32* %k, align 4
+  %27 = add nsw i32 %26, 1                        ; k + 1
+  store i32 %27, i32* %k, align 4
+  br label %4
+
+; <label>:28                                      ; preds = %4
+  ret i32 0                                       ; the IR returns 0 although the C sketch above is void
+}
+
+
+
+; We don't vectorize this function because a[i*7] is scalarized, and the
+; different scalars can in theory wrap around and overwrite other scalar
+; elements. At the moment we only allow read/write access to arrays
+; that are consecutive.
+; 
+; void foo(int *a) {
+;   for (int i=0; i<256; ++i) {
+;     int x = a[i*7];
+;     if (x>3)
+;       x = x*x+x*4;
+;     a[i*7] = x+3;
+;   }
+; }
+
+; CHECK: @func2
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+define i32 @func2(i32* nocapture %a) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %7, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %7 ]
+  %2 = mul nsw i64 %indvars.iv, 7                 ; i*7: the index advances by 7 per iteration, not 1
+  %3 = getelementptr inbounds i32* %a, i64 %2
+  %4 = load i32* %3, align 4                      ; x = a[i*7]
+  %5 = icmp sgt i32 %4, 3                         ; x > 3
+  br i1 %5, label %6, label %7
+
+; <label>:6                                       ; preds = %1
+  %tmp = add i32 %4, 4                            ; x + 4
+  %tmp1 = mul i32 %tmp, %4                        ; (x + 4) * x, i.e. x*x + x*4
+  br label %7
+
+; <label>:7                                       ; preds = %6, %1
+  %x.0 = phi i32 [ %tmp1, %6 ], [ %4, %1 ]
+  %8 = add nsw i32 %x.0, 3                        ; x + 3
+  store i32 %8, i32* %3, align 4                  ; written back through the same pointer that was loaded
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256       ; 256 iterations
+  br i1 %exitcond, label %9, label %1
+
+; <label>:9                                       ; preds = %7
+  ret i32 0
+}
diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll
index e537bde31b..7a14d247c9 100644
--- a/test/Transforms/LoopVectorize/scalar-select.ll
+++ b/test/Transforms/LoopVectorize/scalar-select.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/simple-unroll.ll b/test/Transforms/LoopVectorize/simple-unroll.ll
new file mode 100644
index 0000000000..7e2dd5fc0f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/simple-unroll.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+;  for (i=0; i<n; i++){
+;    a[i] += i;
+;  }
+;CHECK: @inc
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %n, 0                         ; the loop is skipped when n <= 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4                      ; a[i]
+  %4 = trunc i64 %indvars.iv to i32               ; i as a 32-bit value
+  %5 = add nsw i32 %3, %4                         ; a[i] + i
+  store i32 %5, i32* %2, align 4                  ; a[i] += i
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/small-loop.ll b/test/Transforms/LoopVectorize/small-loop.ll
index 4a6e4b231d..fa83dba3d3 100644
--- a/test/Transforms/LoopVectorize/small-loop.ll
+++ b/test/Transforms/LoopVectorize/small-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/small-size.ll b/test/Transforms/LoopVectorize/small-size.ll
new file mode 100644
index 0000000000..f390b33c03
--- /dev/null
+++ b/test/Transforms/LoopVectorize/small-size.ll
@@ -0,0 +1,170 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+@G = common global [32 x [1024 x i32]] zeroinitializer, align 16
+@ub = common global [1024 x i32] zeroinitializer, align 16
+@uc = common global [1024 x i32] zeroinitializer, align 16
+@d = common global [2048 x i32] zeroinitializer, align 16
+@fa = common global [1024 x float] zeroinitializer, align 16
+@fb = common global [1024 x float] zeroinitializer, align 16
+@ic = common global [1024 x i32] zeroinitializer, align 16
+@da = common global [1024 x float] zeroinitializer, align 16
+@db = common global [1024 x float] zeroinitializer, align 16
+@dc = common global [1024 x float] zeroinitializer, align 16
+@dd = common global [1024 x float] zeroinitializer, align 16
+@dj = common global [1024 x i32] zeroinitializer, align 16
+
+; We can vectorize this loop without a scalar tail: the trip count (256) is a multiple of the forced vector width (4).
+;CHECK: @example1
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1() optsize {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4                      ; b[i]
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4                      ; c[i]
+  %6 = add nsw i32 %5, %3                         ; c[i] + b[i]
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4                  ; a[i] = c[i] + b[i]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256       ; constant trip count of 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+; Can't vectorize in 'optsize' mode because we need a tail.
+;CHECK: @example2
+;CHECK-NOT: store <4 x i32>
+;CHECK: ret void
+define void @example2(i32 %n, i32 %x) optsize {
+  %1 = icmp sgt i32 %n, 0                         ; the first loop only runs when n > 0
+  br i1 %1, label %.lr.ph5, label %.preheader
+
+..preheader_crit_edge:                            ; preds = %.lr.ph5
+  %phitmp = sext i32 %n to i64                    ; the second loop's index starts at n when the first loop ran
+  br label %.preheader
+
+.preheader:                                       ; preds = %..preheader_crit_edge, %0
+  %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ]
+  %2 = icmp eq i32 %n, 0                          ; the second loop is skipped when n == 0
+  br i1 %2, label %._crit_edge, label %.lr.ph
+
+.lr.ph5:                                          ; preds = %0, %.lr.ph5
+  %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
+  %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6
+  store i32 %x, i32* %3, align 4                  ; first loop: b[i] = x
+  %indvars.iv.next7 = add i64 %indvars.iv6, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n        ; runtime trip count n
+  br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5
+
+.lr.ph:                                           ; preds = %.preheader, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]
+  %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]  ; down-counter, starts at n
+  %4 = add nsw i32 %.02, -1
+  %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %6 = load i32* %5, align 4                      ; b[i]
+  %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %8 = load i32* %7, align 4                      ; c[i]
+  %9 = and i32 %8, %6                             ; c[i] & b[i]
+  %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %9, i32* %10, align 4                 ; second loop: a[i] = c[i] & b[i]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %11 = icmp eq i32 %4, 0                         ; stop when the down-counter reaches 0
+  br i1 %11, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %.preheader
+  ret void
+}
+
+; N is unknown, we need a tail. Can't vectorize.
+;CHECK: @example3
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
+  %1 = icmp eq i32 %n, 0                          ; nothing to do when n == 0
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]      ; down-counter, starts at n
+  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]    ; destination cursor, starts at p
+  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]    ; source cursor, starts at q
+  %2 = add nsw i32 %.05, -1
+  %3 = getelementptr inbounds i32* %.023, i64 1   ; q + 1
+  %4 = load i32* %.023, align 16
+  %5 = getelementptr inbounds i32* %.014, i64 1   ; p + 1
+  store i32 %4, i32* %.014, align 16              ; *p = *q
+  %6 = icmp eq i32 %2, 0                          ; runtime trip count n
+  br i1 %6, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+
+; We can't vectorize this one because we need a runtime ptr check.
+;CHECK: @example23
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]        ; source cursor
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]       ; destination cursor
+  %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]           ; iteration counter
+  %2 = getelementptr inbounds i16* %.04, i64 1
+  %3 = load i16* %.04, align 2                    ; *src
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7                      ; zero-extended value << 7
+  %6 = getelementptr inbounds i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4               ; *dst = (*src) << 7; src and dst are not marked noalias here
+  %7 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %7, 256                 ; constant trip count of 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+
+; We CAN vectorize this example because the pointers are marked as noalias.
+;CHECK: @example23b
+;CHECK: <4 x i32>
+;CHECK: ret void
+define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]        ; source cursor
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]       ; destination cursor
+  %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]           ; iteration counter
+  %2 = getelementptr inbounds i16* %.04, i64 1
+  %3 = load i16* %.04, align 2                    ; *src
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7                      ; zero-extended value << 7
+  %6 = getelementptr inbounds i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4               ; *dst = (*src) << 7; both pointers are noalias in this variant
+  %7 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %7, 256                 ; constant trip count of 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+
diff --git a/test/Transforms/LoopVectorize/start-non-zero.ll b/test/Transforms/LoopVectorize/start-non-zero.ll
index 5aa3bc034d..998001c318 100644
--- a/test/Transforms/LoopVectorize/start-non-zero.ll
+++ b/test/Transforms/LoopVectorize/start-non-zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/write-only.ll b/test/Transforms/LoopVectorize/write-only.ll
index eb02760413..54cbe8df46 100644
--- a/test/Transforms/LoopVectorize/write-only.ll
+++ b/test/Transforms/LoopVectorize/write-only.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/MergeFunc/2011-02-08-RemoveEqual.ll b/test/Transforms/MergeFunc/2011-02-08-RemoveEqual.ll
index e3e52b401a..19cd6a5171 100644
--- a/test/Transforms/MergeFunc/2011-02-08-RemoveEqual.ll
+++ b/test/Transforms/MergeFunc/2011-02-08-RemoveEqual.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mergefunc %s -disable-output
+; RUN: opt -mergefunc -disable-output < %s
 ; This used to crash.
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
diff --git a/test/Transforms/MergeFunc/2013-01-10-MergeFuncAssert.ll b/test/Transforms/MergeFunc/2013-01-10-MergeFuncAssert.ll
new file mode 100644
index 0000000000..3f6a5ba157
--- /dev/null
+++ b/test/Transforms/MergeFunc/2013-01-10-MergeFuncAssert.ll
@@ -0,0 +1,36 @@
+; RUN: opt -mergefunc -disable-output < %s
+; This used to trigger a ConstantExpr::getBitCast assertion.
+
+define void @t1() unnamed_addr uwtable ssp align 2 {
+entry:
+  switch i32 undef, label %sw.bb12 [              ; the default destination is the only block that returns
+    i32 127, label %sw.bb
+    i32 126, label %sw.bb4
+  ]
+
+sw.bb:                                            ; preds = %entry
+  unreachable
+
+sw.bb4:                                           ; preds = %entry
+  unreachable
+
+sw.bb12:                                          ; preds = %entry
+  ret void
+}
+
+define void @t2() unnamed_addr uwtable ssp align 2 {
+entry:
+  switch i32 undef, label %sw.bb8 [               ; same shape as @t1, but with different case values
+    i32 4, label %sw.bb
+    i32 3, label %sw.bb4                          ; here the second case returns and the default is unreachable
+  ]
+
+sw.bb:                                            ; preds = %entry
+  unreachable
+
+sw.bb4:                                           ; preds = %entry
+  ret void
+
+sw.bb8:                                           ; preds = %entry
+  unreachable
+}
diff --git a/test/Transforms/MetaRenamer/metarenamer.ll b/test/Transforms/MetaRenamer/metarenamer.ll
index ad41bcf50f..4020e10450 100644
--- a/test/Transforms/MetaRenamer/metarenamer.ll
+++ b/test/Transforms/MetaRenamer/metarenamer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -metarenamer -S | FileCheck %s
+; RUN: opt -metarenamer -S < %s | FileCheck %s
 
 ; CHECK: target triple {{.*}}
 ; CHECK-NOT: {{^x*}}xxx{{^x*}}
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index 7b64b1be7c..4faffa55e7 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -405,7 +405,7 @@ entry:
 
 ; CHECK: define void @test11(
 ; CHECK: tail call i8* @objc_retain(i8* %x) nounwind
-; CHECK: tail call i8* @objc_autorelease(i8* %0) nounwind
+; CHECK: call i8* @objc_autorelease(i8* %0) nounwind
 ; CHECK: }
 define void @test11(i8* %x) nounwind {
 entry:
@@ -465,7 +465,7 @@ entry:
 ; CHECK: tail call i8* @objc_retain(i8* %x) nounwind
 ; CHECK: tail call i8* @objc_retain(i8* %x) nounwind
 ; CHECK: @use_pointer(i8* %x)
-; CHECK: tail call i8* @objc_autorelease(i8* %x) nounwind
+; CHECK: call i8* @objc_autorelease(i8* %x) nounwind
 ; CHECK: }
 define void @test13(i8* %x, i64 %n) {
 entry:
@@ -1452,7 +1452,7 @@ define void @test45(i8** %pp, i8** %qq) {
 ; CHECK: define void @test46(
 ; CHECK: tail call i8* @objc_retain(i8* %p) nounwind
 ; CHECK: true:
-; CHECK: tail call i8* @objc_autorelease(i8* %p) nounwind
+; CHECK: call i8* @objc_autorelease(i8* %p) nounwind
 define void @test46(i8* %p, i1 %a) {
 entry:
   call i8* @objc_retain(i8* %p)
diff --git a/test/Transforms/ObjCARC/contract.ll b/test/Transforms/ObjCARC/contract.ll
index c48f8a534f..40b11a9d0e 100644
--- a/test/Transforms/ObjCARC/contract.ll
+++ b/test/Transforms/ObjCARC/contract.ll
@@ -39,7 +39,7 @@ entry:
 define void @test2(i8* %x) nounwind {
 entry:
   %0 = tail call i8* @objc_retain(i8* %x) nounwind
-  tail call i8* @objc_autorelease(i8* %0) nounwind
+  call i8* @objc_autorelease(i8* %0) nounwind
   call void @use_pointer(i8* %x)
   ret void
 }
@@ -66,7 +66,7 @@ define void @test3(i8* %x, i64 %n) {
 entry:
   tail call i8* @objc_retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
-  tail call i8* @objc_autorelease(i8* %x) nounwind
+  call i8* @objc_autorelease(i8* %x) nounwind
   ret void
 }
 
@@ -84,7 +84,7 @@ define void @test4(i8* %x, i64 %n) {
 entry:
   tail call i8* @objc_retain(i8* %x) nounwind
   call void @use_pointer(i8* %x)
-  tail call i8* @objc_autorelease(i8* %x) nounwind
+  call i8* @objc_autorelease(i8* %x) nounwind
   tail call void @objc_release(i8* %x) nounwind
   ret void
 }
@@ -94,7 +94,7 @@ entry:
 ; CHECK: define void @test5(
 ; CHECK: tail call i8* @objc_retain(i8* %p) nounwind
 ; CHECK: true:
-; CHECK: tail call i8* @objc_autorelease(i8* %0) nounwind
+; CHECK: call i8* @objc_autorelease(i8* %0) nounwind
 ; CHECK: }
 define void @test5(i8* %p, i1 %a) {
 entry:
@@ -102,7 +102,7 @@ entry:
   br i1 %a, label %true, label %false
 
 true:
-  tail call i8* @objc_autorelease(i8* %p) nounwind
+  call i8* @objc_autorelease(i8* %p) nounwind
   call void @use_pointer(i8* %p)
   ret void
 
diff --git a/test/Transforms/ObjCARC/dont-infinite-loop-during-block-escape-analysis.ll b/test/Transforms/ObjCARC/dont-infinite-loop-during-block-escape-analysis.ll
new file mode 100644
index 0000000000..bdee2be94f
--- /dev/null
+++ b/test/Transforms/ObjCARC/dont-infinite-loop-during-block-escape-analysis.ll
@@ -0,0 +1,87 @@
+; RUN: opt -S -objc-arc < %s
+; bugzilla://14551
+; rdar://12851911
+
+; Make sure that we do not hang clang during escape analysis.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-darwin"
+
+%struct.__block_descriptor = type { i64, i64 }
+%struct.__block_byref_foo = type { i8*, %struct.__block_byref_foo*, i32, i32, i32 }
+
+@_NSConcreteGlobalBlock = external global i8*
+@.str = private unnamed_addr constant [6 x i8] c"v8@?0\00", align 1
+@__block_descriptor_tmp = internal constant { i64, i64, i8*, i8* } { i64 0, i64 32, i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i8* null }
+@__block_literal_global = internal constant { i8**, i32, i32, i8*, %struct.__block_descriptor* } { i8** @_NSConcreteGlobalBlock, i32 1342177280, i32 0, i8* bitcast (void (i8*)* @__hang_clang_block_invoke to i8*), %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8* }* @__block_descriptor_tmp to %struct.__block_descriptor*) }, align 8
+
+define void @hang_clang() uwtable optsize ssp {
+entry:
+  %foo = alloca %struct.__block_byref_foo, align 8
+  %byref.isa = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 0
+  store i8* null, i8** %byref.isa, align 8
+  %byref.forwarding = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 1
+  store %struct.__block_byref_foo* %foo, %struct.__block_byref_foo** %byref.forwarding, align 8  ; field 1 points back at %foo itself
+  %byref.flags = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 2
+  store i32 536870912, i32* %byref.flags, align 8
+  %byref.size = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 3
+  store i32 32, i32* %byref.size, align 4
+  %foo1 = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 4
+  store i32 0, i32* %foo1, align 8, !tbaa !4      ; field 4 (the i32 payload) starts at 0
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc.for.body_crit_edge, %entry
+  %0 = phi i1 [ true, %entry ], [ %phitmp, %for.inc.for.body_crit_edge ]
+  %i.06 = phi i32 [ 1, %entry ], [ %phitmp8, %for.inc.for.body_crit_edge ]
+  %block.05 = phi void (...)* [ null, %entry ], [ %block.1, %for.inc.for.body_crit_edge ]  ; loop-carried block pointer, null on entry
+  br i1 %0, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %1 = call i8* @objc_retainBlock(i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to i8*)) nounwind, !clang.arc.copy_on_escape !7
+  %2 = bitcast i8* %1 to void (...)*
+  %3 = bitcast void (...)* %block.05 to i8*
+  call void @objc_release(i8* %3) nounwind, !clang.imprecise_release !7  ; releases the block carried in from the previous iteration
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %block.1 = phi void (...)* [ %2, %if.then ], [ %block.05, %for.body ]  ; feeds %block.05 on the back edge, so the two phis form a cycle
+  %exitcond = icmp eq i32 %i.06, 10
+  br i1 %exitcond, label %for.end, label %for.inc.for.body_crit_edge
+
+for.inc.for.body_crit_edge:                       ; preds = %for.inc
+  %.pre = load %struct.__block_byref_foo** %byref.forwarding, align 8
+  %foo2.phi.trans.insert = getelementptr inbounds %struct.__block_byref_foo* %.pre, i64 0, i32 4
+  %.pre7 = load i32* %foo2.phi.trans.insert, align 4, !tbaa !4
+  %phitmp = icmp eq i32 %.pre7, 0                 ; next iteration's branch condition: payload == 0
+  %phitmp8 = add i32 %i.06, 1
+  br label %for.body
+
+for.end:                                          ; preds = %for.inc
+  %4 = bitcast %struct.__block_byref_foo* %foo to i8*
+  call void @_Block_object_dispose(i8* %4, i32 8)
+  %5 = bitcast void (...)* %block.1 to i8*
+  call void @objc_release(i8* %5) nounwind, !clang.imprecise_release !7
+  ret void
+}
+
+define internal void @__hang_clang_block_invoke(i8* nocapture %.block_descriptor) nounwind uwtable readnone optsize ssp {
+entry:
+  ret void                                        ; empty body; its address is stored in @__block_literal_global
+}
+
+declare i8* @objc_retainBlock(i8*)
+
+declare void @objc_release(i8*) nonlazybind
+
+declare void @_Block_object_dispose(i8*, i32)
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!4 = metadata !{metadata !"int", metadata !5}
+!5 = metadata !{metadata !"omnipotent char", metadata !6}
+!6 = metadata !{metadata !"Simple C/C++ TBAA"}
+!7 = metadata !{}
diff --git a/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll b/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll
index 170d0a99c9..d7a54ab430 100644
--- a/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll
+++ b/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll
@@ -212,7 +212,7 @@ bb99:                                             ; preds = %bb57
   br label %bb104
 
 bb104:                                            ; preds = %bb99, %bb57
-  %tmp105 = tail call i8* @objc_autorelease(i8* %tmp72) nounwind
+  %tmp105 = call i8* @objc_autorelease(i8* %tmp72) nounwind
   %tmp106 = bitcast i8* %tmp105 to %14*
   tail call void @objc_release(i8* %tmp85) nounwind
   %tmp107 = bitcast %18* %tmp47 to i8*
diff --git a/test/Transforms/ObjCARC/pr12270.ll b/test/Transforms/ObjCARC/pr12270.ll
index 1faae5f687..bdff0d7b4d 100644
--- a/test/Transforms/ObjCARC/pr12270.ll
+++ b/test/Transforms/ObjCARC/pr12270.ll
@@ -1,4 +1,4 @@
-; RUN: opt -disable-output -objc-arc-contract %s
+; RUN: opt -disable-output -objc-arc-contract < %s
 ; test that we don't crash on unreachable code
 %2 = type opaque
 
diff --git a/test/Transforms/ObjCARC/rv.ll b/test/Transforms/ObjCARC/rv.ll
index 9353a19f71..638b89ccc7 100644
--- a/test/Transforms/ObjCARC/rv.ll
+++ b/test/Transforms/ObjCARC/rv.ll
@@ -150,7 +150,7 @@ define void @test8() {
 ; Don't apply the RV optimization to autorelease if there's no retain.
 
 ; CHECK: define i8* @test9(i8* %p)
-; CHECK: tail call i8* @objc_autorelease(i8* %p)
+; CHECK: call i8* @objc_autorelease(i8* %p)
 define i8* @test9(i8* %p) {
   call i8* @objc_autorelease(i8* %p)
   ret i8* %p
@@ -174,7 +174,7 @@ define i8* @test10(i8* %p) {
 ; CHECK: define i8* @test11(i8* %p)
 ; CHECK: tail call i8* @objc_retain(i8* %p)
 ; CHECK-NEXT: call void @use_pointer(i8* %p)
-; CHECK: tail call i8* @objc_autorelease(i8* %p)
+; CHECK: call i8* @objc_autorelease(i8* %p)
 ; CHECK-NEXT: ret i8* %p
 define i8* @test11(i8* %p) {
   %1 = call i8* @objc_retain(i8* %p)
@@ -201,7 +201,7 @@ define i8* @test12(i8* %p) {
 
 ; CHECK: define i8* @test13(
 ; CHECK: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
-; CHECK: tail call i8* @objc_autorelease(i8* %p)
+; CHECK: call i8* @objc_autorelease(i8* %p)
 ; CHECK: ret i8* %p
 define i8* @test13() {
   %p = call i8* @returner()
@@ -323,7 +323,7 @@ define i8* @test22(i8* %p) {
 ; Convert autoreleaseRV to autorelease.
 
 ; CHECK: define void @test23(
-; CHECK: tail call i8* @objc_autorelease(i8* %p) nounwind
+; CHECK: call i8* @objc_autorelease(i8* %p) nounwind
 define void @test23(i8* %p) {
   store i8 0, i8* %p
   call i8* @objc_autoreleaseReturnValue(i8* %p)
diff --git a/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll b/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll
new file mode 100644
index 0000000000..74ac97c7b3
--- /dev/null
+++ b/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll
@@ -0,0 +1,84 @@
+; RUN: opt -objc-arc -S < %s | FileCheck %s
+
+declare i8* @objc_release(i8* %x)
+declare i8* @objc_retain(i8* %x)
+declare i8* @objc_autorelease(i8* %x)
+declare i8* @objc_autoreleaseReturnValue(i8* %x)
+declare i8* @objc_retainAutoreleasedReturnValue(i8* %x)
+
+; Never tail call objc_autorelease.
+define i8* @test0(i8* %x) {
+entry:
+  ; CHECK: %tmp0 = call i8* @objc_autorelease(i8* %x)
+  %tmp0 = call i8* @objc_autorelease(i8* %x)      ; input is a plain call; expected to stay plain
+  ; CHECK: %tmp1 = call i8* @objc_autorelease(i8* %x)
+  %tmp1 = tail call i8* @objc_autorelease(i8* %x) ; input carries 'tail'; expected output drops it
+
+  ret i8* %x
+}
+
+; Always tail call autoreleaseReturnValue.
+define i8* @test1(i8* %x) {
+entry:
+  ; CHECK: %tmp0 = tail call i8* @objc_autoreleaseReturnValue(i8* %x)
+  %tmp0 = call i8* @objc_autoreleaseReturnValue(i8* %x)       ; input is a plain call; expected output adds 'tail'
+  ; CHECK: %tmp1 = tail call i8* @objc_autoreleaseReturnValue(i8* %x)
+  %tmp1 = tail call i8* @objc_autoreleaseReturnValue(i8* %x)  ; input already carries 'tail'; expected to keep it
+  ret i8* %x
+}
+
+; Always tail call objc_retain.
+define i8* @test2(i8* %x) {
+entry:
+  ; CHECK: %tmp0 = tail call i8* @objc_retain(i8* %x)
+  %tmp0 = call i8* @objc_retain(i8* %x)           ; input is a plain call; expected output adds 'tail'
+  ; CHECK: %tmp1 = tail call i8* @objc_retain(i8* %x)
+  %tmp1 = tail call i8* @objc_retain(i8* %x)      ; input already carries 'tail'; expected to keep it
+  ret i8* %x
+}
+
+define i8* @tmp(i8* %x) {                         ; identity helper; @test3 calls it to produce a call result
+  ret i8* %x
+}
+
+; Always tail call objc_retainAutoreleasedReturnValue.
+define i8* @test3(i8* %x) {
+entry:
+  %y = call i8* @tmp(i8* %x)                      ; the operand of the next call is itself a call result
+  ; CHECK: %tmp0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y)
+  %tmp0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %y)       ; plain call in; expected output adds 'tail'
+  %z = call i8* @tmp(i8* %x)
+  ; CHECK: %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %z)
+  %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %z)  ; already 'tail'; expected to keep it
+  ret i8* %x
+}
+
+; By itself, we should never change whether or not objc_release is tail called.
+define i8* @test4(i8* %x) {
+entry:
+  ; CHECK: %tmp0 = call i8* @objc_release(i8* %x)
+  %tmp0 = call i8* @objc_release(i8* %x)          ; plain call in, plain call expected out
+  ; CHECK: %tmp1 = tail call i8* @objc_release(i8* %x)
+  %tmp1 = tail call i8* @objc_release(i8* %x)     ; 'tail' in, 'tail' expected out
+  ret i8* %x
+}
+
+; If we convert a tail called @objc_autoreleaseReturnValue to an
+; @objc_autorelease, ensure that the tail call is removed.
+define i8* @test5(i8* %x) {
+entry:
+  ; CHECK: %tmp0 = call i8* @objc_autorelease(i8* %x)
+  %tmp0 = tail call i8* @objc_autoreleaseReturnValue(i8* %x)  ; expected output: different callee and no 'tail'
+  ret i8* %tmp0
+}
+
+; If we convert a non-tail-called @objc_autorelease to an
+; @objc_autoreleaseReturnValue, ensure that the tail call marker is added.
+define i8* @test6(i8* %x) {
+entry:
+  ; CHECK: %tmp0 = tail call i8* @objc_retain(i8* %x)
+  %tmp0 = tail call i8* @objc_retain(i8* %x)      ; already 'tail'; expected to keep it
+  ; CHECK: %tmp1 = tail call i8* @objc_autoreleaseReturnValue(i8* %x)
+  %tmp1 = call i8* @objc_autorelease(i8* %x)      ; expected output: callee becomes the ReturnValue form and gains 'tail'
+  ret i8* %x
+}
diff --git a/test/Transforms/PhaseOrdering/2010-03-22-empty-baseclass.ll b/test/Transforms/PhaseOrdering/2010-03-22-empty-baseclass.ll
index 8859da8de1..53d98e02ec 100644
--- a/test/Transforms/PhaseOrdering/2010-03-22-empty-baseclass.ll
+++ b/test/Transforms/PhaseOrdering/2010-03-22-empty-baseclass.ll
@@ -1,4 +1,4 @@
-; RUN: opt -O2 %s -S -o - | FileCheck %s
+; RUN: opt -O2 -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin11.1"
diff --git a/test/Transforms/PhaseOrdering/PR6627.ll b/test/Transforms/PhaseOrdering/PR6627.ll
index ef9947f103..58b762a7af 100644
--- a/test/Transforms/PhaseOrdering/PR6627.ll
+++ b/test/Transforms/PhaseOrdering/PR6627.ll
@@ -1,4 +1,4 @@
-; RUN: opt -O3 -S %s | FileCheck %s
+; RUN: opt -O3 -S < %s | FileCheck %s
 ; XFAIL: *
 
 declare i32 @doo(...)
diff --git a/test/Transforms/PhaseOrdering/basic.ll b/test/Transforms/PhaseOrdering/basic.ll
index 88ebca0a9c..8fbe8c58f4 100644
--- a/test/Transforms/PhaseOrdering/basic.ll
+++ b/test/Transforms/PhaseOrdering/basic.ll
@@ -1,4 +1,4 @@
-; RUN: opt -O3 -S %s | FileCheck %s
+; RUN: opt -O3 -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.6.7"
diff --git a/test/Transforms/PhaseOrdering/gdce.ll b/test/Transforms/PhaseOrdering/gdce.ll
index 273e47e97c..95f06757a7 100644
--- a/test/Transforms/PhaseOrdering/gdce.ll
+++ b/test/Transforms/PhaseOrdering/gdce.ll
@@ -1,4 +1,4 @@
-; RUN: opt -O2 -S %s | FileCheck %s
+; RUN: opt -O2 -S < %s | FileCheck %s
 
 ; Run global DCE to eliminate unused ctor and dtor.
 ; rdar://9142819
diff --git a/test/Transforms/PhaseOrdering/scev.ll b/test/Transforms/PhaseOrdering/scev.ll
index c731280822..39adb6b73d 100644
--- a/test/Transforms/PhaseOrdering/scev.ll
+++ b/test/Transforms/PhaseOrdering/scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt -O3 -S -analyze -scalar-evolution %s | FileCheck %s
+; RUN: opt -O3 -S -analyze -scalar-evolution < %s | FileCheck %s
 ;
 ; This file contains phase ordering tests for scalar evolution.
 ; Test that the standard passes don't obfuscate the IR so scalar evolution can't
diff --git a/test/Transforms/Reassociate/crash.ll b/test/Transforms/Reassociate/crash.ll
index e29b5dc9c0..770f97371d 100644
--- a/test/Transforms/Reassociate/crash.ll
+++ b/test/Transforms/Reassociate/crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -reassociate -disable-output %s
+; RUN: opt -reassociate -disable-output < %s
 
 
 ; rdar://7507855
diff --git a/test/Transforms/Reg2Mem/crash.ll b/test/Transforms/Reg2Mem/crash.ll
new file mode 100644
index 0000000000..02fed94b85
--- /dev/null
+++ b/test/Transforms/Reg2Mem/crash.ll
@@ -0,0 +1,88 @@
+; RUN: opt -reg2mem -disable-output < %s
+; PR14782
+
+declare void @f1()
+
+declare i32 @__gxx_personality_sj0(...)
+
+declare void @f2()
+
+declare void @f3()
+
+declare void @f4_()
+
+declare void @_Z12xxxdtsP10xxxpq()
+
+define hidden void @_ZN12xxxyzIi9xxxwLi29ELi0EE4f3NewES0_i() ssp align 2 {
+bb:
+  invoke void @f4_()
+          to label %bb1 unwind label %.thread
+
+.thread:                                          ; preds = %bb
+  %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %bb13
+
+bb1:                                              ; preds = %bb
+  invoke void @f1()
+          to label %.noexc unwind label %bb10
+
+.noexc:                                           ; preds = %bb1
+  invoke void @f4_()
+          to label %bb6 unwind label %bb2
+
+bb2:                                              ; preds = %.noexc
+  %tmp3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  invoke void @f3()
+          to label %.body unwind label %bb4
+
+bb4:                                              ; preds = %bb2
+  %tmp5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          catch i8* null
+  unreachable
+
+bb6:                                              ; preds = %.noexc
+  invoke void @_Z12xxxdtsP10xxxpq()
+          to label %_ZN6xxxdIN12xxxyzIi9xxxwLi29ELi0EE4fr1jS3_.exit unwind label %bb10
+
+_ZN6xxxdIN12xxxyzIi9xxxwLi29ELi0EE4fr1jS3_.exit:  ; preds = %bb6
+  invoke void @f2()
+          to label %bb7 unwind label %bb8
+
+bb7:                                              ; preds = %_ZN6xxxdIN12xxxyzIi9xxxwLi29ELi0EE4fr1jS3_.exit
+  ret void
+
+bb8:                                              ; preds = %_ZN6xxxdIN12xxxyzIi9xxxwLi29ELi0EE4fr1jS3_.exit
+  %tmp9 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %_ZN10xxxpqdlev.exit
+
+bb10:                                             ; preds = %bb6, %bb1
+  %.1 = phi i1 [ true, %bb1 ], [ false, %bb6 ]
+  %tmp11 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %.body
+
+.body:                                            ; preds = %bb10, %bb2
+  %.1.lpad-body = phi i1 [ %.1, %bb10 ], [ true, %bb2 ]
+  invoke void @f2()
+          to label %bb12 unwind label %bb14
+
+bb12:                                             ; preds = %.body
+  br i1 %.1.lpad-body, label %bb13, label %_ZN10xxxpqdlev.exit
+
+bb13:                                             ; preds = %bb12, %.thread
+  invoke void @xxx_MemFree()
+          to label %_ZN10xxxpqdlev.exit unwind label %bb14
+
+_ZN10xxxpqdlev.exit:                              ; preds = %bb13, %bb12, %bb8
+  resume { i8*, i32 } undef
+
+bb14:                                             ; preds = %bb13, %.body
+  %tmp15 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          catch i8* null
+  unreachable
+}
+
+declare void @xxx_MemFree()
diff --git a/test/Transforms/Reg2Mem/lit.local.cfg b/test/Transforms/Reg2Mem/lit.local.cfg
new file mode 100644
index 0000000000..19eebc0ac7
--- /dev/null
+++ b/test/Transforms/Reg2Mem/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/SCCP/crash.ll b/test/Transforms/SCCP/crash.ll
index 2f6da1d726..88528902d7 100644
--- a/test/Transforms/SCCP/crash.ll
+++ b/test/Transforms/SCCP/crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -sccp -S
+; RUN: opt -sccp -S < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin10.0"
 
diff --git a/test/Transforms/SCCP/ipsccp-addr-taken.ll b/test/Transforms/SCCP/ipsccp-addr-taken.ll
index c6572fa5d1..b49da97ab2 100644
--- a/test/Transforms/SCCP/ipsccp-addr-taken.ll
+++ b/test/Transforms/SCCP/ipsccp-addr-taken.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -ipsccp -S | FileCheck %s
+; RUN: opt -ipsccp -S < %s | FileCheck %s
 ; PR7876
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/SCCP/retvalue-undef.ll b/test/Transforms/SCCP/retvalue-undef.ll
index 389561f8a1..5a4ba113b7 100644
--- a/test/Transforms/SCCP/retvalue-undef.ll
+++ b/test/Transforms/SCCP/retvalue-undef.ll
@@ -1,4 +1,4 @@
-; RUN: opt -ipsccp -S %s | FileCheck %s
+; RUN: opt -ipsccp -S < %s | FileCheck %s
 ; PR6414
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/SCCP/undef-resolve.ll b/test/Transforms/SCCP/undef-resolve.ll
index a3dddb799a..a1a600c960 100644
--- a/test/Transforms/SCCP/undef-resolve.ll
+++ b/test/Transforms/SCCP/undef-resolve.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -sccp -S | FileCheck %s
+; RUN: opt -sccp -S < %s | FileCheck %s
 
 
 ; PR6940
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index 9fe926ee2c..efc01acd59 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -575,8 +575,8 @@ entry:
   store i8 0, i8* %a2ptr
   %aiptr = bitcast [3 x i8]* %a to i24*
   %ai = load i24* %aiptr
-; CHCEK-NOT: store
-; CHCEK-NOT: load
+; CHECK-NOT: store
+; CHECK-NOT: load
 ; CHECK:      %[[ext2:.*]] = zext i8 0 to i24
 ; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
 ; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
@@ -597,8 +597,8 @@ entry:
   %b1 = load i8* %b1ptr
   %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
   %b2 = load i8* %b2ptr
-; CHCEK-NOT: store
-; CHCEK-NOT: load
+; CHECK-NOT: store
+; CHECK-NOT: load
 ; CHECK:      %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
 ; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
 ; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
@@ -1176,3 +1176,50 @@ entry:
   %baz = load i1* %a.i1, align 1
   ret void
 }
+
+define <3 x i8> @PR14572.1(i32 %x) {
+; Ensure that a split integer store which is wider than the type size of the
+; alloca (relying on the alloc size padding) doesn't trigger an assert.
+; CHECK: @PR14572.1
+
+entry:
+  %a = alloca <3 x i8>, align 4
+; CHECK-NOT: alloca
+
+  %cast = bitcast <3 x i8>* %a to i32*
+  store i32 %x, i32* %cast, align 1
+  %y = load <3 x i8>* %a, align 4
+  ret <3 x i8> %y
+; CHECK: ret <3 x i8>
+}
+
+define i32 @PR14572.2(<3 x i8> %x) {
+; Ensure that a split integer load which is wider than the type size of the
+; alloca (relying on the alloc size padding) doesn't trigger an assert.
+; CHECK: @PR14572.2
+
+entry:
+  %a = alloca <3 x i8>, align 4
+; CHECK-NOT: alloca
+
+  store <3 x i8> %x, <3 x i8>* %a, align 1
+  %cast = bitcast <3 x i8>* %a to i32*
+  %y = load i32* %cast, align 4
+  ret i32 %y
+; CHECK: ret i32
+}
+
+define i32 @PR14601(i32 %x) {
+; Don't try to form a promotable integer alloca when there is a variable length
+; memory intrinsic.
+; CHECK: @PR14601
+
+entry:
+  %a = alloca i32
+; CHECK: alloca
+
+  %a.i8 = bitcast i32* %a to i8*
+  call void @llvm.memset.p0i8.i32(i8* %a.i8, i8 0, i32 %x, i32 1, i1 false)
+  %v = load i32* %a
+  ret i32 %v
+}
diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll
index 1ac6d25d63..64a0cc7439 100644
--- a/test/Transforms/SROA/big-endian.ll
+++ b/test/Transforms/SROA/big-endian.ll
@@ -24,8 +24,8 @@ entry:
   store i8 0, i8* %a2ptr
   %aiptr = bitcast [3 x i8]* %a to i24*
   %ai = load i24* %aiptr
-; CHCEK-NOT: store
-; CHCEK-NOT: load
+; CHECK-NOT: store
+; CHECK-NOT: load
 ; CHECK:      %[[ext2:.*]] = zext i8 0 to i24
 ; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
 ; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
@@ -46,8 +46,8 @@ entry:
   %b1 = load i8* %b1ptr
   %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
   %b2 = load i8* %b2ptr
-; CHCEK-NOT: store
-; CHCEK-NOT: load
+; CHECK-NOT: store
+; CHECK-NOT: load
 ; CHECK:      %[[shift0:.*]] = lshr i24 %[[insert0]], 16
 ; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
 ; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
@@ -77,8 +77,8 @@ entry:
   %a2ptr = getelementptr [7 x i8]* %a, i64 0, i32 2
   %a3ptr = getelementptr [7 x i8]* %a, i64 0, i32 3
 
-; CHCEK-NOT: store
-; CHCEK-NOT: load
+; CHECK-NOT: store
+; CHECK-NOT: load
 
   %a0i16ptr = bitcast i8* %a0ptr to i16*
   store i16 1, i16* %a0i16ptr
@@ -98,8 +98,8 @@ entry:
 ; CHECK-NEXT: %[[mask3:.*]] = and i56 undef, -1099511627776
 ; CHECK-NEXT: %[[insert3:.*]] = or i56 %[[mask3]], %[[ext3]]
 
-; CHCEK-NOT: store
-; CHCEK-NOT: load
+; CHECK-NOT: store
+; CHECK-NOT: load
 
   %aiptr = bitcast [7 x i8]* %a to i56*
   %ai = load i56* %aiptr
diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll
index bb34e3f084..02f6d040cc 100644
--- a/test/Transforms/SROA/vector-promotion.ll
+++ b/test/Transforms/SROA/vector-promotion.ll
@@ -279,6 +279,89 @@ entry:
 ; CHECK-NEXT: ret <4 x i32> %[[ret]]
 }
 
+declare void @llvm.memset.p0i32.i32(i32* nocapture, i32, i32, i32, i1) nounwind
+
+define <4 x float> @test_subvec_memset() {
+; CHECK: @test_subvec_memset
+entry:
+  %a = alloca <4 x float>
+; CHECK-NOT: alloca
+
+  %a.gep0 = getelementptr <4 x float>* %a, i32 0, i32 0
+  %a.cast0 = bitcast float* %a.gep0 to i8*
+  call void @llvm.memset.p0i8.i32(i8* %a.cast0, i8 0, i32 8, i32 0, i1 false)
+; CHECK-NOT: store
+; CHECK:      %[[insert1:.*]] = shufflevector <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, <4 x float> undef, <4 x i32> <i32 0, i32 1, {{.*}}>
+
+  %a.gep1 = getelementptr <4 x float>* %a, i32 0, i32 1
+  %a.cast1 = bitcast float* %a.gep1 to i8*
+  call void @llvm.memset.p0i8.i32(i8* %a.cast1, i8 1, i32 8, i32 0, i1 false)
+; CHECK-NEXT: %[[insert2:.*]] = shufflevector <4 x float> <float undef, float 0x3820202020000000, float 0x3820202020000000, float undef>, <4 x float> %[[insert1]], <4 x i32> <i32 4, i32 1, i32 2, {{.*}}>
+
+  %a.gep2 = getelementptr <4 x float>* %a, i32 0, i32 2
+  %a.cast2 = bitcast float* %a.gep2 to i8*
+  call void @llvm.memset.p0i8.i32(i8* %a.cast2, i8 3, i32 8, i32 0, i1 false)
+; CHECK-NEXT: %[[insert3:.*]] = shufflevector <4 x float> <float undef, float undef, float 0x3860606060000000, float 0x3860606060000000>, <4 x float> %[[insert2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+
+  %a.gep3 = getelementptr <4 x float>* %a, i32 0, i32 3
+  %a.cast3 = bitcast float* %a.gep3 to i8*
+  call void @llvm.memset.p0i8.i32(i8* %a.cast3, i8 7, i32 4, i32 0, i1 false)
+; CHECK-NEXT: %[[insert4:.*]] = insertelement <4 x float> %[[insert3]], float 0x38E0E0E0E0000000, i32 3
+
+  %ret = load <4 x float>* %a
+
+  ret <4 x float> %ret
+; CHECK-NEXT: ret <4 x float> %[[insert4]]
+}
+
+define <4 x float> @test_subvec_memcpy(i8* %x, i8* %y, i8* %z, i8* %f, i8* %out) {
+; CHECK: @test_subvec_memcpy
+entry:
+  %a = alloca <4 x float>
+; CHECK-NOT: alloca
+
+  %a.gep0 = getelementptr <4 x float>* %a, i32 0, i32 0
+  %a.cast0 = bitcast float* %a.gep0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast0, i8* %x, i32 8, i32 0, i1 false)
+; CHECK:      %[[xptr:.*]] = bitcast i8* %x to <2 x float>*
+; CHECK-NEXT: %[[x:.*]] = load <2 x float>* %[[xptr]]
+; CHECK-NEXT: %[[expand_x:.*]] = shufflevector <2 x float> %[[x]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT: %[[insert_x:.*]] = shufflevector <4 x float> %[[expand_x]], <4 x float> undef, <4 x i32> <i32 0, i32 1, {{.*}}>
+
+  %a.gep1 = getelementptr <4 x float>* %a, i32 0, i32 1
+  %a.cast1 = bitcast float* %a.gep1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast1, i8* %y, i32 8, i32 0, i1 false)
+; CHECK-NEXT: %[[yptr:.*]] = bitcast i8* %y to <2 x float>*
+; CHECK-NEXT: %[[y:.*]] = load <2 x float>* %[[yptr]]
+; CHECK-NEXT: %[[expand_y:.*]] = shufflevector <2 x float> %[[y]], <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+; CHECK-NEXT: %[[insert_y:.*]] = shufflevector <4 x float> %[[expand_y]], <4 x float> %[[insert_x]], <4 x i32> <i32 4, i32 1, i32 2, {{.*}}>
+
+  %a.gep2 = getelementptr <4 x float>* %a, i32 0, i32 2
+  %a.cast2 = bitcast float* %a.gep2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast2, i8* %z, i32 8, i32 0, i1 false)
+; CHECK-NEXT: %[[zptr:.*]] = bitcast i8* %z to <2 x float>*
+; CHECK-NEXT: %[[z:.*]] = load <2 x float>* %[[zptr]]
+; CHECK-NEXT: %[[expand_z:.*]] = shufflevector <2 x float> %[[z]], <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+; CHECK-NEXT: %[[insert_z:.*]] = shufflevector <4 x float> %[[expand_z]], <4 x float> %[[insert_y]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+
+  %a.gep3 = getelementptr <4 x float>* %a, i32 0, i32 3
+  %a.cast3 = bitcast float* %a.gep3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast3, i8* %f, i32 4, i32 0, i1 false)
+; CHECK-NEXT: %[[fptr:.*]] = bitcast i8* %f to float*
+; CHECK-NEXT: %[[f:.*]] = load float* %[[fptr]]
+; CHECK-NEXT: %[[insert_f:.*]] = insertelement <4 x float> %[[insert_z]], float %[[f]], i32 3
+
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %a.cast2, i32 8, i32 0, i1 false)
+; CHECK-NEXT: %[[outptr:.*]] = bitcast i8* %out to <2 x float>*
+; CHECK-NEXT: %[[extract_out:.*]] = shufflevector <4 x float> %[[insert_f]], <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: store <2 x float> %[[extract_out]], <2 x float>* %[[outptr]]
+
+  %ret = load <4 x float>* %a
+
+  ret <4 x float> %ret
+; CHECK-NEXT: ret <4 x float> %[[insert_f]]
+}
+
 define i32 @PR14212() {
 ; CHECK: @PR14212
 ; This caused a crash when "splitting" the load of the i32 in order to promote
diff --git a/test/Transforms/SROA/vectors-of-pointers.ll b/test/Transforms/SROA/vectors-of-pointers.ll
new file mode 100644
index 0000000000..7e995b9e44
--- /dev/null
+++ b/test/Transforms/SROA/vectors-of-pointers.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -sroa
+
+; Make sure we don't crash on this one.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @foo() {
+entry:
+  %Args.i = alloca <2 x i32*>, align 16
+  br i1 undef, label %bb0.exit158, label %if.then.i.i.i.i.i138
+
+if.then.i.i.i.i.i138:
+  unreachable
+
+bb0.exit158:
+  br i1 undef, label %bb0.exit257, label %if.then.i.i.i.i.i237
+
+if.then.i.i.i.i.i237:
+  unreachable
+
+bb0.exit257:
+  %0 = load <2 x i32*>* %Args.i, align 16
+  unreachable
+}
diff --git a/test/Transforms/ScalarRepl/crash.ll b/test/Transforms/ScalarRepl/crash.ll
index 58c5a3a052..8c60dceb8b 100644
--- a/test/Transforms/ScalarRepl/crash.ll
+++ b/test/Transforms/ScalarRepl/crash.ll
@@ -1,5 +1,5 @@
-; RUN: opt -scalarrepl %s -disable-output
-; RUN: opt -scalarrepl-ssa %s -disable-output
+; RUN: opt -scalarrepl -disable-output < %s
+; RUN: opt -scalarrepl-ssa -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/ScalarRepl/memcpy-align.ll b/test/Transforms/ScalarRepl/memcpy-align.ll
index a7af208f4f..6046e1295d 100644
--- a/test/Transforms/ScalarRepl/memcpy-align.ll
+++ b/test/Transforms/ScalarRepl/memcpy-align.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -scalarrepl -S | FileCheck %s
+; RUN: opt -scalarrepl -S < %s | FileCheck %s
 ; PR6832
 target datalayout =
 "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32"
diff --git a/test/Transforms/ScalarRepl/phi-select.ll b/test/Transforms/ScalarRepl/phi-select.ll
index ffe0b1dd5f..5c21c3bd9f 100644
--- a/test/Transforms/ScalarRepl/phi-select.ll
+++ b/test/Transforms/ScalarRepl/phi-select.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -scalarrepl -S | FileCheck %s
+; RUN: opt -scalarrepl -S < %s | FileCheck %s
 ; Test promotion of allocas that have phis and select users.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.2"
diff --git a/test/Transforms/SimplifyCFG/2010-03-30-InvokeCrash.ll b/test/Transforms/SimplifyCFG/2010-03-30-InvokeCrash.ll
index 7bffa1a8e0..333336de76 100644
--- a/test/Transforms/SimplifyCFG/2010-03-30-InvokeCrash.ll
+++ b/test/Transforms/SimplifyCFG/2010-03-30-InvokeCrash.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -simplifycfg -disable-output
+; RUN: opt -simplifycfg -disable-output < %s
 ; END.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/SimplifyCFG/PR9946.ll b/test/Transforms/SimplifyCFG/PR9946.ll
index 4a61b84605..c355a8f5cc 100644
--- a/test/Transforms/SimplifyCFG/PR9946.ll
+++ b/test/Transforms/SimplifyCFG/PR9946.ll
@@ -1,4 +1,4 @@
-; RUN: opt  %s -simplifycfg -disable-output
+; RUN: opt -simplifycfg -disable-output < %s
 
 @foo = external constant i32
 
diff --git a/test/Transforms/SimplifyCFG/branch-fold-dbg.ll b/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
index 0897c95a67..0526883fe8 100644
--- a/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
+++ b/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
@@ -1,4 +1,4 @@
-; RUN: opt -simplifycfg -S %s | FileCheck %s
+; RUN: opt -simplifycfg -S < %s | FileCheck %s
 
 %0 = type { i32*, i32* }
 
diff --git a/test/Transforms/SimplifyCFG/select-gep.ll b/test/Transforms/SimplifyCFG/select-gep.ll
index 7654d0271a..3e2a6237b2 100644
--- a/test/Transforms/SimplifyCFG/select-gep.ll
+++ b/test/Transforms/SimplifyCFG/select-gep.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -simplifycfg %s | FileCheck %s
+; RUN: opt -S -simplifycfg < %s | FileCheck %s
 
 define i8* @test1(i8* %x, i64 %y) nounwind {
 entry:
diff --git a/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
index aecb887beb..ad54c3e38f 100644
--- a/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
+++ b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -simplify-libcalls -instcombine %s | FileCheck %s
+; RUN: opt -S -simplify-libcalls -instcombine < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
diff --git a/test/Transforms/StripSymbols/2010-08-25-crash.ll b/test/Transforms/StripSymbols/2010-08-25-crash.ll
index 3965c37822..7de5a02805 100644
--- a/test/Transforms/StripSymbols/2010-08-25-crash.ll
+++ b/test/Transforms/StripSymbols/2010-08-25-crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -strip-dead-debug-info -disable-output %s
+; RUN: opt -strip-dead-debug-info -disable-output < %s
 define i32 @foo() nounwind ssp {
 entry:
   ret i32 0, !dbg !8
diff --git a/test/Transforms/StripSymbols/block-address.ll b/test/Transforms/StripSymbols/block-address.ll
index d22c6b1b15..113d4d94fa 100644
--- a/test/Transforms/StripSymbols/block-address.ll
+++ b/test/Transforms/StripSymbols/block-address.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -strip -S | FileCheck %s
+; RUN: opt -strip -S < %s | FileCheck %s
 ; PR10286
 
 @main_addrs = constant [2 x i8*] [i8* blockaddress(@f, %FOO), i8* blockaddress(@f, %BAR)]
diff --git a/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll b/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll
index e4f8b483c3..97e67b2642 100644
--- a/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll
+++ b/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll
@@ -1,5 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | \
-; RUN:    grep "call i32 @foo"
+; RUN: opt < %s -tailcallelim -S | FileCheck %s
 
 declare void @bar(i32*)
 
@@ -7,6 +6,7 @@ define i32 @foo(i32 %N) {
 	%A = alloca i32, i32 %N		; <i32*> [#uses=2]
 	store i32 17, i32* %A
 	call void @bar( i32* %A )
+; CHECK: tail call i32 @foo
 	%X = tail call i32 @foo( i32 %N )		; <i32> [#uses=1]
 	ret i32 %X
 }
diff --git a/test/Transforms/TailCallElim/dup_tail.ll b/test/Transforms/TailCallElim/dup_tail.ll
index 42ac2f9dc4..3b87ed3ca6 100644
--- a/test/Transforms/TailCallElim/dup_tail.ll
+++ b/test/Transforms/TailCallElim/dup_tail.ll
@@ -1,5 +1,7 @@
 ; Duplicate the return into if.end to enable TCE.
-; RUN: opt %s -tailcallelim -stats -disable-output 2>&1 | grep "Number of return duplicated"
+; RUN: opt -tailcallelim -stats -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: Number of return duplicated
 
 define i32 @fib(i32 %n) nounwind ssp {
 entry:
diff --git a/test/Transforms/TailCallElim/intervening-inst.ll b/test/Transforms/TailCallElim/intervening-inst.ll
index 0c40bd5dc5..10dffbd694 100644
--- a/test/Transforms/TailCallElim/intervening-inst.ll
+++ b/test/Transforms/TailCallElim/intervening-inst.ll
@@ -1,5 +1,5 @@
 ; This function contains intervening instructions which should be moved out of the way
-; RUN: opt < %s -tailcallelim -S | not grep call
+; RUN: opt < %s -tailcallelim -S | FileCheck %s
 
 define i32 @Test(i32 %X) {
 entry:
@@ -10,6 +10,7 @@ then.0:		; preds = %entry
 	ret i32 %tmp.4
 endif.0:		; preds = %entry
 	%tmp.10 = add i32 %X, -1		; <i32> [#uses=1]
+; CHECK-NOT: call
 	%tmp.8 = call i32 @Test( i32 %tmp.10 )		; <i32> [#uses=1]
 	%DUMMY = add i32 %X, 1		; <i32> [#uses=0]
 	ret i32 %tmp.8
diff --git a/test/Transforms/TailCallElim/move_alloca_for_tail_call.ll b/test/Transforms/TailCallElim/move_alloca_for_tail_call.ll
index a556ddb6eb..741f5848bc 100644
--- a/test/Transforms/TailCallElim/move_alloca_for_tail_call.ll
+++ b/test/Transforms/TailCallElim/move_alloca_for_tail_call.ll
@@ -1,4 +1,4 @@
-; RUN: opt -tailcallelim %s -S | FileCheck %s
+; RUN: opt -tailcallelim -S < %s | FileCheck %s
 ; PR615
 
 declare void @bar(i32*)
diff --git a/test/Transforms/TailCallElim/nocapture.ll b/test/Transforms/TailCallElim/nocapture.ll
index 87cb9dd427..e49d87cc4b 100644
--- a/test/Transforms/TailCallElim/nocapture.ll
+++ b/test/Transforms/TailCallElim/nocapture.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -tailcallelim -S | FileCheck %s
+; RUN: opt -tailcallelim -S < %s | FileCheck %s
 ; XFAIL: *
 
 declare void @use(i8* nocapture, i8* nocapture)
diff --git a/test/Transforms/TailCallElim/reorder_load.ll b/test/Transforms/TailCallElim/reorder_load.ll
index 7f5c36e4a2..53c65dab10 100644
--- a/test/Transforms/TailCallElim/reorder_load.ll
+++ b/test/Transforms/TailCallElim/reorder_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | not grep call
+; RUN: opt < %s -tailcallelim -S | FileCheck %s
 ; PR4323
 
 ; Several cases where tail call elimination should move the load above the call,
@@ -21,6 +21,7 @@ if:		; preds = %entry
 
 else:		; preds = %entry
 	%tmp7 = add i32 %start_arg, 1		; <i32> [#uses=1]
+; CHECK-NOT: call
 	%tmp8 = call fastcc i32 @raise_load_1(i32* %a_arg, i32 %a_len_arg, i32 %tmp7)		; <i32> [#uses=1]
 	%tmp9 = load i32* %a_arg		; <i32> [#uses=1]
 	%tmp10 = add i32 %tmp9, %tmp8		; <i32> [#uses=1]
@@ -47,6 +48,7 @@ unwind:		; preds = %else
 
 recurse:		; preds = %else
 	%tmp7 = add i32 %start_arg, 1		; <i32> [#uses=1]
+; CHECK-NOT: call
 	%tmp8 = call fastcc i32 @raise_load_2(i32* %a_arg, i32 %a_len_arg, i32 %tmp7)		; <i32> [#uses=1]
 	%tmp9 = load i32* @global		; <i32> [#uses=1]
 	%tmp10 = add i32 %tmp9, %tmp8		; <i32> [#uses=1]
@@ -66,6 +68,7 @@ if:		; preds = %entry
 
 else:		; preds = %entry
 	%tmp7 = add i32 %start_arg, 1		; <i32> [#uses=1]
+; CHECK-NOT: call
 	%tmp8 = call fastcc i32 @raise_load_3(i32* %a_arg, i32 %a_len_arg, i32 %tmp7)		; <i32> [#uses=1]
 	%tmp9 = load i32* @extern_weak_global		; <i32> [#uses=1]
 	%tmp10 = add i32 %tmp9, %tmp8		; <i32> [#uses=1]
@@ -94,6 +97,7 @@ unwind:		; preds = %else
 recurse:		; preds = %else
 	%tmp7 = add i32 %start_arg, 1		; <i32> [#uses=1]
 	%first = load i32* %a_arg		; <i32> [#uses=1]
+; CHECK-NOT: call
 	%tmp8 = call fastcc i32 @raise_load_4(i32* %a_arg, i32 %first, i32 %tmp7)		; <i32> [#uses=1]
 	%second = load i32* %a_arg		; <i32> [#uses=1]
 	%tmp10 = add i32 %second, %tmp8		; <i32> [#uses=1]
diff --git a/test/Transforms/TailCallElim/return_constant.ll b/test/Transforms/TailCallElim/return_constant.ll
index 48e5641bb5..e99e57e145 100644
--- a/test/Transforms/TailCallElim/return_constant.ll
+++ b/test/Transforms/TailCallElim/return_constant.ll
@@ -1,7 +1,7 @@
 ; Though this case seems to be fairly unlikely to occur in the wild, someone
 ; plunked it into the demo script, so maybe they care about it.
 ;
-; RUN: opt < %s -tailcallelim -S | not grep call
+; RUN: opt < %s -tailcallelim -S | FileCheck %s
 
 define i32 @aaa(i32 %c) {
 entry:
@@ -9,6 +9,7 @@ entry:
 	br i1 %tmp.1, label %return, label %else
 else:		; preds = %entry
 	%tmp.5 = add i32 %c, -1		; <i32> [#uses=1]
+; CHECK-NOT: call
 	%tmp.3 = call i32 @aaa( i32 %tmp.5 )		; <i32> [#uses=0]
 	ret i32 0
 return:		; preds = %entry
diff --git a/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll b/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll
index 3d01d17099..7049e4d588 100644
--- a/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll
+++ b/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll
@@ -1,11 +1,11 @@
-; RUN: opt < %s -tailcallelim -S | \
-; RUN:    grep "tail call void @foo"
+; RUN: opt < %s -tailcallelim -S | FileCheck %s
 
 
 declare void @foo()
 
 define void @bar() {
-	call void @foo( )
+; CHECK: tail call void @foo()
+	call void @foo()
 	ret void
 }