3 files changed, 688 insertions, 140 deletions
diff --git a/test/NaCl/Bitcode/bitcast-elide.ll b/test/NaCl/Bitcode/bitcast-elide.ll
index eeee69ffef..383673d684 100644
--- a/test/NaCl/Bitcode/bitcast-elide.ll
+++ b/test/NaCl/Bitcode/bitcast-elide.ll
@@ -17,17 +17,19 @@
 
 ; ------------------------------------------------------
 
-@bytes = internal global [7 x i8] c"abcdefg"
+@bytes = internal global [4 x i8] c"abcd"
+
+; ------------------------------------------------------
 
 ; Test that we elide the simple case of global.
 define void @SimpleLoad() {
-  %1 = bitcast [7 x i8]* @bytes to i32*
+  %1 = bitcast [4 x i8]* @bytes to i32*
   %2 = load i32* %1, align 4
   ret void
 }
 
 ; TD1:      define void @SimpleLoad() {
-; TD1-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD1-NEXT:   %2 = load i32* %1, align 4
 ; TD1-NEXT:   ret void
 ; TD1-NEXT: }
@@ -40,7 +42,7 @@ define void @SimpleLoad() {
 ; PF1-NEXT:  </FUNCTION_BLOCK>
 
 ; TD2:      define void @SimpleLoad() {
-; TD2-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD2-NEXT:   %2 = load i32* %1, align 4
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
@@ -51,6 +53,8 @@ define void @SimpleLoad() {
 ; PF2-NEXT:    <INST_RET/>
 ; PF2-NEXT:  </FUNCTION_BLOCK>
 
+; ------------------------------------------------------
+
 ; Test that we elide the simple case of an alloca.
 define void @SimpleLoadAlloca() {
   %1 = alloca i8, i32 4, align 4
@@ -67,8 +71,6 @@ define void @SimpleLoadAlloca() {
 ; TD1-NEXT: }
 
 ; PF1:        <FUNCTION_BLOCK>
-; PF1-NEXT:     <DECLAREBLOCKS op0=1/>
-; PF1-NEXT:     <CONSTANTS_BLOCK
 ; PF1:          </CONSTANTS_BLOCK>
 ; PF1-NEXT:     <INST_ALLOCA op0=1 op1=3/>
 ; PF1-NEXT:     <INST_CAST op0=1 op1=1 op2=11/>
@@ -84,23 +86,23 @@ define void @SimpleLoadAlloca() {
 ; TD2-NEXT: }
 
 ; PF2:        <FUNCTION_BLOCK>
-; PF2-NEXT:     <DECLAREBLOCKS op0=1/>
-; PF2-NEXT:     <CONSTANTS_BLOCK
 ; PF2:          </CONSTANTS_BLOCK>
 ; PF2-NEXT:     <INST_ALLOCA op0=1 op1=3/>
 ; PF2-NEXT:     <INST_LOAD op0=1 op1=3 op2=0/>
 ; PF2-NEXT:     <INST_RET/>
 ; PF2-NEXT:   </FUNCTION_BLOCK>
 
+; ------------------------------------------------------
+
 ; Test that we don't elide an bitcast if one of its uses is not a load.
 define i32* @NonsimpleLoad(i32 %i) {
-  %1 = bitcast [7 x i8]* @bytes to i32*       
+  %1 = bitcast [4 x i8]* @bytes to i32*       
   %2 = load i32* %1, align 4
   ret i32* %1
 }
 
 ; TD1:      define i32* @NonsimpleLoad(i32 %i) {
-; TD1-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD1-NEXT:   %2 = load i32* %1, align 4
 ; TD1-NEXT:   ret i32* %1
 ; TD1-NEXT: }
@@ -113,7 +115,7 @@ define i32* @NonsimpleLoad(i32 %i) {
 ; PF1:       </FUNCTION_BLOCK>
 
 ; TD2:      define i32* @NonsimpleLoad(i32 %i) {
-; TD2-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD2-NEXT:   %2 = load i32* %1, align 4
 ; TD2-NEXT:   ret i32* %1
 ; TD2-NEXT: }
@@ -125,20 +127,22 @@ define i32* @NonsimpleLoad(i32 %i) {
 ; PF2-NEXT:    <INST_RET op0=2/>
 ; PF2:       </FUNCTION_BLOCK>
 
+; ------------------------------------------------------
+
 ; Test that we can handle multiple bitcasts.
 define i32 @TwoLoads(i32 %i) {
-  %1 = bitcast [7 x i8]* @bytes to i32*       
+  %1 = bitcast [4 x i8]* @bytes to i32*       
   %2 = load i32* %1, align 4
-  %3 = bitcast [7 x i8]* @bytes to i32*       
+  %3 = bitcast [4 x i8]* @bytes to i32*       
   %4 = load i32* %3, align 4
   %5 = add i32 %2, %4
   ret i32 %5
 }
 
 ; TD1:      define i32 @TwoLoads(i32 %i) {
-; TD1-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD1-NEXT:   %2 = load i32* %1, align 4
-; TD1-NEXT:   %3 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT:   %3 = bitcast [4 x i8]* @bytes to i32*
 ; TD1-NEXT:   %4 = load i32* %3, align 4
 ; TD1-NEXT:   %5 = add i32 %2, %4
 ; TD1-NEXT:   ret i32 %5
@@ -155,12 +159,11 @@ define i32 @TwoLoads(i32 %i) {
 ; PF1:       </FUNCTION_BLOCK>
 
 ; TD2:      define i32 @TwoLoads(i32 %i) {
-; TD2-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD2-NEXT:   %2 = load i32* %1, align 4
-; TD2-NEXT:   %3 = bitcast [7 x i8]* @bytes to i32*
-; TD2-NEXT:   %4 = load i32* %3, align 4
-; TD2-NEXT:   %5 = add i32 %2, %4
-; TD2-NEXT:   ret i32 %5
+; TD2-NEXT:   %3 = load i32* %1, align 4
+; TD2-NEXT:   %4 = add i32 %2, %3
+; TD2-NEXT:   ret i32 %4
 ; TD2-NEXT: }
 
 ; PF2:       <FUNCTION_BLOCK>
@@ -171,17 +174,20 @@ define i32 @TwoLoads(i32 %i) {
 ; PF2-NEXT:    <INST_RET op0=1/>
 ; PF2:       </FUNCTION_BLOCK>
 
-; Test how we duplicate bitcasts, even if optimized in the input file.
-define i32 @TwoLoadOpt(i32 %i) {
-  %1 = bitcast [7 x i8]* @bytes to i32*       
+; ------------------------------------------------------
+
+; Test how we handle bitcasts if optimized in the input file.  This
+; case tests within a single block.
+define i32 @TwoLoadOptOneBlock(i32 %i) {
+  %1 = bitcast [4 x i8]* @bytes to i32*       
   %2 = load i32* %1, align 4
   %3 = load i32* %1, align 4
   %4 = add i32 %2, %3
   ret i32 %4
 }
 
-; TD1:      define i32 @TwoLoadOpt(i32 %i) {
-; TD1-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1:      define i32 @TwoLoadOptOneBlock(i32 %i) {
+; TD1-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD1-NEXT:   %2 = load i32* %1, align 4
 ; TD1-NEXT:   %3 = load i32* %1, align 4
 ; TD1-NEXT:   %4 = add i32 %2, %3
@@ -197,13 +203,12 @@ define i32 @TwoLoadOpt(i32 %i) {
 ; PF1-NEXT:    <INST_RET op0=1/>
 ; PF1:       </FUNCTION_BLOCK>
 
-; TD2:      define i32 @TwoLoadOpt(i32 %i) {
-; TD2-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2:      define i32 @TwoLoadOptOneBlock(i32 %i) {
+; TD2-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD2-NEXT:   %2 = load i32* %1, align 4
-; TD2-NEXT:   %3 = bitcast [7 x i8]* @bytes to i32*
-; TD2-NEXT:   %4 = load i32* %3, align 4
-; TD2-NEXT:   %5 = add i32 %2, %4
-; TD2-NEXT:   ret i32 %5
+; TD2-NEXT:   %3 = load i32* %1, align 4
+; TD2-NEXT:   %4 = add i32 %2, %3
+; TD2-NEXT:   ret i32 %4
 ; TD2-NEXT: }
 
 ; PF2:       <FUNCTION_BLOCK>
@@ -214,15 +219,87 @@ define i32 @TwoLoadOpt(i32 %i) {
 ; PF2-NEXT:    <INST_RET op0=1/>
 ; PF2:       </FUNCTION_BLOCK>
 
+; ------------------------------------------------------
+
+; Test how we handle bitcasts if optimized in the input file.  This
+; case tests across blocks.
+define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+  %1 = bitcast [4 x i8]* @bytes to i32*       
+  %2 = load i32* %1, align 4
+  %3 = load i32* %1, align 4
+  %4 = add i32 %2, %3
+  br label %BB
+
+BB:
+  %5 = load i32* %1, align 4
+  %6 = load i32* %1, align 4
+  %7 = add i32 %5, %6
+  ret i32 %4
+}
+
+; TD1:      define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+; TD1-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
+; TD1-NEXT:   %2 = load i32* %1, align 4
+; TD1-NEXT:   %3 = load i32* %1, align 4
+; TD1-NEXT:   %4 = add i32 %2, %3
+; TD1-NEXT:   br label %BB
+; TD1:      BB:
+; TD1-NEXT:   %5 = load i32* %1, align 4
+; TD1-NEXT:   %6 = load i32* %1, align 4
+; TD1-NEXT:   %7 = add i32 %5, %6
+; TD1-NEXT:   ret i32 %4
+; TD1-NEXT: }
+
+; PF1:        <FUNCTION_BLOCK>
+; PF1-NEXT:     <DECLAREBLOCKS op0=2/>
+; PF1-NEXT:     <INST_CAST op0=2 op1=1 op2=11/>
+; PF1-NEXT:     <INST_LOAD op0=1 op1=3 op2=0/>
+; PF1-NEXT:     <INST_LOAD op0=2 op1=3 op2=0/>
+; PF1-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT:     <INST_BR op0=1/>
+; PF1-NEXT:     <INST_LOAD op0=4 op1=3 op2=0/>
+; PF1-NEXT:     <INST_LOAD op0=5 op1=3 op2=0/>
+; PF1-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT:     <INST_RET op0=4/>
+; PF1:        </FUNCTION_BLOCK>
+
+; TD2:      define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+; TD2-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT:   %2 = load i32* %1, align 4
+; TD2-NEXT:   %3 = load i32* %1, align 4
+; TD2-NEXT:   %4 = add i32 %2, %3
+; TD2-NEXT:   br label %BB
+; TD2:      BB:
+; TD2-NEXT:   %5 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT:   %6 = load i32* %5, align 4
+; TD2-NEXT:   %7 = load i32* %5, align 4
+; TD2-NEXT:   %8 = add i32 %6, %7
+; TD2-NEXT:   ret i32 %4
+; TD2-NEXT: }
+
+; PF2:        <FUNCTION_BLOCK>
+; PF2-NEXT:     <DECLAREBLOCKS op0=2/>
+; PF2-NEXT:     <INST_LOAD op0=2 op1=3 op2=0/>
+; PF2-NEXT:     <INST_LOAD op0=3 op1=3 op2=0/>
+; PF2-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF2-NEXT:     <INST_BR op0=1/>
+; PF2-NEXT:     <INST_LOAD op0=5 op1=3 op2=0/>
+; PF2-NEXT:     <INST_LOAD op0=6 op1=3 op2=0/>
+; PF2-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF2-NEXT:     <INST_RET op0=4/>
+; PF2:        </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
 ; Test that we elide the simple case of bitcast for a store.
 define void @SimpleStore(i32 %i) {
-  %1 = bitcast [7 x i8]* @bytes to i32*
+  %1 = bitcast [4 x i8]* @bytes to i32*
   store i32 %i, i32* %1, align 4
   ret void
 }
 
 ; TD1:      define void @SimpleStore(i32 %i) {
-; TD1-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD1-NEXT:   store i32 %i, i32* %1, align 4
 ; TD1-NEXT:   ret void
 ; TD1-NEXT: }
@@ -235,7 +312,7 @@ define void @SimpleStore(i32 %i) {
 ; PF1:        </FUNCTION_BLOCK>
 
 ; TD2:      define void @SimpleStore(i32 %i) {
-; TD2-NEXT:   %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2-NEXT:   %1 = bitcast [4 x i8]* @bytes to i32*
 ; TD2-NEXT:   store i32 %i, i32* %1, align 4
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
diff --git a/test/NaCl/Bitcode/inttoptr-elide.ll b/test/NaCl/Bitcode/inttoptr-elide.ll
index 029f67adef..679f5f1d47 100644
--- a/test/NaCl/Bitcode/inttoptr-elide.ll
+++ b/test/NaCl/Bitcode/inttoptr-elide.ll
@@ -118,13 +118,11 @@ define i32 @TwoLoads(i32 %i) {
 ; TD2:      define i32 @TwoLoads(i32 %i) {
 ; TD2-NEXT:   %1 = inttoptr i32 %i to i32*
 ; TD2-NEXT:   %2 = load i32* %1, align 4
-; TD2-NEXT:   %3 = inttoptr i32 %i to i32*
-; TD2-NEXT:   %4 = load i32* %3, align 4
-; TD2-NEXT:   %5 = add i32 %2, %4
-; TD2-NEXT:   ret i32 %5
+; TD2-NEXT:   %3 = load i32* %1, align 4
+; TD2-NEXT:   %4 = add i32 %2, %3
+; TD2-NEXT:   ret i32 %4
 ; TD2-NEXT: }
 
-
 ; PF2:       <FUNCTION_BLOCK>
 ; PF2-NEXT:    <DECLAREBLOCKS op0=1/>
 ; PF2-NEXT:    <INST_LOAD op0=1 op1=3 op2=0/>
@@ -135,8 +133,9 @@ define i32 @TwoLoads(i32 %i) {
 
 ; ------------------------------------------------------
 
-; Test how we duplicate inttoptrs, even if optimized in the input file.
-define i32 @TwoLoadOpt(i32 %i) {
+; Test how we handle inttoptrs if optimized in the input file.  This
+; case tests within a single block.
+define i32 @TwoLoadOptOneBlock(i32 %i) {
   %1 = inttoptr i32 %i to i32*
   %2 = load i32* %1, align 4
   %3 = load i32* %1, align 4
@@ -144,7 +143,7 @@ define i32 @TwoLoadOpt(i32 %i) {
   ret i32 %4
 }
 
-; TD1:      define i32 @TwoLoadOpt(i32 %i) {
+; TD1:      define i32 @TwoLoadOptOneBlock(i32 %i) {
 ; TD1-NEXT:   %1 = inttoptr i32 %i to i32*
 ; TD1-NEXT:   %2 = load i32* %1, align 4
 ; TD1-NEXT:   %3 = load i32* %1, align 4
@@ -161,13 +160,12 @@ define i32 @TwoLoadOpt(i32 %i) {
 ; PF1-NEXT:    <INST_RET op0=1/>
 ; PF1:       </FUNCTION_BLOCK>
 
-; TD2:      define i32 @TwoLoadOpt(i32 %i) {
+; TD2:      define i32 @TwoLoadOptOneBlock(i32 %i) {
 ; TD2-NEXT:   %1 = inttoptr i32 %i to i32*
 ; TD2-NEXT:   %2 = load i32* %1, align 4
-; TD2-NEXT:   %3 = inttoptr i32 %i to i32*
-; TD2-NEXT:   %4 = load i32* %3, align 4
-; TD2-NEXT:   %5 = add i32 %2, %4
-; TD2-NEXT:   ret i32 %5
+; TD2-NEXT:   %3 = load i32* %1, align 4
+; TD2-NEXT:   %4 = add i32 %2, %3
+; TD2-NEXT:   ret i32 %4
 ; TD2-NEXT: }
 
 ; PF2:       <FUNCTION_BLOCK>
@@ -180,6 +178,76 @@ define i32 @TwoLoadOpt(i32 %i) {
 
 ; ------------------------------------------------------
 
+; Test how we handle inttoptrs if optimized in the input file.  This
+; case tests across blocks.
+define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+  %1 = inttoptr i32 %i to i32*
+  %2 = load i32* %1, align 4
+  %3 = load i32* %1, align 4
+  %4 = add i32 %2, %3
+  br label %BB
+
+BB:
+  %5 = load i32* %1, align 4
+  %6 = load i32* %1, align 4
+  %7 = add i32 %5, %6
+  ret i32 %7
+}
+
+; TD1:      define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+; TD1-NEXT:   %1 = inttoptr i32 %i to i32*
+; TD1-NEXT:   %2 = load i32* %1, align 4
+; TD1-NEXT:   %3 = load i32* %1, align 4
+; TD1-NEXT:   %4 = add i32 %2, %3
+; TD1-NEXT:   br label %BB
+; TD1:      BB:
+; TD1-NEXT:   %5 = load i32* %1, align 4
+; TD1-NEXT:   %6 = load i32* %1, align 4
+; TD1-NEXT:   %7 = add i32 %5, %6
+; TD1-NEXT:   ret i32 %7
+; TD1-NEXT: }
+
+; PF1:        <FUNCTION_BLOCK>
+; PF1-NEXT:     <DECLAREBLOCKS op0=2/>
+; PF1-NEXT:     <INST_CAST op0=1 op1=1 op2=10/>
+; PF1-NEXT:     <INST_LOAD op0=1 op1=3 op2=0/>
+; PF1-NEXT:     <INST_LOAD op0=2 op1=3 op2=0/>
+; PF1-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT:     <INST_BR op0=1/>
+; PF1-NEXT:     <INST_LOAD op0=4 op1=3 op2=0/>
+; PF1-NEXT:     <INST_LOAD op0=5 op1=3 op2=0/>
+; PF1-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT:     <INST_RET op0=1/>
+; PF1:        </FUNCTION_BLOCK>
+
+; TD2:      define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+; TD2-NEXT:   %1 = inttoptr i32 %i to i32*
+; TD2-NEXT:   %2 = load i32* %1, align 4
+; TD2-NEXT:   %3 = load i32* %1, align 4
+; TD2-NEXT:   %4 = add i32 %2, %3
+; TD2-NEXT:   br label %BB
+; TD2:      BB:
+; TD2-NEXT:   %5 = inttoptr i32 %i to i32*
+; TD2-NEXT:   %6 = load i32* %5, align 4
+; TD2-NEXT:   %7 = load i32* %5, align 4
+; TD2-NEXT:   %8 = add i32 %6, %7
+; TD2-NEXT:   ret i32 %8
+; TD2-NEXT: }
+
+; PF2:        <FUNCTION_BLOCK>
+; PF2-NEXT:     <DECLAREBLOCKS op0=2/>
+; PF2-NEXT:     <INST_LOAD op0=1 op1=3 op2=0/>
+; PF2-NEXT:     <INST_LOAD op0=2 op1=3 op2=0/>
+; PF2-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF2-NEXT:     <INST_BR op0=1/>
+; PF2-NEXT:     <INST_LOAD op0=4 op1=3 op2=0/>
+; PF2-NEXT:     <INST_LOAD op0=5 op1=3 op2=0/>
+; PF2-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF2-NEXT:     <INST_RET op0=1/>
+; PF2:        </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
 ; Test that we elide the simple case of inttoptr for a store.
 define void @SimpleStore(i32 %i) {
   %1 = inttoptr i32 %i to i32*
@@ -210,4 +278,4 @@ define void @SimpleStore(i32 %i) {
 ; PF2-NEXT:   <DECLAREBLOCKS op0=1/>
 ; PF2-NEXT:   <INST_STORE op0=1 op1=1 op2=3/>
 ; PF2-NEXT:   <INST_RET/>
-; PF2T:     </FUNCTION_BLOCK>
+; PF2:      </FUNCTION_BLOCK>
diff --git a/test/NaCl/Bitcode/ptrtoint-elide.ll b/test/NaCl/Bitcode/ptrtoint-elide.ll
index 10504a8577..43a82a0802 100644
--- a/test/NaCl/Bitcode/ptrtoint-elide.ll
+++ b/test/NaCl/Bitcode/ptrtoint-elide.ll
@@ -153,8 +153,8 @@ define void @AllocCastDelete() {
 ; ------------------------------------------------------
 
 ; Show case where we have optimized the ptrtoint (and bitcast) into a
-; single instruction, but will get duplicated after reading back the
-; bitcode file, since we insert elided casts immediately before each use.
+; single instruction, and the elided cast will only be inserted before
+; the first use in the block when the bitcode file is read back.
 define void @AllocCastOpt() {
   %1 = alloca i8, i32 4, align 8
   %2 = bitcast [4 x i8]* @bytes to i32*
@@ -177,7 +177,7 @@ define void @AllocCastOpt() {
 ; PF1:          </CONSTANTS_BLOCK>
 ; PF1-NEXT:     <INST_ALLOCA op0=1 op1=4/>
 ; PF1-NEXT:     <INST_CAST op0=3 op1=4 op2=11/>
-; PF1-NEXT:     <INST_CAST  op0=2 op1=0 op2=9/>
+; PF1-NEXT:     <INST_CAST op0=2 op1=0 op2=9/>
 ; PF1-NEXT:     <INST_STORE op0=2 op1=1 op2=1 op3=0/>
 ; PF1-NEXT:     <INST_STORE op0=2 op1=1 op2=1 op3=0/>
 ; PF1-NEXT:     <INST_RET/>
@@ -188,9 +188,7 @@ define void @AllocCastOpt() {
 ; TD2-NEXT:   %2 = ptrtoint i8* %1 to i32
 ; TD2-NEXT:   %3 = bitcast [4 x i8]* @bytes to i32*
 ; TD2-NEXT:   store i32 %2, i32* %3, align 1
-; TD2-NEXT:   %4 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %5 = bitcast [4 x i8]* @bytes to i32*
-; TD2-NEXT:   store i32 %4, i32* %5, align 1
+; TD2-NEXT:   store i32 %2, i32* %3, align 1
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
 
@@ -366,7 +364,6 @@ define i32 @StoreGlobalMovePtr2Int() {
 ; PF1-NEXT:     <INST_RET op0=4/>
 ; PF1-NEXT:   </FUNCTION_BLOCK>
 
-
 ; TD2:      define i32 @StoreGlobalMovePtr2Int() {
 ; TD2-NEXT:   %1 = alloca i8, i32 4, align 8
 ; TD2-NEXT:   %2 = ptrtoint [4 x i8]* @bytes to i32
@@ -430,11 +427,8 @@ define void @CastAddAlloca() {
 ; TD2-NEXT:   %2 = add i32 1, 2
 ; TD2-NEXT:   %3 = ptrtoint i8* %1 to i32
 ; TD2-NEXT:   %4 = add i32 %3, 2
-; TD2-NEXT:   %5 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %6 = add i32 1, %5
-; TD2-NEXT:   %7 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %8 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %9 = add i32 %7, %8
+; TD2-NEXT:   %5 = add i32 1, %3
+; TD2-NEXT:   %6 = add i32 %3, %3
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
 
@@ -491,11 +485,8 @@ define void @CastAddGlobal() {
 ; TD2-NEXT:   %1 = add i32 1, 2
 ; TD2-NEXT:   %2 = ptrtoint [4 x i8]* @bytes to i32
 ; TD2-NEXT:   %3 = add i32 %2, 2
-; TD2-NEXT:   %4 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %5 = add i32 1, %4
-; TD2-NEXT:   %6 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %7 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %8 = add i32 %6, %7
+; TD2-NEXT:   %4 = add i32 1, %2
+; TD2-NEXT:   %5 = add i32 %2, %2
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
 
@@ -571,36 +562,16 @@ define void @CastBinop() {
 ; TD2-NEXT:   %2 = ptrtoint i8* %1 to i32
 ; TD2-NEXT:   %3 = ptrtoint [4 x i8]* @bytes to i32
 ; TD2-NEXT:   %4 = sub i32 %2, %3
-; TD2-NEXT:   %5 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %6 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %7 = mul i32 %5, %6
-; TD2-NEXT:   %8 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %9 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %10 = udiv i32 %8, %9
-; TD2-NEXT:   %11 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %12 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %13 = urem i32 %11, %12
-; TD2-NEXT:   %14 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %15 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %16 = srem i32 %14, %15
-; TD2-NEXT:   %17 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %18 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %19 = shl i32 %17, %18
-; TD2-NEXT:   %20 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %21 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %22 = lshr i32 %20, %21
-; TD2-NEXT:   %23 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %24 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %25 = ashr i32 %23, %24
-; TD2-NEXT:   %26 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %27 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %28 = and i32 %26, %27
-; TD2-NEXT:   %29 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %30 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %31 = or i32 %29, %30
-; TD2-NEXT:   %32 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %33 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %34 = xor i32 %32, %33
+; TD2-NEXT:   %5 = mul i32 %2, %3
+; TD2-NEXT:   %6 = udiv i32 %2, %3
+; TD2-NEXT:   %7 = urem i32 %2, %3
+; TD2-NEXT:   %8 = srem i32 %2, %3
+; TD2-NEXT:   %9 = shl i32 %2, %3
+; TD2-NEXT:   %10 = lshr i32 %2, %3
+; TD2-NEXT:   %11 = ashr i32 %2, %3
+; TD2-NEXT:   %12 = and i32 %2, %3
+; TD2-NEXT:   %13 = or i32 %2, %3
+; TD2-NEXT:   %14 = xor i32 %2, %3
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
 
@@ -666,16 +637,16 @@ define void @TestCasts() {
 ; PF1:          </CONSTANTS_BLOCK>
 ; PF1-NEXT:     <INST_ALLOCA op0=2 op1=4/>
 ; PF1-NEXT:     <INST_CAST op0=1 op1=0 op2=9/>
-; PF1-NEXT:     <INST_CAST op0=6 op1=1 op2=0/>
-; PF1-NEXT:     <INST_CAST op0=2 op1=1 op2=0/>
-; PF1-NEXT:     <INST_CAST op0=8 op1=10 op2=1/>
-; PF1-NEXT:     <INST_CAST op0=4 op1=10 op2=1/>
-; PF1-NEXT:     <INST_CAST op0=9 op1=10 op2=2/>
-; PF1-NEXT:     <INST_CAST op0=6 op1=10 op2=2/>
-; PF1-NEXT:     <INST_CAST op0=9 op1=11 op2=5/>
-; PF1-NEXT:     <INST_CAST op0=8 op1=11 op2=5/>
-; PF1-NEXT:     <INST_CAST op0=13 op1=11 op2=6/>
-; PF1-NEXT:     <INST_CAST op0=10 op1=11 op2=6/>
+; PF1-NEXT:     <INST_CAST op0=6 op1=2 op2=0/>
+; PF1-NEXT:     <INST_CAST op0=2 op1=2 op2=0/>
+; PF1-NEXT:     <INST_CAST op0=8 op1=13 op2=1/>
+; PF1-NEXT:     <INST_CAST op0=4 op1=13 op2=1/>
+; PF1-NEXT:     <INST_CAST op0=9 op1=13 op2=2/>
+; PF1-NEXT:     <INST_CAST op0=6 op1=13 op2=2/>
+; PF1-NEXT:     <INST_CAST op0=9 op1=14 op2=5/>
+; PF1-NEXT:     <INST_CAST op0=8 op1=14 op2=5/>
+; PF1-NEXT:     <INST_CAST op0=13 op1=14 op2=6/>
+; PF1-NEXT:     <INST_CAST op0=10 op1=14 op2=6/>
 ; PF1-NEXT:     <INST_RET/>
 ; PF1-NEXT:   </FUNCTION_BLOCK>
 
@@ -685,33 +656,29 @@ define void @TestCasts() {
 ; TD2-NEXT:   %3 = ptrtoint i8* %1 to i32
 ; TD2-NEXT:   %4 = trunc i32 %3 to i8
 ; TD2-NEXT:   %5 = zext i32 257 to i64
-; TD2-NEXT:   %6 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %7 = zext i32 %6 to i64
-; TD2-NEXT:   %8 = sext i32 -1 to i64
-; TD2-NEXT:   %9 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %10 = sext i32 %9 to i64
-; TD2-NEXT:   %11 = uitofp i32 1 to float
-; TD2-NEXT:   %12 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %13 = uitofp i32 %12 to float
-; TD2-NEXT:   %14 = sitofp i32 -1 to float
-; TD2-NEXT:   %15 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %16 = sitofp i32 %15 to float
+; TD2-NEXT:   %6 = zext i32 %3 to i64
+; TD2-NEXT:   %7 = sext i32 -1 to i64
+; TD2-NEXT:   %8 = sext i32 %3 to i64
+; TD2-NEXT:   %9 = uitofp i32 1 to float
+; TD2-NEXT:   %10 = uitofp i32 %3 to float
+; TD2-NEXT:   %11 = sitofp i32 -1 to float
+; TD2-NEXT:   %12 = sitofp i32 %3 to float
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
 
 ; PF2:        <FUNCTION_BLOCK>
 ; PF2:          </CONSTANTS_BLOCK>
 ; PF2-NEXT:     <INST_ALLOCA op0=2 op1=4/>
-; PF2-NEXT:     <INST_CAST op0=5 op1=1 op2=0/>
-; PF2-NEXT:     <INST_CAST op0=2 op1=1 op2=0/>
-; PF2-NEXT:     <INST_CAST op0=7 op1=10 op2=1/>
-; PF2-NEXT:     <INST_CAST op0=4 op1=10 op2=1/>
-; PF2-NEXT:     <INST_CAST op0=8 op1=10 op2=2/>
-; PF2-NEXT:     <INST_CAST op0=6 op1=10 op2=2/>
-; PF2-NEXT:     <INST_CAST op0=8 op1=11 op2=5/>
-; PF2-NEXT:     <INST_CAST op0=8 op1=11 op2=5/>
-; PF2-NEXT:     <INST_CAST op0=12 op1=11 op2=6/>
-; PF2-NEXT:     <INST_CAST op0=10 op1=11 op2=6/>
+; PF2-NEXT:     <INST_CAST op0=5 op1=2 op2=0/>
+; PF2-NEXT:     <INST_CAST op0=2 op1=2 op2=0/>
+; PF2-NEXT:     <INST_CAST op0=7 op1=13 op2=1/>
+; PF2-NEXT:     <INST_CAST op0=4 op1=13 op2=1/>
+; PF2-NEXT:     <INST_CAST op0=8 op1=13 op2=2/>
+; PF2-NEXT:     <INST_CAST op0=6 op1=13 op2=2/>
+; PF2-NEXT:     <INST_CAST op0=8 op1=14 op2=5/>
+; PF2-NEXT:     <INST_CAST op0=8 op1=14 op2=5/>
+; PF2-NEXT:     <INST_CAST op0=12 op1=14 op2=6/>
+; PF2-NEXT:     <INST_CAST op0=10 op1=14 op2=6/>
 ; PF2-NEXT:     <INST_RET/>
 ; PF2-NEXT:   </FUNCTION_BLOCK>
 
@@ -741,7 +708,7 @@ define void @TestSavedPtrToInt() {
 ; PF1-NEXT:     <INST_ALLOCA op0=2 op1=4/>
 ; PF1-NEXT:     <INST_CAST op0=1 op1=0 op2=9/>
 ; PF1-NEXT:     <INST_BINOP op0=1 op1=3 op2=0/>
-; PF1-NEXT:     <INST_CALL op0=0 op1=22 op2=2/>
+; PF1-NEXT:     <INST_CALL op0=0 op1=26 op2=2/>
 ; PF1-NEXT:     <INST_RET/>
 ; PF1-NEXT:   </FUNCTION_BLOCK>
 
@@ -758,7 +725,7 @@ define void @TestSavedPtrToInt() {
 ; PF2-NEXT:     <INST_ALLOCA op0=2 op1=4/>
 ; PF2-NEXT:     <INST_CAST op0=1 op1=0 op2=9/>
 ; PF2-NEXT:     <INST_BINOP op0=1 op1=3 op2=0/>
-; PF2-NEXT:     <INST_CALL op0=0 op1=22 op2=2/>
+; PF2-NEXT:     <INST_CALL op0=0 op1=26 op2=2/>
 ; PF2-NEXT:     <INST_RET/>
 ; PF2-NEXT:   </FUNCTION_BLOCK>
 
@@ -809,12 +776,8 @@ define void @CastIcmp() {
 ; TD2-NEXT:   %4 = icmp eq i32 %3, 2
 ; TD2-NEXT:   %5 = ptrtoint [4 x i8]* @bytes to i32
 ; TD2-NEXT:   %6 = icmp eq i32 1, %5
-; TD2-NEXT:   %7 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %8 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %9 = icmp eq i32 %7, %8
-; TD2-NEXT:   %10 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %11 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %12 = icmp eq i32 %10, %11
+; TD2-NEXT:   %7 = icmp eq i32 %3, %5
+; TD2-NEXT:   %8 = icmp eq i32 %5, %3
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
 
@@ -876,12 +839,8 @@ define void @CastSelect() {
 ; TD2-NEXT:   %4 = select i1 true, i32 %3, i32 2
 ; TD2-NEXT:   %5 = ptrtoint [4 x i8]* @bytes to i32
 ; TD2-NEXT:   %6 = select i1 true, i32 1, i32 %5
-; TD2-NEXT:   %7 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %8 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %9 = select i1 true, i32 %7, i32 %8
-; TD2-NEXT:   %10 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT:   %11 = ptrtoint i8* %1 to i32
-; TD2-NEXT:   %12 = select i1 true, i32 %10, i32 %11
+; TD2-NEXT:   %7 = select i1 true, i32 %3, i32 %5
+; TD2-NEXT:   %8 = select i1 true, i32 %5, i32 %3
 ; TD2-NEXT:   ret void
 ; TD2-NEXT: }
 
@@ -895,3 +854,447 @@ define void @CastSelect() {
 ; PF2-NEXT:     <INST_VSELECT op0=10 op1=5 op2=6/>
 ; PF2-NEXT:     <INST_RET/>
 ; PF2-NEXT:   </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
+; Show that if a phi node refers to a pointer cast, we add the cast
+; at the end of each incoming block.
+define void @PhiBackwardRefs(i1) {
+  %2 = alloca i8, i32 4, align 8
+  %3 = bitcast i8* %2 to i32*
+  %4 = alloca i8, i32 4, align 8
+  %5 = ptrtoint i8* %4 to i32
+  br i1 %0, label %true, label %false
+
+true:
+  %6 = load i32* %3
+  br label %merge
+
+false:
+  %7 = load i32* %3
+  br label %merge
+
+merge:
+  %8 = phi i32 [%5, %true], [%5, %false]
+  %9 = phi i32 [%6, %true], [%7, %false]
+  ret void
+}
+
+; TD1:      define void @PhiBackwardRefs(i1) {
+; TD1-NEXT:   %2 = alloca i8, i32 4, align 8
+; TD1-NEXT:   %3 = bitcast i8* %2 to i32*
+; TD1-NEXT:   %4 = alloca i8, i32 4, align 8
+; TD1-NEXT:   %5 = ptrtoint i8* %4 to i32
+; TD1-NEXT:   br i1 %0, label %true, label %false
+; TD1:      true:
+; TD1-NEXT:   %6 = load i32* %3
+; TD1-NEXT:   br label %merge
+; TD1:      false:
+; TD1-NEXT:   %7 = load i32* %3
+; TD1-NEXT:   br label %merge
+; TD1:      merge:
+; TD1-NEXT:   %8 = phi i32 [ %5, %true ], [ %5, %false ]
+; TD1-NEXT:   %9 = phi i32 [ %6, %true ], [ %7, %false ]
+; TD1-NEXT:   ret void
+; TD1-NEXT: }
+
+; PF1:        <FUNCTION_BLOCK>
+; PF1:          </CONSTANTS_BLOCK>
+; PF1-NEXT:     <INST_ALLOCA op0=1 op1=4/>
+; PF1-NEXT:     <INST_CAST op0=1 op1=4 op2=11/>
+; PF1-NEXT:     <INST_ALLOCA op0=3 op1=4/>
+; PF1-NEXT:     <INST_CAST op0=1 op1=0 op2=9/>
+; PF1-NEXT:     <INST_BR op0=1 op1=2 op2=6/>
+; PF1-NEXT:     <INST_LOAD op0=3 op1=0 op2=0/>
+; PF1-NEXT:     <INST_BR op0=3/>
+; PF1-NEXT:     <INST_LOAD op0=4 op1=0 op2=0/>
+; PF1-NEXT:     <INST_BR op0=3/>
+; PF1-NEXT:     <INST_PHI op0=0 op1=6 op2=1 op3=6 op4=2/>
+; PF1-NEXT:     <INST_PHI op0=0 op1=6 op2=1 op3=4 op4=2/>
+; PF1-NEXT:     <INST_RET/>
+; PF1:        </FUNCTION_BLOCK>
+
+; TD2:      define void @PhiBackwardRefs(i1) {
+; TD2-NEXT:   %2 = alloca i8, i32 4, align 8
+; TD2-NEXT:   %3 = alloca i8, i32 4, align 8
+; TD2-NEXT:   br i1 %0, label %true, label %false
+; TD2:      true:
+; TD2-NEXT:   %4 = bitcast i8* %2 to i32*
+; TD2-NEXT:   %5 = load i32* %4
+; TD2-NEXT:   %6 = ptrtoint i8* %3 to i32
+; TD2-NEXT:   br label %merge
+; TD2:      false:
+; TD2-NEXT:   %7 = bitcast i8* %2 to i32*
+; TD2-NEXT:   %8 = load i32* %7
+; TD2-NEXT:   %9 = ptrtoint i8* %3 to i32
+; TD2-NEXT:   br label %merge
+; TD2:      merge:
+; TD2-NEXT:   %10 = phi i32 [ %6, %true ], [ %9, %false ]
+; TD2-NEXT:   %11 = phi i32 [ %5, %true ], [ %8, %false ]
+; TD2-NEXT:   ret void
+; TD2-NEXT: }
+
+; PF2:        <FUNCTION_BLOCK>
+; PF2:          </CONSTANTS_BLOCK>
+; PF2-NEXT:     <INST_ALLOCA op0=1 op1=4/>
+; PF2-NEXT:     <INST_ALLOCA op0=2 op1=4/>
+; PF2-NEXT:     <INST_BR op0=1 op1=2 op2=4/>
+; PF2-NEXT:     <INST_LOAD op0=2 op1=0 op2=0/>
+; PF2-NEXT:     <INST_BR op0=3/>
+; PF2-NEXT:     <INST_LOAD op0=3 op1=0 op2=0/>
+; PF2-NEXT:     <INST_BR op0=3/>
+; PF2-NEXT:     <INST_PHI op0=0 op1=6 op2=1 op3=6 op4=2/>
+; PF2-NEXT:     <INST_PHI op0=0 op1=6 op2=1 op3=4 op4=2/>
+; PF2-NEXT:     <INST_RET/>
+; PF2:        </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
+; Like PhiBackwardRefs except the phi nodes forward reference
+; instructions instead of backwards references.
+define void @PhiForwardRefs(i1) {
+  br label %start
+
+merge:
+  %2 = phi i32 [%9, %true], [%9, %false]
+  %3 = phi i32 [%4, %true], [%5, %false]
+  ret void
+
+true:
+  %4 = load i32* %7
+  br label %merge
+
+false:
+  %5 = load i32* %7
+  br label %merge
+
+start:
+  %6 = alloca i8, i32 4, align 8
+  %7 = bitcast i8* %6 to i32*
+  %8 = alloca i8, i32 4, align 8
+  %9 = ptrtoint i8* %8 to i32
+  br i1 %0, label %true, label %false
+}
+
+; TD1:      define void @PhiForwardRefs(i1) {
+; TD1-NEXT:   br label %start
+; TD1:      merge:
+; TD1-NEXT:   %2 = phi i32 [ %9, %true ], [ %9, %false ]
+; TD1-NEXT:   %3 = phi i32 [ %4, %true ], [ %5, %false ]
+; TD1-NEXT:   ret void
+; TD1:      true:
+; TD1-NEXT:   %4 = load i32* %7
+; TD1-NEXT:   br label %merge
+; TD1:      false:
+; TD1-NEXT:   %5 = load i32* %7
+; TD1-NEXT:   br label %merge
+; TD1:      start:
+; TD1-NEXT:   %6 = alloca i8, i32 4, align 8
+; TD1-NEXT:   %7 = bitcast i8* %6 to i32*
+; TD1-NEXT:   %8 = alloca i8, i32 4, align 8
+; TD1-NEXT:   %9 = ptrtoint i8* %8 to i32
+; TD1-NEXT:   br i1 %0, label %true, label %false
+; TD1-NEXT: }
+
+; PF1:        <FUNCTION_BLOCK>
+; PF1:          </CONSTANTS_BLOCK>
+; PF1-NEXT:     <INST_BR op0=4/>
+; PF1-NEXT:     <FORWARDTYPEREF op0=30 op1=0/>
+; PF1-NEXT:     <INST_PHI op0=0 op1=15 op2=2 op3=15 op4=3/>
+; PF1-NEXT:     <FORWARDTYPEREF op0=25 op1=0/>
+; PF1-NEXT:     <FORWARDTYPEREF op0=26 op1=0/>
+; PF1-NEXT:     <INST_PHI op0=0 op1=3 op2=2 op3=5 op4=3/>
+; PF1-NEXT:     <INST_RET/>
+; PF1-NEXT:     <FORWARDTYPEREF op0=28 op1=4/>
+; PF1-NEXT:     <INST_LOAD op0=4294967293 op1=0 op2=0/>
+; PF1-NEXT:     <INST_BR op0=1/>
+; PF1-NEXT:     <INST_LOAD op0=4294967294 op1=0 op2=0/>
+; PF1-NEXT:     <INST_BR op0=1/>
+; PF1-NEXT:     <INST_ALLOCA op0=5 op1=4/>
+; PF1-NEXT:     <INST_CAST op0=1 op1=4 op2=11/>
+; PF1-NEXT:     <INST_ALLOCA op0=7 op1=4/>
+; PF1-NEXT:     <INST_CAST op0=1 op1=0 op2=9/>
+; PF1-NEXT:     <INST_BR op0=2 op1=3 op2=10/>
+; PF1:        </FUNCTION_BLOCK>
+
+; TD2:      define void @PhiForwardRefs(i1) {
+; TD2-NEXT:   br label %start
+; TD2:      merge:
+; TD2-NEXT:   %2 = phi i32 [ %6, %true ], [ %9, %false ]
+; TD2-NEXT:   %3 = phi i32 [ %5, %true ], [ %8, %false ]
+; TD2-NEXT:   ret void
+; TD2:      true:
+; TD2-NEXT:   %4 = bitcast i8* %10 to i32*
+; TD2-NEXT:   %5 = load i32* %4
+; TD2-NEXT:   %6 = ptrtoint i8* %11 to i32
+; TD2-NEXT:   br label %merge
+; TD2:      false:
+; TD2-NEXT:   %7 = bitcast i8* %10 to i32*
+; TD2-NEXT:   %8 = load i32* %7
+; TD2-NEXT:   %9 = ptrtoint i8* %11 to i32
+; TD2-NEXT:   br label %merge
+; TD2:      start:
+; TD2-NEXT:   %10 = alloca i8, i32 4, align 8
+; TD2-NEXT:   %11 = alloca i8, i32 4, align 8
+; TD2-NEXT:   br i1 %0, label %true, label %false
+; TD2-NEXT: }
+
+; PF2:        <FUNCTION_BLOCK>
+; PF2:          </CONSTANTS_BLOCK>
+; PF2-NEXT:     <INST_BR op0=4/>
+; PF2-NEXT:     <FORWARDTYPEREF op0=28 op1=3/>
+; PF2-NEXT:     <INST_PHI op0=0 op1=11 op2=2 op3=11 op4=3/>
+; PF2-NEXT:     <FORWARDTYPEREF op0=25 op1=0/>
+; PF2-NEXT:     <FORWARDTYPEREF op0=26 op1=0/>
+; PF2-NEXT:     <INST_PHI op0=0 op1=3 op2=2 op3=5 op4=3/>
+; PF2-NEXT:     <INST_RET/>
+; PF2-NEXT:     <FORWARDTYPEREF op0=27 op1=3/>
+; PF2-NEXT:     <INST_LOAD op0=4294967294 op1=0 op2=0/>
+; PF2-NEXT:     <INST_BR op0=1/>
+; PF2-NEXT:     <INST_LOAD op0=4294967295 op1=0 op2=0/>
+; PF2-NEXT:     <INST_BR op0=1/>
+; PF2-NEXT:     <INST_ALLOCA op0=5 op1=4/>
+; PF2-NEXT:     <INST_ALLOCA op0=6 op1=4/>
+; PF2-NEXT:     <INST_BR op0=2 op1=3 op2=8/>
+; PF2:        </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
+; Show that if a phi node incoming block already has a pointer cast,
+; we use it instead of adding one at the end of the block. In this
+; example, we reuse instruction %7 in block true for phi node %10.
+define void @PhiMergeCast(i1) {              ; %0 (the unnamed i1 argument) selects the branch below
+  %2 = alloca i8, i32 4, align 8
+  %3 = bitcast i8* %2 to i32*                ; only used by the loads in %true and %false
+  %4 = alloca i8, i32 4, align 8
+  %5 = ptrtoint i8* %4 to i32                ; only used by phi %10, on both incoming edges
+  br i1 %0, label %true, label %false
+
+true:
+  %6 = load i32* %3
+  %7 = ptrtoint i8* %4 to i32                ; same cast as %5, already present in this block
+  %8 = add i32 %6, %7
+  br label %merge
+
+false:                                       ; no ptrtoint of %4 here; TD2 expects one after the load
+  %9 = load i32* %3
+  br label %merge
+
+merge:
+  %10 = phi i32 [%5, %true], [%5, %false]    ; TD2 expectations use %true's own cast for the %true edge
+  %11 = phi i32 [%6, %true], [%9, %false]
+  ret void
+}
+
+; TD1:      define void @PhiMergeCast(i1) {
+; TD1-NEXT:   %2 = alloca i8, i32 4, align 8
+; TD1-NEXT:   %3 = bitcast i8* %2 to i32*
+; TD1-NEXT:   %4 = alloca i8, i32 4, align 8
+; TD1-NEXT:   %5 = ptrtoint i8* %4 to i32
+; TD1-NEXT:   br i1 %0, label %true, label %false
+; TD1:      true:
+; TD1-NEXT:   %6 = load i32* %3
+; TD1-NEXT:   %7 = ptrtoint i8* %4 to i32
+; TD1-NEXT:   %8 = add i32 %6, %7
+; TD1-NEXT:   br label %merge
+; TD1:      false:
+; TD1-NEXT:   %9 = load i32* %3
+; TD1-NEXT:   br label %merge
+; TD1:      merge:
+; TD1-NEXT:   %10 = phi i32 [ %5, %true ], [ %5, %false ]
+; TD1-NEXT:   %11 = phi i32 [ %6, %true ], [ %9, %false ]
+; TD1-NEXT:   ret void
+; TD1-NEXT: }
+
+; PF1:        <FUNCTION_BLOCK>
+; PF1:          </CONSTANTS_BLOCK>
+; PF1-NEXT:     <INST_ALLOCA op0=1 op1=4/>
+; PF1-NEXT:     <INST_CAST op0=1 op1=4 op2=11/>
+; PF1-NEXT:     <INST_ALLOCA op0=3 op1=4/>
+; PF1-NEXT:     <INST_CAST op0=1 op1=0 op2=9/>
+; PF1-NEXT:     <INST_BR op0=1 op1=2 op2=6/>
+; PF1-NEXT:     <INST_LOAD op0=3 op1=0 op2=0/>
+; PF1-NEXT:     <INST_CAST op0=3 op1=0 op2=9/>
+; PF1-NEXT:     <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT:     <INST_BR op0=3/>
+; PF1-NEXT:     <INST_LOAD op0=6 op1=0 op2=0/>
+; PF1-NEXT:     <INST_BR op0=3/>
+; PF1-NEXT:     <INST_PHI op0=0 op1=10 op2=1 op3=10 op4=2/>
+; PF1-NEXT:     <INST_PHI op0=0 op1=10 op2=1 op3=4 op4=2/>
+; PF1-NEXT:     <INST_RET/>
+; PF1:        </FUNCTION_BLOCK>
+
+; TD2:      define void @PhiMergeCast(i1) {
+; TD2-NEXT:   %2 = alloca i8, i32 4, align 8
+; TD2-NEXT:   %3 = alloca i8, i32 4, align 8
+; TD2-NEXT:   br i1 %0, label %true, label %false
+; TD2:      true:
+; TD2-NEXT:   %4 = bitcast i8* %2 to i32*
+; TD2-NEXT:   %5 = load i32* %4
+; TD2-NEXT:   %6 = ptrtoint i8* %3 to i32
+; TD2-NEXT:   %7 = add i32 %5, %6
+; TD2-NEXT:   br label %merge
+; TD2:      false:
+; TD2-NEXT:   %8 = bitcast i8* %2 to i32*
+; TD2-NEXT:   %9 = load i32* %8
+; TD2-NEXT:   %10 = ptrtoint i8* %3 to i32
+; TD2-NEXT:   br label %merge
+; TD2:      merge:
+; TD2-NEXT:   %11 = phi i32 [ %6, %true ], [ %10, %false ]
+; TD2-NEXT:   %12 = phi i32 [ %5, %true ], [ %9, %false ]
+; TD2-NEXT:   ret void
+; TD2-NEXT: }
+
+; PF2:        <FUNCTION_BLOCK>
+; PF2:          </CONSTANTS_BLOCK>
+; PF2-NEXT:     <INST_ALLOCA op0=1 op1=4/>
+; PF2-NEXT:     <INST_ALLOCA op0=2 op1=4/>
+; PF2-NEXT:     <INST_BR op0=1 op1=2 op2=4/>
+; PF2-NEXT:     <INST_LOAD op0=2 op1=0 op2=0/>
+; PF2-NEXT:     <INST_BINOP op0=1 op1=2 op2=0/>
+; PF2-NEXT:     <INST_BR op0=3/>
+; PF2-NEXT:     <INST_LOAD op0=4 op1=0 op2=0/>
+; PF2-NEXT:     <INST_BR op0=3/>
+; PF2-NEXT:     <INST_PHI op0=0 op1=8 op2=1 op3=8 op4=2/>
+; PF2-NEXT:     <INST_PHI op0=0 op1=8 op2=1 op3=4 op4=2/>
+; PF2-NEXT:     <INST_RET/>
+; PF2:        </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
+; Show that we must introduce a cast reference in each reachable
+; block that uses the cast, but one per block is sufficient.
+define void @LongReachingCasts(i1) {         ; %0 (the unnamed i1 argument) drives every branch
+  %2 = alloca i8, i32 4, align 8
+  %3 = ptrtoint i8* %2 to i32                ; defined once here; every use is in b1..b4
+  %4 = bitcast [4 x i8]* @bytes to i32*      ; likewise used only by the stores in b1..b4
+  br i1 %0, label %Split1, label %Split2
+
+Split1:                                      ; intermediate block: no use of %3 or %4
+  br i1 %0, label %b1, label %b2
+
+Split2:                                      ; intermediate block: no use of %3 or %4
+  br i1 %0, label %b3, label %b4
+
+b1:                                          ; two stores share %3/%4; TD2 expects one cast pair here
+  store i32 %3, i32* %4, align 1
+  store i32 %3, i32* %4, align 1
+  ret void
+
+b2:                                          ; same shape as b1
+  store i32 %3, i32* %4, align 1
+  store i32 %3, i32* %4, align 1
+  ret void
+
+b3:                                          ; same shape as b1
+  store i32 %3, i32* %4, align 1
+  store i32 %3, i32* %4, align 1
+  ret void
+
+b4:                                          ; same shape as b1
+  store i32 %3, i32* %4, align 1
+  store i32 %3, i32* %4, align 1
+  ret void
+}
+
+; TD1:      define void @LongReachingCasts(i1) {
+; TD1-NEXT:   %2 = alloca i8, i32 4, align 8
+; TD1-NEXT:   %3 = ptrtoint i8* %2 to i32
+; TD1-NEXT:   %4 = bitcast [4 x i8]* @bytes to i32*
+; TD1-NEXT:   br i1 %0, label %Split1, label %Split2
+; TD1:      Split1:
+; TD1-NEXT:   br i1 %0, label %b1, label %b2
+; TD1:      Split2:
+; TD1-NEXT:   br i1 %0, label %b3, label %b4
+; TD1:      b1:
+; TD1-NEXT:   store i32 %3, i32* %4, align 1
+; TD1-NEXT:   store i32 %3, i32* %4, align 1
+; TD1-NEXT:   ret void
+; TD1:      b2:
+; TD1-NEXT:   store i32 %3, i32* %4, align 1
+; TD1-NEXT:   store i32 %3, i32* %4, align 1
+; TD1-NEXT:   ret void
+; TD1:      b3:
+; TD1-NEXT:   store i32 %3, i32* %4, align 1
+; TD1-NEXT:   store i32 %3, i32* %4, align 1
+; TD1-NEXT:   ret void
+; TD1:      b4:
+; TD1-NEXT:   store i32 %3, i32* %4, align 1
+; TD1-NEXT:   store i32 %3, i32* %4, align 1
+; TD1-NEXT:   ret void
+; TD1-NEXT: }
+
+; PF1:        <FUNCTION_BLOCK>
+; PF1:          </CONSTANTS_BLOCK>
+; PF1-NEXT:     <INST_ALLOCA op0=1 op1=4/>
+; PF1-NEXT:     <INST_CAST op0=1 op1=0 op2=9/>
+; PF1-NEXT:     <INST_CAST op0=5 op1=4 op2=11/>
+; PF1-NEXT:     <INST_BR op0=1 op1=2 op2=5/>
+; PF1-NEXT:     <INST_BR op0=3 op1=4 op2=5/>
+; PF1-NEXT:     <INST_BR op0=5 op1=6 op2=5/>
+; PF1-NEXT:     <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT:     <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT:     <INST_RET/>
+; PF1-NEXT:     <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT:     <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT:     <INST_RET/>
+; PF1-NEXT:     <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT:     <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT:     <INST_RET/>
+; PF1-NEXT:     <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT:     <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT:     <INST_RET/>
+; PF1:        </FUNCTION_BLOCK>
+
+; TD2:      define void @LongReachingCasts(i1) {
+; TD2-NEXT:   %2 = alloca i8, i32 4, align 8
+; TD2-NEXT:   br i1 %0, label %Split1, label %Split2
+; TD2:      Split1:
+; TD2-NEXT:   br i1 %0, label %b1, label %b2
+; TD2:      Split2:
+; TD2-NEXT:   br i1 %0, label %b3, label %b4
+; TD2:      b1:
+; TD2-NEXT:   %3 = ptrtoint i8* %2 to i32
+; TD2-NEXT:   %4 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT:   store i32 %3, i32* %4, align 1
+; TD2-NEXT:   store i32 %3, i32* %4, align 1
+; TD2-NEXT:   ret void
+; TD2:      b2:
+; TD2-NEXT:   %5 = ptrtoint i8* %2 to i32
+; TD2-NEXT:   %6 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT:   store i32 %5, i32* %6, align 1
+; TD2-NEXT:   store i32 %5, i32* %6, align 1
+; TD2-NEXT:   ret void
+; TD2:      b3:
+; TD2-NEXT:   %7 = ptrtoint i8* %2 to i32
+; TD2-NEXT:   %8 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT:   store i32 %7, i32* %8, align 1
+; TD2-NEXT:   store i32 %7, i32* %8, align 1
+; TD2-NEXT:   ret void
+; TD2:      b4:
+; TD2-NEXT:   %9 = ptrtoint i8* %2 to i32
+; TD2-NEXT:   %10 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT:   store i32 %9, i32* %10, align 1
+; TD2-NEXT:   store i32 %9, i32* %10, align 1
+; TD2-NEXT:   ret void
+; TD2-NEXT: }
+
+; PF2:        <FUNCTION_BLOCK>
+; PF2:          </CONSTANTS_BLOCK>
+; PF2-NEXT:     <INST_ALLOCA op0=1 op1=4/>
+; PF2-NEXT:     <INST_BR op0=1 op1=2 op2=3/>
+; PF2-NEXT:     <INST_BR op0=3 op1=4 op2=3/>
+; PF2-NEXT:     <INST_BR op0=5 op1=6 op2=3/>
+; PF2-NEXT:     <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT:     <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT:     <INST_RET/>
+; PF2-NEXT:     <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT:     <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT:     <INST_RET/>
+; PF2-NEXT:     <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT:     <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT:     <INST_RET/>
+; PF2-NEXT:     <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT:     <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT:     <INST_RET/>
+; PF2:        </FUNCTION_BLOCK>