aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/Bitcode/NaCl/Reader/NaClBitcodeReader.cpp114
-rw-r--r--lib/Bitcode/NaCl/Reader/NaClBitcodeReader.h96
-rw-r--r--lib/Bitcode/NaCl/Writer/NaClValueEnumerator.cpp2
-rw-r--r--test/NaCl/Bitcode/bitcast-elide.ll145
-rw-r--r--test/NaCl/Bitcode/inttoptr-elide.ll96
-rw-r--r--test/NaCl/Bitcode/ptrtoint-elide.ll587
6 files changed, 846 insertions, 194 deletions
diff --git a/lib/Bitcode/NaCl/Reader/NaClBitcodeReader.cpp b/lib/Bitcode/NaCl/Reader/NaClBitcodeReader.cpp
index a38b18afa1..fbe1fc0165 100644
--- a/lib/Bitcode/NaCl/Reader/NaClBitcodeReader.cpp
+++ b/lib/Bitcode/NaCl/Reader/NaClBitcodeReader.cpp
@@ -36,7 +36,6 @@ void NaClBitcodeReader::FreeState() {
std::vector<Type*>().swap(TypeList);
ValueList.clear();
- std::vector<BasicBlock*>().swap(FunctionBBs);
std::vector<Function*>().swap(FunctionsWithBodies);
DeferredFunctionInfo.clear();
}
@@ -1284,40 +1283,56 @@ bool NaClBitcodeReader::InstallInstruction(
return false;
}
-Value *NaClBitcodeReader::ConvertOpToScalar(Value *Op, BasicBlock *BB) {
+CastInst *
+NaClBitcodeReader::CreateCast(unsigned BBIndex, Instruction::CastOps Op,
+ Type *CT, Value *V, bool DeferInsertion) {
+ if (BBIndex >= FunctionBBs.size())
+ report_fatal_error("CreateCast on unknown basic block");
+ BasicBlockInfo &BBInfo = FunctionBBs[BBIndex];
+ NaClBitcodeReaderCast ModeledCast(Op, CT, V);
+ CastInst *Cast = BBInfo.CastMap[ModeledCast];
+ if (Cast == NULL) {
+ Cast = CastInst::Create(Op, V, CT);
+ BBInfo.CastMap[ModeledCast] = Cast;
+ if (DeferInsertion) {
+ BBInfo.PhiCasts.push_back(Cast);
+ }
+ }
+ if (!DeferInsertion && Cast->getParent() == 0) {
+ InstallInstruction(BBInfo.BB, Cast);
+ }
+ return Cast;
+}
+
+Value *NaClBitcodeReader::ConvertOpToScalar(Value *Op, unsigned BBIndex,
+ bool DeferInsertion) {
if (Op->getType()->isPointerTy()) {
- Instruction *Conversion = new PtrToIntInst(Op, IntPtrType);
- InstallInstruction(BB, Conversion);
- return Conversion;
+ return CreateCast(BBIndex, Instruction::PtrToInt, IntPtrType, Op,
+ DeferInsertion);
}
return Op;
}
-Value *NaClBitcodeReader::ConvertOpToType(Value *Op, Type *T, BasicBlock *BB) {
+Value *NaClBitcodeReader::ConvertOpToType(Value *Op, Type *T,
+ unsigned BBIndex) {
// Note: Currently only knows how to add inttoptr and bitcast type
// conversions for non-phi nodes, since these are the only elided
// instructions in the bitcode writer.
//
// TODO(kschimpf): Generalize this as we expand elided conversions.
- Instruction *Conversion = 0;
Type *OpTy = Op->getType();
if (OpTy == T) return Op;
if (OpTy->isPointerTy()) {
- Conversion = new BitCastInst(Op, T);
+ return CreateCast(BBIndex, Instruction::BitCast, T, Op);
} else if (OpTy == IntPtrType) {
- Conversion = new IntToPtrInst(Op, T);
+ return CreateCast(BBIndex, Instruction::IntToPtr, T, Op);
}
- if (Conversion == 0) {
- std::string Message;
- raw_string_ostream StrM(Message);
- StrM << "Can't convert " << *Op << " to type " << *T << "\n";
- Error(StrM.str());
- } else {
- InstallInstruction(BB, Conversion);
- }
- return Conversion;
+ std::string Message;
+ raw_string_ostream StrM(Message);
+ StrM << "Can't convert " << *Op << " to type " << *T << "\n";
+ report_fatal_error(StrM.str());
}
Type *NaClBitcodeReader::ConvertTypeToScalarType(Type *T) {
@@ -1396,9 +1411,11 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
return Error("Invalid DECLAREBLOCKS record");
// Create all the basic blocks for the function.
FunctionBBs.resize(Record[0]);
- for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i)
- FunctionBBs[i] = BasicBlock::Create(Context, "", F);
- CurBB = FunctionBBs[0];
+ for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i) {
+ BasicBlockInfo &BBInfo = FunctionBBs[i];
+ BBInfo.BB = BasicBlock::Create(Context, "", F);
+ }
+ CurBB = FunctionBBs.at(0).BB;
continue;
case naclbitc::FUNC_CODE_INST_BINOP: {
@@ -1410,8 +1427,8 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
OpNum+1 > Record.size())
return Error("Invalid BINOP record");
- LHS = ConvertOpToScalar(LHS, CurBB);
- RHS = ConvertOpToScalar(RHS, CurBB);
+ LHS = ConvertOpToScalar(LHS, CurBBNo);
+ RHS = ConvertOpToScalar(RHS, CurBBNo);
int Opc = GetDecodedBinaryOpcode(Record[OpNum++], LHS->getType());
if (Opc == -1) return Error("Invalid BINOP record");
@@ -1472,7 +1489,7 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
case Instruction::SExt:
case Instruction::UIToFP:
case Instruction::SIToFP:
- Op = ConvertOpToScalar(Op, CurBB);
+ Op = ConvertOpToScalar(Op, CurBBNo);
break;
default:
break;
@@ -1493,8 +1510,8 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
popValue(Record, &OpNum, NextValueNo, &Cond))
return Error("Invalid SELECT record");
- TrueVal = ConvertOpToScalar(TrueVal, CurBB);
- FalseVal = ConvertOpToScalar(FalseVal, CurBB);
+ TrueVal = ConvertOpToScalar(TrueVal, CurBBNo);
+ FalseVal = ConvertOpToScalar(FalseVal, CurBBNo);
// expect i1
if (Cond->getType() != Type::getInt1Ty(Context))
@@ -1514,8 +1531,8 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
OpNum+1 != Record.size())
return Error("Invalid CMP record");
- LHS = ConvertOpToScalar(LHS, CurBB);
- RHS = ConvertOpToScalar(RHS, CurBB);
+ LHS = ConvertOpToScalar(LHS, CurBBNo);
+ RHS = ConvertOpToScalar(RHS, CurBBNo);
if (LHS->getType()->isFPOrFPVectorTy())
I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
@@ -1622,9 +1639,6 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
Type *Ty = getTypeByID(Record[0]);
if (!Ty) return Error("Invalid PHI record");
- // TODO(kschimpf): Fix handling of converting types for values,
- // to handle elided casts, once the bitcode writer knows how.
-
PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
for (unsigned i = 0, e = Record.size()-1; i != e; i += 2) {
@@ -1636,8 +1650,16 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
V = getValueSigned(Record, 1+i, NextValueNo);
else
V = getValue(Record, 1+i, NextValueNo);
- BasicBlock *BB = getBasicBlock(Record[2+i]);
+ unsigned BBIndex = Record[2+i];
+ BasicBlock *BB = getBasicBlock(BBIndex);
if (!V || !BB) return Error("Invalid PHI record");
+ if (GetPNaClVersion() == 2 && Ty == IntPtrType) {
+ // Delay installing scalar casts until all instructions of
+ // the function are rendered. This guarantees that we insert
+ // the conversion just before the incoming edge (or use an
+ // existing conversion if already installed).
+ V = ConvertOpToScalar(V, BBIndex, /* DeferInsertion = */ true);
+ }
PN->addIncoming(V, BB);
}
I = PN;
@@ -1672,7 +1694,7 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
Type *T = getTypeByID(Record[2]);
if (T == 0)
return Error("Invalid type for load instruction");
- Op = ConvertOpToType(Op, T->getPointerTo(), CurBB);
+ Op = ConvertOpToType(Op, T->getPointerTo(), CurBBNo);
if (Op == 0) return true;
I = new LoadInst(Op, "", false, (1 << Record[OpNum]) >> 1);
break;
@@ -1697,8 +1719,8 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
case 2:
if (OpNum+1 != Record.size())
return Error("Invalid STORE record");
- Val = ConvertOpToScalar(Val, CurBB);
- Ptr = ConvertOpToType(Ptr, Val->getType()->getPointerTo(), CurBB);
+ Val = ConvertOpToScalar(Val, CurBBNo);
+ Ptr = ConvertOpToType(Ptr, Val->getType()->getPointerTo(), CurBBNo);
I = new StoreInst(Val, Ptr, false, (1 << Record[OpNum]) >> 1);
break;
}
@@ -1767,7 +1789,7 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
// If this was a terminator instruction, move to the next block.
if (isa<TerminatorInst>(I)) {
++CurBBNo;
- CurBB = CurBBNo < FunctionBBs.size() ? FunctionBBs[CurBBNo] : 0;
+ CurBB = getBasicBlock(CurBBNo);
}
// Non-void values get registered in the value table for future use.
@@ -1777,6 +1799,24 @@ bool NaClBitcodeReader::ParseFunctionBody(Function *F) {
OutOfRecordLoop:
+ // Add PHI conversions to corresponding incoming block, if not
+ // already in the block. Also clear all conversions after fixing
+ // PHI conversions.
+ for (unsigned I = 0, NumBBs = FunctionBBs.size(); I < NumBBs; ++I) {
+ BasicBlockInfo &BBInfo = FunctionBBs[I];
+ std::vector<CastInst*> &PhiCasts = BBInfo.PhiCasts;
+ for (std::vector<CastInst*>::iterator Iter = PhiCasts.begin(),
+ IterEnd = PhiCasts.end(); Iter != IterEnd; ++Iter) {
+ CastInst *Cast = *Iter;
+ if (Cast->getParent() == 0) {
+ BasicBlock *BB = BBInfo.BB;
+ BB->getInstList().insert(BB->getTerminator(), Cast);
+ }
+ }
+ PhiCasts.clear();
+ BBInfo.CastMap.clear();
+ }
+
// Check the function list for unresolved values.
if (Argument *A = dyn_cast<Argument>(ValueList.back())) {
if (A->getParent() == 0) {
@@ -1793,7 +1833,7 @@ OutOfRecordLoop:
// Trim the value list down to the size it was before we parsed this function.
ValueList.shrinkTo(ModuleValueListSize);
- std::vector<BasicBlock*>().swap(FunctionBBs);
+ FunctionBBs.clear();
DEBUG(dbgs() << "-> ParseFunctionBody\n");
return false;
}
diff --git a/lib/Bitcode/NaCl/Reader/NaClBitcodeReader.h b/lib/Bitcode/NaCl/Reader/NaClBitcodeReader.h
index 814ef44efb..762088887f 100644
--- a/lib/Bitcode/NaCl/Reader/NaClBitcodeReader.h
+++ b/lib/Bitcode/NaCl/Reader/NaClBitcodeReader.h
@@ -21,6 +21,7 @@
#include "llvm/Bitcode/NaCl/NaClLLVMBitCodes.h"
#include "llvm/GVMaterializer.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/ValueHandle.h"
@@ -29,6 +30,46 @@
namespace llvm {
class MemoryBuffer;
class LLVMContext;
+ class CastInst;
+
+// Models a Cast. Used to cache casts created in a basic block by the
+// PNaCl bitcode reader.
+struct NaClBitcodeReaderCast {
+ // Fields of the conversion.
+ Instruction::CastOps Op;
+ Type *Ty;
+ Value *Val;
+
+ NaClBitcodeReaderCast(Instruction::CastOps Op, Type *Ty, Value *Val)
+ : Op(Op), Ty(Ty), Val(Val) {}
+};
+
+// Models the data structure used to hash/compare Casts in a DenseMap.
+template<>
+struct DenseMapInfo<NaClBitcodeReaderCast> {
+public:
+ static NaClBitcodeReaderCast getEmptyKey() {
+ return NaClBitcodeReaderCast(Instruction::CastOpsEnd,
+ DenseMapInfo<Type*>::getEmptyKey(),
+ DenseMapInfo<Value*>::getEmptyKey());
+ }
+ static NaClBitcodeReaderCast getTombstoneKey() {
+ return NaClBitcodeReaderCast(Instruction::CastOpsEnd,
+ DenseMapInfo<Type*>::getTombstoneKey(),
+ DenseMapInfo<Value*>::getTombstoneKey());
+ }
+ static unsigned getHashValue(const NaClBitcodeReaderCast &C) {
+ std::pair<int, std::pair<Type*, Value*> > Tuple;
+ Tuple.first = C.Op;
+ Tuple.second.first = C.Ty;
+ Tuple.second.second = C.Val;
+ return DenseMapInfo<std::pair<int, std::pair<Type*, Value*> > >::getHashValue(Tuple);
+ }
+ static bool isEqual(const NaClBitcodeReaderCast &LHS,
+ const NaClBitcodeReaderCast &RHS) {
+ return LHS.Op == RHS.Op && LHS.Ty == RHS.Ty && LHS.Val == RHS.Val;
+ }
+};
//===----------------------------------------------------------------------===//
// NaClBitcodeReaderValueList Class
@@ -83,8 +124,8 @@ public:
// already been declared.
bool createValueFwdRef(unsigned Idx, Type *Ty);
- // Declares the type of the forward-referenced constant Idx. Returns
- // 0 if an error occurred.
+ // Declares the type of the forward-referenced constant Idx.
+ // Returns 0 if an error occurred.
// TODO(kschimpf) Convert these to be like createValueFwdRef and
// getValueFwdRef.
Constant *getConstantFwdRef(unsigned Idx, Type *Ty);
@@ -103,7 +144,7 @@ public:
// was forward referenced).
void AssignValue(Value *V, unsigned Idx);
- // Assigns Idx to the given global variable. If the Idx currently has
+ // Assigns Idx to the given global variable. If the Idx currently has
// a forward reference (built by createGlobalVarFwdRef(unsigned Idx)),
// replaces uses of the global variable forward reference with the
// value GV.
@@ -133,9 +174,20 @@ class NaClBitcodeReader : public GVMaterializer {
NaClBitcodeReaderValueList ValueList;
SmallVector<SmallVector<uint64_t, 64>, 64> UseListRecords;
+ // Holds information about each BasicBlock in the function being read.
+ struct BasicBlockInfo {
+ // A basic block within the function being modeled.
+ BasicBlock *BB;
+ // The set of generated conversions.
+ DenseMap<NaClBitcodeReaderCast, CastInst*> CastMap;
+ // The set of generated conversions that were added for phi nodes,
+ // and may need their parent basic block defined.
+ std::vector<CastInst*> PhiCasts;
+ };
+
/// FunctionBBs - While parsing a function body, this is a list of the basic
/// blocks for the function.
- std::vector<BasicBlock*> FunctionBBs;
+ std::vector<BasicBlockInfo> FunctionBBs;
// When reading the module header, this list is populated with functions that
// have bodies later in the file.
@@ -147,7 +199,7 @@ class NaClBitcodeReader : public GVMaterializer {
UpgradedIntrinsicMap UpgradedIntrinsics;
// Several operations happen after the module header has been read, but
- // before function bodies are processed. This keeps track of whether
+ // before function bodies are processed. This keeps track of whether
// we've done this yet.
bool SeenFirstFunctionBody;
@@ -226,14 +278,14 @@ private:
return Header.GetPNaClVersion();
}
Type *getTypeByID(unsigned ID);
- // Returns the value associated with ID. The value must already exist,
+ // Returns the value associated with ID. The value must already exist,
// or a forward referenced value created by getOrCreateFnVaueByID.
Value *getFnValueByID(unsigned ID) {
return ValueList.getValueFwdRef(ID);
}
BasicBlock *getBasicBlock(unsigned ID) const {
if (ID >= FunctionBBs.size()) return 0; // Invalid ID
- return FunctionBBs[ID];
+ return FunctionBBs[ID].BB;
}
/// \brief Read a value out of the specified record from slot '*Slot'.
@@ -273,18 +325,30 @@ private:
return getFnValueByID(ValNo);
}
- /// \brief Add instructions to cast Op to the given type T into block BB.
- /// Follows rules for pointer conversion as defined in
- /// llvm/lib/Transforms/NaCl/ReplacePtrsWithInts.cpp.
+ /// \brief Create an (elided) cast instruction for basic block
+ /// BBIndex. Op is the type of cast. V is the value to cast. CT
+ /// is the type to convert V to. DeferInsertion defines whether the
+ /// generated conversion should also be installed into basic block
+ /// BBIndex. Note: For PHI nodes, we don't insert when created
+ /// (i.e. DeferInsertion=true), since they must be inserted at the end
+ /// of the corresponding incoming basic block.
+ CastInst *CreateCast(unsigned BBIndex, Instruction::CastOps Op,
+ Type *CT, Value *V, bool DeferInsertion = false);
+
+ /// \brief Add instructions to cast Op to the given type T into
+ /// block BBIndex. Follows rules for pointer conversion as defined
+ /// in llvm/lib/Transforms/NaCl/ReplacePtrsWithInts.cpp.
///
/// Returns 0 if unable to generate conversion value (also generates
/// an appropriate error message and calls Error).
- Value *ConvertOpToType(Value *Op, Type *T, BasicBlock *BB);
-
- /// \brief If Op is a scalar value, this is a nop. If Op is a
- /// pointer value, a PtrToInt instruction is inserted (in BB) to
- /// convert Op to an integer.
- Value *ConvertOpToScalar(Value *Op, BasicBlock *BB);
+ Value *ConvertOpToType(Value *Op, Type *T, unsigned BBIndex);
+
+ /// \brief If Op is a scalar value, this is a nop. If Op is a
+ /// pointer value, a PtrToInt instruction is inserted (in BBIndex)
+ /// to convert Op to an integer. For defaults on DeferInsertion,
+ /// see comments for method CreateCast.
+ Value *ConvertOpToScalar(Value *Op, unsigned BBIndex,
+ bool DeferInsertion = false);
/// \brief Returns the corresponding, PNaCl non-pointer equivalent
/// for the given type.
diff --git a/lib/Bitcode/NaCl/Writer/NaClValueEnumerator.cpp b/lib/Bitcode/NaCl/Writer/NaClValueEnumerator.cpp
index bee36e2631..060a6d63f4 100644
--- a/lib/Bitcode/NaCl/Writer/NaClValueEnumerator.cpp
+++ b/lib/Bitcode/NaCl/Writer/NaClValueEnumerator.cpp
@@ -479,6 +479,7 @@ static bool ExpectsScalarValue(const Value *V, const Instruction *Arg) {
switch (I->getOpcode()) {
default:
return false;
+ case Instruction::PHI:
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -497,7 +498,6 @@ static bool ExpectsScalarValue(const Value *V, const Instruction *Arg) {
// instructions:
// case Instruction::IntToPtr:
// case Instruction::BitCast:
- // case Instruction::PHI:
// case Instruction::Call:
}
}
diff --git a/test/NaCl/Bitcode/bitcast-elide.ll b/test/NaCl/Bitcode/bitcast-elide.ll
index eeee69ffef..383673d684 100644
--- a/test/NaCl/Bitcode/bitcast-elide.ll
+++ b/test/NaCl/Bitcode/bitcast-elide.ll
@@ -17,17 +17,19 @@
; ------------------------------------------------------
-@bytes = internal global [7 x i8] c"abcdefg"
+@bytes = internal global [4 x i8] c"abcd"
+
+; ------------------------------------------------------
; Test that we elide the simple case of global.
define void @SimpleLoad() {
- %1 = bitcast [7 x i8]* @bytes to i32*
+ %1 = bitcast [4 x i8]* @bytes to i32*
%2 = load i32* %1, align 4
ret void
}
; TD1: define void @SimpleLoad() {
-; TD1-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD1-NEXT: %2 = load i32* %1, align 4
; TD1-NEXT: ret void
; TD1-NEXT: }
@@ -40,7 +42,7 @@ define void @SimpleLoad() {
; PF1-NEXT: </FUNCTION_BLOCK>
; TD2: define void @SimpleLoad() {
-; TD2-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD2-NEXT: %2 = load i32* %1, align 4
; TD2-NEXT: ret void
; TD2-NEXT: }
@@ -51,6 +53,8 @@ define void @SimpleLoad() {
; PF2-NEXT: <INST_RET/>
; PF2-NEXT: </FUNCTION_BLOCK>
+; ------------------------------------------------------
+
; Test that we elide the simple case of an alloca.
define void @SimpleLoadAlloca() {
%1 = alloca i8, i32 4, align 4
@@ -67,8 +71,6 @@ define void @SimpleLoadAlloca() {
; TD1-NEXT: }
; PF1: <FUNCTION_BLOCK>
-; PF1-NEXT: <DECLAREBLOCKS op0=1/>
-; PF1-NEXT: <CONSTANTS_BLOCK
; PF1: </CONSTANTS_BLOCK>
; PF1-NEXT: <INST_ALLOCA op0=1 op1=3/>
; PF1-NEXT: <INST_CAST op0=1 op1=1 op2=11/>
@@ -84,23 +86,23 @@ define void @SimpleLoadAlloca() {
; TD2-NEXT: }
; PF2: <FUNCTION_BLOCK>
-; PF2-NEXT: <DECLAREBLOCKS op0=1/>
-; PF2-NEXT: <CONSTANTS_BLOCK
; PF2: </CONSTANTS_BLOCK>
; PF2-NEXT: <INST_ALLOCA op0=1 op1=3/>
; PF2-NEXT: <INST_LOAD op0=1 op1=3 op2=0/>
; PF2-NEXT: <INST_RET/>
; PF2-NEXT: </FUNCTION_BLOCK>
+; ------------------------------------------------------
+
; Test that we don't elide an bitcast if one of its uses is not a load.
define i32* @NonsimpleLoad(i32 %i) {
- %1 = bitcast [7 x i8]* @bytes to i32*
+ %1 = bitcast [4 x i8]* @bytes to i32*
%2 = load i32* %1, align 4
ret i32* %1
}
; TD1: define i32* @NonsimpleLoad(i32 %i) {
-; TD1-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD1-NEXT: %2 = load i32* %1, align 4
; TD1-NEXT: ret i32* %1
; TD1-NEXT: }
@@ -113,7 +115,7 @@ define i32* @NonsimpleLoad(i32 %i) {
; PF1: </FUNCTION_BLOCK>
; TD2: define i32* @NonsimpleLoad(i32 %i) {
-; TD2-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD2-NEXT: %2 = load i32* %1, align 4
; TD2-NEXT: ret i32* %1
; TD2-NEXT: }
@@ -125,20 +127,22 @@ define i32* @NonsimpleLoad(i32 %i) {
; PF2-NEXT: <INST_RET op0=2/>
; PF2: </FUNCTION_BLOCK>
+; ------------------------------------------------------
+
; Test that we can handle multiple bitcasts.
define i32 @TwoLoads(i32 %i) {
- %1 = bitcast [7 x i8]* @bytes to i32*
+ %1 = bitcast [4 x i8]* @bytes to i32*
%2 = load i32* %1, align 4
- %3 = bitcast [7 x i8]* @bytes to i32*
+ %3 = bitcast [4 x i8]* @bytes to i32*
%4 = load i32* %3, align 4
%5 = add i32 %2, %4
ret i32 %5
}
; TD1: define i32 @TwoLoads(i32 %i) {
-; TD1-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD1-NEXT: %2 = load i32* %1, align 4
-; TD1-NEXT: %3 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT: %3 = bitcast [4 x i8]* @bytes to i32*
; TD1-NEXT: %4 = load i32* %3, align 4
; TD1-NEXT: %5 = add i32 %2, %4
; TD1-NEXT: ret i32 %5
@@ -155,12 +159,11 @@ define i32 @TwoLoads(i32 %i) {
; PF1: </FUNCTION_BLOCK>
; TD2: define i32 @TwoLoads(i32 %i) {
-; TD2-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD2-NEXT: %2 = load i32* %1, align 4
-; TD2-NEXT: %3 = bitcast [7 x i8]* @bytes to i32*
-; TD2-NEXT: %4 = load i32* %3, align 4
-; TD2-NEXT: %5 = add i32 %2, %4
-; TD2-NEXT: ret i32 %5
+; TD2-NEXT: %3 = load i32* %1, align 4
+; TD2-NEXT: %4 = add i32 %2, %3
+; TD2-NEXT: ret i32 %4
; TD2-NEXT: }
; PF2: <FUNCTION_BLOCK>
@@ -171,17 +174,20 @@ define i32 @TwoLoads(i32 %i) {
; PF2-NEXT: <INST_RET op0=1/>
; PF2: </FUNCTION_BLOCK>
-; Test how we duplicate bitcasts, even if optimized in the input file.
-define i32 @TwoLoadOpt(i32 %i) {
- %1 = bitcast [7 x i8]* @bytes to i32*
+; ------------------------------------------------------
+
+; Test how we handle bitcasts if optimized in the input file. This
+; case tests within a single block.
+define i32 @TwoLoadOptOneBlock(i32 %i) {
+ %1 = bitcast [4 x i8]* @bytes to i32*
%2 = load i32* %1, align 4
%3 = load i32* %1, align 4
%4 = add i32 %2, %3
ret i32 %4
}
-; TD1: define i32 @TwoLoadOpt(i32 %i) {
-; TD1-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1: define i32 @TwoLoadOptOneBlock(i32 %i) {
+; TD1-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD1-NEXT: %2 = load i32* %1, align 4
; TD1-NEXT: %3 = load i32* %1, align 4
; TD1-NEXT: %4 = add i32 %2, %3
@@ -197,13 +203,12 @@ define i32 @TwoLoadOpt(i32 %i) {
; PF1-NEXT: <INST_RET op0=1/>
; PF1: </FUNCTION_BLOCK>
-; TD2: define i32 @TwoLoadOpt(i32 %i) {
-; TD2-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2: define i32 @TwoLoadOptOneBlock(i32 %i) {
+; TD2-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD2-NEXT: %2 = load i32* %1, align 4
-; TD2-NEXT: %3 = bitcast [7 x i8]* @bytes to i32*
-; TD2-NEXT: %4 = load i32* %3, align 4
-; TD2-NEXT: %5 = add i32 %2, %4
-; TD2-NEXT: ret i32 %5
+; TD2-NEXT: %3 = load i32* %1, align 4
+; TD2-NEXT: %4 = add i32 %2, %3
+; TD2-NEXT: ret i32 %4
; TD2-NEXT: }
; PF2: <FUNCTION_BLOCK>
@@ -214,15 +219,87 @@ define i32 @TwoLoadOpt(i32 %i) {
; PF2-NEXT: <INST_RET op0=1/>
; PF2: </FUNCTION_BLOCK>
+; ------------------------------------------------------
+
+; Test how we handle bitcasts if optimized in the input file. This
+; case tests across blocks.
+define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+ %1 = bitcast [4 x i8]* @bytes to i32*
+ %2 = load i32* %1, align 4
+ %3 = load i32* %1, align 4
+ %4 = add i32 %2, %3
+ br label %BB
+
+BB:
+ %5 = load i32* %1, align 4
+ %6 = load i32* %1, align 4
+ %7 = add i32 %5, %6
+ ret i32 %4
+}
+
+; TD1: define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+; TD1-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
+; TD1-NEXT: %2 = load i32* %1, align 4
+; TD1-NEXT: %3 = load i32* %1, align 4
+; TD1-NEXT: %4 = add i32 %2, %3
+; TD1-NEXT: br label %BB
+; TD1: BB:
+; TD1-NEXT: %5 = load i32* %1, align 4
+; TD1-NEXT: %6 = load i32* %1, align 4
+; TD1-NEXT: %7 = add i32 %5, %6
+; TD1-NEXT: ret i32 %4
+; TD1-NEXT: }
+
+; PF1: <FUNCTION_BLOCK>
+; PF1-NEXT: <DECLAREBLOCKS op0=2/>
+; PF1-NEXT: <INST_CAST op0=2 op1=1 op2=11/>
+; PF1-NEXT: <INST_LOAD op0=1 op1=3 op2=0/>
+; PF1-NEXT: <INST_LOAD op0=2 op1=3 op2=0/>
+; PF1-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT: <INST_BR op0=1/>
+; PF1-NEXT: <INST_LOAD op0=4 op1=3 op2=0/>
+; PF1-NEXT: <INST_LOAD op0=5 op1=3 op2=0/>
+; PF1-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT: <INST_RET op0=4/>
+; PF1: </FUNCTION_BLOCK>
+
+; TD2: define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+; TD2-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT: %2 = load i32* %1, align 4
+; TD2-NEXT: %3 = load i32* %1, align 4
+; TD2-NEXT: %4 = add i32 %2, %3
+; TD2-NEXT: br label %BB
+; TD2: BB:
+; TD2-NEXT: %5 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT: %6 = load i32* %5, align 4
+; TD2-NEXT: %7 = load i32* %5, align 4
+; TD2-NEXT: %8 = add i32 %6, %7
+; TD2-NEXT: ret i32 %4
+; TD2-NEXT: }
+
+; PF2: <FUNCTION_BLOCK>
+; PF2-NEXT: <DECLAREBLOCKS op0=2/>
+; PF2-NEXT: <INST_LOAD op0=2 op1=3 op2=0/>
+; PF2-NEXT: <INST_LOAD op0=3 op1=3 op2=0/>
+; PF2-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF2-NEXT: <INST_BR op0=1/>
+; PF2-NEXT: <INST_LOAD op0=5 op1=3 op2=0/>
+; PF2-NEXT: <INST_LOAD op0=6 op1=3 op2=0/>
+; PF2-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF2-NEXT: <INST_RET op0=4/>
+; PF2: </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
; Test that we elide the simple case of bitcast for a store.
define void @SimpleStore(i32 %i) {
- %1 = bitcast [7 x i8]* @bytes to i32*
+ %1 = bitcast [4 x i8]* @bytes to i32*
store i32 %i, i32* %1, align 4
ret void
}
; TD1: define void @SimpleStore(i32 %i) {
-; TD1-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD1-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD1-NEXT: store i32 %i, i32* %1, align 4
; TD1-NEXT: ret void
; TD1-NEXT: }
@@ -235,7 +312,7 @@ define void @SimpleStore(i32 %i) {
; PF1: </FUNCTION_BLOCK>
; TD2: define void @SimpleStore(i32 %i) {
-; TD2-NEXT: %1 = bitcast [7 x i8]* @bytes to i32*
+; TD2-NEXT: %1 = bitcast [4 x i8]* @bytes to i32*
; TD2-NEXT: store i32 %i, i32* %1, align 4
; TD2-NEXT: ret void
; TD2-NEXT: }
diff --git a/test/NaCl/Bitcode/inttoptr-elide.ll b/test/NaCl/Bitcode/inttoptr-elide.ll
index 029f67adef..679f5f1d47 100644
--- a/test/NaCl/Bitcode/inttoptr-elide.ll
+++ b/test/NaCl/Bitcode/inttoptr-elide.ll
@@ -118,13 +118,11 @@ define i32 @TwoLoads(i32 %i) {
; TD2: define i32 @TwoLoads(i32 %i) {
; TD2-NEXT: %1 = inttoptr i32 %i to i32*
; TD2-NEXT: %2 = load i32* %1, align 4
-; TD2-NEXT: %3 = inttoptr i32 %i to i32*
-; TD2-NEXT: %4 = load i32* %3, align 4
-; TD2-NEXT: %5 = add i32 %2, %4
-; TD2-NEXT: ret i32 %5
+; TD2-NEXT: %3 = load i32* %1, align 4
+; TD2-NEXT: %4 = add i32 %2, %3
+; TD2-NEXT: ret i32 %4
; TD2-NEXT: }
-
; PF2: <FUNCTION_BLOCK>
; PF2-NEXT: <DECLAREBLOCKS op0=1/>
; PF2-NEXT: <INST_LOAD op0=1 op1=3 op2=0/>
@@ -135,8 +133,9 @@ define i32 @TwoLoads(i32 %i) {
; ------------------------------------------------------
-; Test how we duplicate inttoptrs, even if optimized in the input file.
-define i32 @TwoLoadOpt(i32 %i) {
+; Test how we handle inttoptrs, if optimized in the input file. This
+; case tests within a single block.
+define i32 @TwoLoadOptOneBlock(i32 %i) {
%1 = inttoptr i32 %i to i32*
%2 = load i32* %1, align 4
%3 = load i32* %1, align 4
@@ -144,7 +143,7 @@ define i32 @TwoLoadOpt(i32 %i) {
ret i32 %4
}
-; TD1: define i32 @TwoLoadOpt(i32 %i) {
+; TD1: define i32 @TwoLoadOptOneBlock(i32 %i) {
; TD1-NEXT: %1 = inttoptr i32 %i to i32*
; TD1-NEXT: %2 = load i32* %1, align 4
; TD1-NEXT: %3 = load i32* %1, align 4
@@ -161,13 +160,12 @@ define i32 @TwoLoadOpt(i32 %i) {
; PF1-NEXT: <INST_RET op0=1/>
; PF1: </FUNCTION_BLOCK>
-; TD2: define i32 @TwoLoadOpt(i32 %i) {
+; TD2: define i32 @TwoLoadOptOneBlock(i32 %i) {
; TD2-NEXT: %1 = inttoptr i32 %i to i32*
; TD2-NEXT: %2 = load i32* %1, align 4
-; TD2-NEXT: %3 = inttoptr i32 %i to i32*
-; TD2-NEXT: %4 = load i32* %3, align 4
-; TD2-NEXT: %5 = add i32 %2, %4
-; TD2-NEXT: ret i32 %5
+; TD2-NEXT: %3 = load i32* %1, align 4
+; TD2-NEXT: %4 = add i32 %2, %3
+; TD2-NEXT: ret i32 %4
; TD2-NEXT: }
; PF2: <FUNCTION_BLOCK>
@@ -180,6 +178,76 @@ define i32 @TwoLoadOpt(i32 %i) {
; ------------------------------------------------------
+; Test how we handle inttoptrs if optimized in the input file. This
+; case tests across blocks.
+define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+ %1 = inttoptr i32 %i to i32*
+ %2 = load i32* %1, align 4
+ %3 = load i32* %1, align 4
+ %4 = add i32 %2, %3
+ br label %BB
+
+BB:
+ %5 = load i32* %1, align 4
+ %6 = load i32* %1, align 4
+ %7 = add i32 %5, %6
+ ret i32 %7
+}
+
+; TD1: define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+; TD1-NEXT: %1 = inttoptr i32 %i to i32*
+; TD1-NEXT: %2 = load i32* %1, align 4
+; TD1-NEXT: %3 = load i32* %1, align 4
+; TD1-NEXT: %4 = add i32 %2, %3
+; TD1-NEXT: br label %BB
+; TD1: BB:
+; TD1-NEXT: %5 = load i32* %1, align 4
+; TD1-NEXT: %6 = load i32* %1, align 4
+; TD1-NEXT: %7 = add i32 %5, %6
+; TD1-NEXT: ret i32 %7
+; TD1-NEXT: }
+
+; PF1: <FUNCTION_BLOCK>
+; PF1-NEXT: <DECLAREBLOCKS op0=2/>
+; PF1-NEXT: <INST_CAST op0=1 op1=1 op2=10/>
+; PF1-NEXT: <INST_LOAD op0=1 op1=3 op2=0/>
+; PF1-NEXT: <INST_LOAD op0=2 op1=3 op2=0/>
+; PF1-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT: <INST_BR op0=1/>
+; PF1-NEXT: <INST_LOAD op0=4 op1=3 op2=0/>
+; PF1-NEXT: <INST_LOAD op0=5 op1=3 op2=0/>
+; PF1-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT: <INST_RET op0=1/>
+; PF1: </FUNCTION_BLOCK>
+
+; TD2: define i32 @TwoLoadOptTwoBlocks(i32 %i) {
+; TD2-NEXT: %1 = inttoptr i32 %i to i32*
+; TD2-NEXT: %2 = load i32* %1, align 4
+; TD2-NEXT: %3 = load i32* %1, align 4
+; TD2-NEXT: %4 = add i32 %2, %3
+; TD2-NEXT: br label %BB
+; TD2: BB:
+; TD2-NEXT: %5 = inttoptr i32 %i to i32*
+; TD2-NEXT: %6 = load i32* %5, align 4
+; TD2-NEXT: %7 = load i32* %5, align 4
+; TD2-NEXT: %8 = add i32 %6, %7
+; TD2-NEXT: ret i32 %8
+; TD2-NEXT: }
+
+; PF2: <FUNCTION_BLOCK>
+; PF2-NEXT: <DECLAREBLOCKS op0=2/>
+; PF2-NEXT: <INST_LOAD op0=1 op1=3 op2=0/>
+; PF2-NEXT: <INST_LOAD op0=2 op1=3 op2=0/>
+; PF2-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF2-NEXT: <INST_BR op0=1/>
+; PF2-NEXT: <INST_LOAD op0=4 op1=3 op2=0/>
+; PF2-NEXT: <INST_LOAD op0=5 op1=3 op2=0/>
+; PF2-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF2-NEXT: <INST_RET op0=1/>
+; PF2: </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
; Test that we elide the simple case of inttoptr for a store.
define void @SimpleStore(i32 %i) {
%1 = inttoptr i32 %i to i32*
@@ -210,4 +278,4 @@ define void @SimpleStore(i32 %i) {
; PF2-NEXT: <DECLAREBLOCKS op0=1/>
; PF2-NEXT: <INST_STORE op0=1 op1=1 op2=3/>
; PF2-NEXT: <INST_RET/>
-; PF2T: </FUNCTION_BLOCK>
+; PF2: </FUNCTION_BLOCK>
diff --git a/test/NaCl/Bitcode/ptrtoint-elide.ll b/test/NaCl/Bitcode/ptrtoint-elide.ll
index 10504a8577..43a82a0802 100644
--- a/test/NaCl/Bitcode/ptrtoint-elide.ll
+++ b/test/NaCl/Bitcode/ptrtoint-elide.ll
@@ -153,8 +153,8 @@ define void @AllocCastDelete() {
; ------------------------------------------------------
; Show case where we have optimized the ptrtoint (and bitcast) into a
-; single instruction, but will get duplicated after reading back the
-; bitcode file, since we insert elided casts immediately before each use.
+; single instruction, and will only be inserted before the first use
+; in the block.
define void @AllocCastOpt() {
%1 = alloca i8, i32 4, align 8
%2 = bitcast [4 x i8]* @bytes to i32*
@@ -177,7 +177,7 @@ define void @AllocCastOpt() {
; PF1: </CONSTANTS_BLOCK>
; PF1-NEXT: <INST_ALLOCA op0=1 op1=4/>
; PF1-NEXT: <INST_CAST op0=3 op1=4 op2=11/>
-; PF1-NEXT: <INST_CAST op0=2 op1=0 op2=9/>
+; PF1-NEXT: <INST_CAST op0=2 op1=0 op2=9/>
; PF1-NEXT: <INST_STORE op0=2 op1=1 op2=1 op3=0/>
; PF1-NEXT: <INST_STORE op0=2 op1=1 op2=1 op3=0/>
; PF1-NEXT: <INST_RET/>
@@ -188,9 +188,7 @@ define void @AllocCastOpt() {
; TD2-NEXT: %2 = ptrtoint i8* %1 to i32
; TD2-NEXT: %3 = bitcast [4 x i8]* @bytes to i32*
; TD2-NEXT: store i32 %2, i32* %3, align 1
-; TD2-NEXT: %4 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %5 = bitcast [4 x i8]* @bytes to i32*
-; TD2-NEXT: store i32 %4, i32* %5, align 1
+; TD2-NEXT: store i32 %2, i32* %3, align 1
; TD2-NEXT: ret void
; TD2-NEXT: }
@@ -366,7 +364,6 @@ define i32 @StoreGlobalMovePtr2Int() {
; PF1-NEXT: <INST_RET op0=4/>
; PF1-NEXT: </FUNCTION_BLOCK>
-
; TD2: define i32 @StoreGlobalMovePtr2Int() {
; TD2-NEXT: %1 = alloca i8, i32 4, align 8
; TD2-NEXT: %2 = ptrtoint [4 x i8]* @bytes to i32
@@ -430,11 +427,8 @@ define void @CastAddAlloca() {
; TD2-NEXT: %2 = add i32 1, 2
; TD2-NEXT: %3 = ptrtoint i8* %1 to i32
; TD2-NEXT: %4 = add i32 %3, 2
-; TD2-NEXT: %5 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %6 = add i32 1, %5
-; TD2-NEXT: %7 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %8 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %9 = add i32 %7, %8
+; TD2-NEXT: %5 = add i32 1, %3
+; TD2-NEXT: %6 = add i32 %3, %3
; TD2-NEXT: ret void
; TD2-NEXT: }
@@ -491,11 +485,8 @@ define void @CastAddGlobal() {
; TD2-NEXT: %1 = add i32 1, 2
; TD2-NEXT: %2 = ptrtoint [4 x i8]* @bytes to i32
; TD2-NEXT: %3 = add i32 %2, 2
-; TD2-NEXT: %4 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %5 = add i32 1, %4
-; TD2-NEXT: %6 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %7 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %8 = add i32 %6, %7
+; TD2-NEXT: %4 = add i32 1, %2
+; TD2-NEXT: %5 = add i32 %2, %2
; TD2-NEXT: ret void
; TD2-NEXT: }
@@ -571,36 +562,16 @@ define void @CastBinop() {
; TD2-NEXT: %2 = ptrtoint i8* %1 to i32
; TD2-NEXT: %3 = ptrtoint [4 x i8]* @bytes to i32
; TD2-NEXT: %4 = sub i32 %2, %3
-; TD2-NEXT: %5 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %6 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %7 = mul i32 %5, %6
-; TD2-NEXT: %8 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %9 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %10 = udiv i32 %8, %9
-; TD2-NEXT: %11 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %12 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %13 = urem i32 %11, %12
-; TD2-NEXT: %14 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %15 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %16 = srem i32 %14, %15
-; TD2-NEXT: %17 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %18 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %19 = shl i32 %17, %18
-; TD2-NEXT: %20 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %21 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %22 = lshr i32 %20, %21
-; TD2-NEXT: %23 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %24 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %25 = ashr i32 %23, %24
-; TD2-NEXT: %26 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %27 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %28 = and i32 %26, %27
-; TD2-NEXT: %29 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %30 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %31 = or i32 %29, %30
-; TD2-NEXT: %32 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %33 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %34 = xor i32 %32, %33
+; TD2-NEXT: %5 = mul i32 %2, %3
+; TD2-NEXT: %6 = udiv i32 %2, %3
+; TD2-NEXT: %7 = urem i32 %2, %3
+; TD2-NEXT: %8 = srem i32 %2, %3
+; TD2-NEXT: %9 = shl i32 %2, %3
+; TD2-NEXT: %10 = lshr i32 %2, %3
+; TD2-NEXT: %11 = ashr i32 %2, %3
+; TD2-NEXT: %12 = and i32 %2, %3
+; TD2-NEXT: %13 = or i32 %2, %3
+; TD2-NEXT: %14 = xor i32 %2, %3
; TD2-NEXT: ret void
; TD2-NEXT: }
@@ -666,16 +637,16 @@ define void @TestCasts() {
; PF1: </CONSTANTS_BLOCK>
; PF1-NEXT: <INST_ALLOCA op0=2 op1=4/>
; PF1-NEXT: <INST_CAST op0=1 op1=0 op2=9/>
-; PF1-NEXT: <INST_CAST op0=6 op1=1 op2=0/>
-; PF1-NEXT: <INST_CAST op0=2 op1=1 op2=0/>
-; PF1-NEXT: <INST_CAST op0=8 op1=10 op2=1/>
-; PF1-NEXT: <INST_CAST op0=4 op1=10 op2=1/>
-; PF1-NEXT: <INST_CAST op0=9 op1=10 op2=2/>
-; PF1-NEXT: <INST_CAST op0=6 op1=10 op2=2/>
-; PF1-NEXT: <INST_CAST op0=9 op1=11 op2=5/>
-; PF1-NEXT: <INST_CAST op0=8 op1=11 op2=5/>
-; PF1-NEXT: <INST_CAST op0=13 op1=11 op2=6/>
-; PF1-NEXT: <INST_CAST op0=10 op1=11 op2=6/>
+; PF1-NEXT: <INST_CAST op0=6 op1=2 op2=0/>
+; PF1-NEXT: <INST_CAST op0=2 op1=2 op2=0/>
+; PF1-NEXT: <INST_CAST op0=8 op1=13 op2=1/>
+; PF1-NEXT: <INST_CAST op0=4 op1=13 op2=1/>
+; PF1-NEXT: <INST_CAST op0=9 op1=13 op2=2/>
+; PF1-NEXT: <INST_CAST op0=6 op1=13 op2=2/>
+; PF1-NEXT: <INST_CAST op0=9 op1=14 op2=5/>
+; PF1-NEXT: <INST_CAST op0=8 op1=14 op2=5/>
+; PF1-NEXT: <INST_CAST op0=13 op1=14 op2=6/>
+; PF1-NEXT: <INST_CAST op0=10 op1=14 op2=6/>
; PF1-NEXT: <INST_RET/>
; PF1-NEXT: </FUNCTION_BLOCK>
@@ -685,33 +656,29 @@ define void @TestCasts() {
; TD2-NEXT: %3 = ptrtoint i8* %1 to i32
; TD2-NEXT: %4 = trunc i32 %3 to i8
; TD2-NEXT: %5 = zext i32 257 to i64
-; TD2-NEXT: %6 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %7 = zext i32 %6 to i64
-; TD2-NEXT: %8 = sext i32 -1 to i64
-; TD2-NEXT: %9 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %10 = sext i32 %9 to i64
-; TD2-NEXT: %11 = uitofp i32 1 to float
-; TD2-NEXT: %12 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %13 = uitofp i32 %12 to float
-; TD2-NEXT: %14 = sitofp i32 -1 to float
-; TD2-NEXT: %15 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %16 = sitofp i32 %15 to float
+; TD2-NEXT: %6 = zext i32 %3 to i64
+; TD2-NEXT: %7 = sext i32 -1 to i64
+; TD2-NEXT: %8 = sext i32 %3 to i64
+; TD2-NEXT: %9 = uitofp i32 1 to float
+; TD2-NEXT: %10 = uitofp i32 %3 to float
+; TD2-NEXT: %11 = sitofp i32 -1 to float
+; TD2-NEXT: %12 = sitofp i32 %3 to float
; TD2-NEXT: ret void
; TD2-NEXT: }
; PF2: <FUNCTION_BLOCK>
; PF2: </CONSTANTS_BLOCK>
; PF2-NEXT: <INST_ALLOCA op0=2 op1=4/>
-; PF2-NEXT: <INST_CAST op0=5 op1=1 op2=0/>
-; PF2-NEXT: <INST_CAST op0=2 op1=1 op2=0/>
-; PF2-NEXT: <INST_CAST op0=7 op1=10 op2=1/>
-; PF2-NEXT: <INST_CAST op0=4 op1=10 op2=1/>
-; PF2-NEXT: <INST_CAST op0=8 op1=10 op2=2/>
-; PF2-NEXT: <INST_CAST op0=6 op1=10 op2=2/>
-; PF2-NEXT: <INST_CAST op0=8 op1=11 op2=5/>
-; PF2-NEXT: <INST_CAST op0=8 op1=11 op2=5/>
-; PF2-NEXT: <INST_CAST op0=12 op1=11 op2=6/>
-; PF2-NEXT: <INST_CAST op0=10 op1=11 op2=6/>
+; PF2-NEXT: <INST_CAST op0=5 op1=2 op2=0/>
+; PF2-NEXT: <INST_CAST op0=2 op1=2 op2=0/>
+; PF2-NEXT: <INST_CAST op0=7 op1=13 op2=1/>
+; PF2-NEXT: <INST_CAST op0=4 op1=13 op2=1/>
+; PF2-NEXT: <INST_CAST op0=8 op1=13 op2=2/>
+; PF2-NEXT: <INST_CAST op0=6 op1=13 op2=2/>
+; PF2-NEXT: <INST_CAST op0=8 op1=14 op2=5/>
+; PF2-NEXT: <INST_CAST op0=8 op1=14 op2=5/>
+; PF2-NEXT: <INST_CAST op0=12 op1=14 op2=6/>
+; PF2-NEXT: <INST_CAST op0=10 op1=14 op2=6/>
; PF2-NEXT: <INST_RET/>
; PF2-NEXT: </FUNCTION_BLOCK>
@@ -741,7 +708,7 @@ define void @TestSavedPtrToInt() {
; PF1-NEXT: <INST_ALLOCA op0=2 op1=4/>
; PF1-NEXT: <INST_CAST op0=1 op1=0 op2=9/>
; PF1-NEXT: <INST_BINOP op0=1 op1=3 op2=0/>
-; PF1-NEXT: <INST_CALL op0=0 op1=22 op2=2/>
+; PF1-NEXT: <INST_CALL op0=0 op1=26 op2=2/>
; PF1-NEXT: <INST_RET/>
; PF1-NEXT: </FUNCTION_BLOCK>
@@ -758,7 +725,7 @@ define void @TestSavedPtrToInt() {
; PF2-NEXT: <INST_ALLOCA op0=2 op1=4/>
; PF2-NEXT: <INST_CAST op0=1 op1=0 op2=9/>
; PF2-NEXT: <INST_BINOP op0=1 op1=3 op2=0/>
-; PF2-NEXT: <INST_CALL op0=0 op1=22 op2=2/>
+; PF2-NEXT: <INST_CALL op0=0 op1=26 op2=2/>
; PF2-NEXT: <INST_RET/>
; PF2-NEXT: </FUNCTION_BLOCK>
@@ -809,12 +776,8 @@ define void @CastIcmp() {
; TD2-NEXT: %4 = icmp eq i32 %3, 2
; TD2-NEXT: %5 = ptrtoint [4 x i8]* @bytes to i32
; TD2-NEXT: %6 = icmp eq i32 1, %5
-; TD2-NEXT: %7 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %8 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %9 = icmp eq i32 %7, %8
-; TD2-NEXT: %10 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %11 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %12 = icmp eq i32 %10, %11
+; TD2-NEXT: %7 = icmp eq i32 %3, %5
+; TD2-NEXT: %8 = icmp eq i32 %5, %3
; TD2-NEXT: ret void
; TD2-NEXT: }
@@ -876,12 +839,8 @@ define void @CastSelect() {
; TD2-NEXT: %4 = select i1 true, i32 %3, i32 2
; TD2-NEXT: %5 = ptrtoint [4 x i8]* @bytes to i32
; TD2-NEXT: %6 = select i1 true, i32 1, i32 %5
-; TD2-NEXT: %7 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %8 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %9 = select i1 true, i32 %7, i32 %8
-; TD2-NEXT: %10 = ptrtoint [4 x i8]* @bytes to i32
-; TD2-NEXT: %11 = ptrtoint i8* %1 to i32
-; TD2-NEXT: %12 = select i1 true, i32 %10, i32 %11
+; TD2-NEXT: %7 = select i1 true, i32 %3, i32 %5
+; TD2-NEXT: %8 = select i1 true, i32 %5, i32 %3
; TD2-NEXT: ret void
; TD2-NEXT: }
@@ -895,3 +854,447 @@ define void @CastSelect() {
; PF2-NEXT: <INST_VSELECT op0=10 op1=5 op2=6/>
; PF2-NEXT: <INST_RET/>
; PF2-NEXT: </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
+; Show that if a phi node refers to a pointer cast, we add
+; the cast at the end of the incoming block.
+define void @PhiBackwardRefs(i1) {
+ %2 = alloca i8, i32 4, align 8
+ %3 = bitcast i8* %2 to i32*
+ %4 = alloca i8, i32 4, align 8
+ %5 = ptrtoint i8* %4 to i32
+ br i1 %0, label %true, label %false
+
+true:
+ %6 = load i32* %3
+ br label %merge
+
+false:
+ %7 = load i32* %3
+ br label %merge
+
+merge:
+ %8 = phi i32 [%5, %true], [%5, %false]
+ %9 = phi i32 [%6, %true], [%7, %false]
+ ret void
+}
+
+; TD1: define void @PhiBackwardRefs(i1) {
+; TD1-NEXT: %2 = alloca i8, i32 4, align 8
+; TD1-NEXT: %3 = bitcast i8* %2 to i32*
+; TD1-NEXT: %4 = alloca i8, i32 4, align 8
+; TD1-NEXT: %5 = ptrtoint i8* %4 to i32
+; TD1-NEXT: br i1 %0, label %true, label %false
+; TD1: true:
+; TD1-NEXT: %6 = load i32* %3
+; TD1-NEXT: br label %merge
+; TD1: false:
+; TD1-NEXT: %7 = load i32* %3
+; TD1-NEXT: br label %merge
+; TD1: merge:
+; TD1-NEXT: %8 = phi i32 [ %5, %true ], [ %5, %false ]
+; TD1-NEXT: %9 = phi i32 [ %6, %true ], [ %7, %false ]
+; TD1-NEXT: ret void
+; TD1-NEXT: }
+
+; PF1: <FUNCTION_BLOCK>
+; PF1: </CONSTANTS_BLOCK>
+; PF1-NEXT: <INST_ALLOCA op0=1 op1=4/>
+; PF1-NEXT: <INST_CAST op0=1 op1=4 op2=11/>
+; PF1-NEXT: <INST_ALLOCA op0=3 op1=4/>
+; PF1-NEXT: <INST_CAST op0=1 op1=0 op2=9/>
+; PF1-NEXT: <INST_BR op0=1 op1=2 op2=6/>
+; PF1-NEXT: <INST_LOAD op0=3 op1=0 op2=0/>
+; PF1-NEXT: <INST_BR op0=3/>
+; PF1-NEXT: <INST_LOAD op0=4 op1=0 op2=0/>
+; PF1-NEXT: <INST_BR op0=3/>
+; PF1-NEXT: <INST_PHI op0=0 op1=6 op2=1 op3=6 op4=2/>
+; PF1-NEXT: <INST_PHI op0=0 op1=6 op2=1 op3=4 op4=2/>
+; PF1-NEXT: <INST_RET/>
+; PF1: </FUNCTION_BLOCK>
+
+; TD2: define void @PhiBackwardRefs(i1) {
+; TD2-NEXT: %2 = alloca i8, i32 4, align 8
+; TD2-NEXT: %3 = alloca i8, i32 4, align 8
+; TD2-NEXT: br i1 %0, label %true, label %false
+; TD2: true:
+; TD2-NEXT: %4 = bitcast i8* %2 to i32*
+; TD2-NEXT: %5 = load i32* %4
+; TD2-NEXT: %6 = ptrtoint i8* %3 to i32
+; TD2-NEXT: br label %merge
+; TD2: false:
+; TD2-NEXT: %7 = bitcast i8* %2 to i32*
+; TD2-NEXT: %8 = load i32* %7
+; TD2-NEXT: %9 = ptrtoint i8* %3 to i32
+; TD2-NEXT: br label %merge
+; TD2: merge:
+; TD2-NEXT: %10 = phi i32 [ %6, %true ], [ %9, %false ]
+; TD2-NEXT: %11 = phi i32 [ %5, %true ], [ %8, %false ]
+; TD2-NEXT: ret void
+; TD2-NEXT: }
+
+; PF2: <FUNCTION_BLOCK>
+; PF2: </CONSTANTS_BLOCK>
+; PF2-NEXT: <INST_ALLOCA op0=1 op1=4/>
+; PF2-NEXT: <INST_ALLOCA op0=2 op1=4/>
+; PF2-NEXT: <INST_BR op0=1 op1=2 op2=4/>
+; PF2-NEXT: <INST_LOAD op0=2 op1=0 op2=0/>
+; PF2-NEXT: <INST_BR op0=3/>
+; PF2-NEXT: <INST_LOAD op0=3 op1=0 op2=0/>
+; PF2-NEXT: <INST_BR op0=3/>
+; PF2-NEXT: <INST_PHI op0=0 op1=6 op2=1 op3=6 op4=2/>
+; PF2-NEXT: <INST_PHI op0=0 op1=6 op2=1 op3=4 op4=2/>
+; PF2-NEXT: <INST_RET/>
+; PF2: </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
+; Like PhiBackwardRefs, except that the phi nodes forward-reference
+; instructions instead of referring back to earlier ones.
+define void @PhiForwardRefs(i1) {
+ br label %start
+
+merge:
+ %2 = phi i32 [%9, %true], [%9, %false]
+ %3 = phi i32 [%4, %true], [%5, %false]
+ ret void
+
+true:
+ %4 = load i32* %7
+ br label %merge
+
+false:
+ %5 = load i32* %7
+ br label %merge
+
+start:
+ %6 = alloca i8, i32 4, align 8
+ %7 = bitcast i8* %6 to i32*
+ %8 = alloca i8, i32 4, align 8
+ %9 = ptrtoint i8* %8 to i32
+ br i1 %0, label %true, label %false
+}
+
+; TD1: define void @PhiForwardRefs(i1) {
+; TD1-NEXT: br label %start
+; TD1: merge:
+; TD1-NEXT: %2 = phi i32 [ %9, %true ], [ %9, %false ]
+; TD1-NEXT: %3 = phi i32 [ %4, %true ], [ %5, %false ]
+; TD1-NEXT: ret void
+; TD1: true:
+; TD1-NEXT: %4 = load i32* %7
+; TD1-NEXT: br label %merge
+; TD1: false:
+; TD1-NEXT: %5 = load i32* %7
+; TD1-NEXT: br label %merge
+; TD1: start:
+; TD1-NEXT: %6 = alloca i8, i32 4, align 8
+; TD1-NEXT: %7 = bitcast i8* %6 to i32*
+; TD1-NEXT: %8 = alloca i8, i32 4, align 8
+; TD1-NEXT: %9 = ptrtoint i8* %8 to i32
+; TD1-NEXT: br i1 %0, label %true, label %false
+; TD1-NEXT: }
+
+; PF1: <FUNCTION_BLOCK>
+; PF1: </CONSTANTS_BLOCK>
+; PF1-NEXT: <INST_BR op0=4/>
+; PF1-NEXT: <FORWARDTYPEREF op0=30 op1=0/>
+; PF1-NEXT: <INST_PHI op0=0 op1=15 op2=2 op3=15 op4=3/>
+; PF1-NEXT: <FORWARDTYPEREF op0=25 op1=0/>
+; PF1-NEXT: <FORWARDTYPEREF op0=26 op1=0/>
+; PF1-NEXT: <INST_PHI op0=0 op1=3 op2=2 op3=5 op4=3/>
+; PF1-NEXT: <INST_RET/>
+; PF1-NEXT: <FORWARDTYPEREF op0=28 op1=4/>
+; PF1-NEXT: <INST_LOAD op0=4294967293 op1=0 op2=0/>
+; PF1-NEXT: <INST_BR op0=1/>
+; PF1-NEXT: <INST_LOAD op0=4294967294 op1=0 op2=0/>
+; PF1-NEXT: <INST_BR op0=1/>
+; PF1-NEXT: <INST_ALLOCA op0=5 op1=4/>
+; PF1-NEXT: <INST_CAST op0=1 op1=4 op2=11/>
+; PF1-NEXT: <INST_ALLOCA op0=7 op1=4/>
+; PF1-NEXT: <INST_CAST op0=1 op1=0 op2=9/>
+; PF1-NEXT: <INST_BR op0=2 op1=3 op2=10/>
+; PF1: </FUNCTION_BLOCK>
+
+; TD2: define void @PhiForwardRefs(i1) {
+; TD2-NEXT: br label %start
+; TD2: merge
+; TD2-NEXT: %2 = phi i32 [ %6, %true ], [ %9, %false ]
+; TD2-NEXT: %3 = phi i32 [ %5, %true ], [ %8, %false ]
+; TD2-NEXT: ret void
+; TD2: true:
+; TD2-NEXT: %4 = bitcast i8* %10 to i32*
+; TD2-NEXT: %5 = load i32* %4
+; TD2-NEXT: %6 = ptrtoint i8* %11 to i32
+; TD2-NEXT: br label %merge
+; TD2: false:
+; TD2-NEXT: %7 = bitcast i8* %10 to i32*
+; TD2-NEXT: %8 = load i32* %7
+; TD2-NEXT: %9 = ptrtoint i8* %11 to i32
+; TD2-NEXT: br label %merge
+; TD2: start:
+; TD2-NEXT: %10 = alloca i8, i32 4, align 8
+; TD2-NEXT: %11 = alloca i8, i32 4, align 8
+; TD2-NEXT: br i1 %0, label %true, label %false
+; TD2-NEXT: }
+
+; PF2: <FUNCTION_BLOCK>
+; PF2: </CONSTANTS_BLOCK>
+; PF2-NEXT: <INST_BR op0=4/>
+; PF2-NEXT: <FORWARDTYPEREF op0=28 op1=3/>
+; PF2-NEXT: <INST_PHI op0=0 op1=11 op2=2 op3=11 op4=3/>
+; PF2-NEXT: <FORWARDTYPEREF op0=25 op1=0/>
+; PF2-NEXT: <FORWARDTYPEREF op0=26 op1=0/>
+; PF2-NEXT: <INST_PHI op0=0 op1=3 op2=2 op3=5 op4=3/>
+; PF2-NEXT: <INST_RET/>
+; PF2-NEXT: <FORWARDTYPEREF op0=27 op1=3/>
+; PF2-NEXT: <INST_LOAD op0=4294967294 op1=0 op2=0/>
+; PF2-NEXT: <INST_BR op0=1/>
+; PF2-NEXT: <INST_LOAD op0=4294967295 op1=0 op2=0/>
+; PF2-NEXT: <INST_BR op0=1/>
+; PF2-NEXT: <INST_ALLOCA op0=5 op1=4/>
+; PF2-NEXT: <INST_ALLOCA op0=6 op1=4/>
+; PF2-NEXT: <INST_BR op0=2 op1=3 op2=8/>
+; PF2: </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
+; Show that if a phi node incoming block already has a pointer cast,
+; we use it instead of adding one at the end of the block. In this
+; example, we reuse instruction %7 in block true for phi node %10.
+define void @PhiMergeCast(i1) {
+ %2 = alloca i8, i32 4, align 8
+ %3 = bitcast i8* %2 to i32*
+ %4 = alloca i8, i32 4, align 8
+ %5 = ptrtoint i8* %4 to i32
+ br i1 %0, label %true, label %false
+
+true:
+ %6 = load i32* %3
+ %7 = ptrtoint i8* %4 to i32
+ %8 = add i32 %6, %7
+ br label %merge
+
+false:
+ %9 = load i32* %3
+ br label %merge
+
+merge:
+ %10 = phi i32 [%5, %true], [%5, %false]
+ %11 = phi i32 [%6, %true], [%9, %false]
+ ret void
+}
+
+; TD1: define void @PhiMergeCast(i1) {
+; TD1-NEXT: %2 = alloca i8, i32 4, align 8
+; TD1-NEXT: %3 = bitcast i8* %2 to i32*
+; TD1-NEXT: %4 = alloca i8, i32 4, align 8
+; TD1-NEXT: %5 = ptrtoint i8* %4 to i32
+; TD1-NEXT: br i1 %0, label %true, label %false
+; TD1: true:
+; TD1-NEXT: %6 = load i32* %3
+; TD1-NEXT: %7 = ptrtoint i8* %4 to i32
+; TD1-NEXT: %8 = add i32 %6, %7
+; TD1-NEXT: br label %merge
+; TD1: false:
+; TD1-NEXT: %9 = load i32* %3
+; TD1-NEXT: br label %merge
+; TD1: merge:
+; TD1-NEXT: %10 = phi i32 [ %5, %true ], [ %5, %false ]
+; TD1-NEXT: %11 = phi i32 [ %6, %true ], [ %9, %false ]
+; TD1-NEXT: ret void
+; TD1-NEXT: }
+
+; PF1: <FUNCTION_BLOCK>
+; PF1: </CONSTANTS_BLOCK>
+; PF1-NEXT: <INST_ALLOCA op0=1 op1=4/>
+; PF1-NEXT: <INST_CAST op0=1 op1=4 op2=11/>
+; PF1-NEXT: <INST_ALLOCA op0=3 op1=4/>
+; PF1-NEXT: <INST_CAST op0=1 op1=0 op2=9/>
+; PF1-NEXT: <INST_BR op0=1 op1=2 op2=6/>
+; PF1-NEXT: <INST_LOAD op0=3 op1=0 op2=0/>
+; PF1-NEXT: <INST_CAST op0=3 op1=0 op2=9/>
+; PF1-NEXT: <INST_BINOP op0=2 op1=1 op2=0/>
+; PF1-NEXT: <INST_BR op0=3/>
+; PF1-NEXT: <INST_LOAD op0=6 op1=0 op2=0/>
+; PF1-NEXT: <INST_BR op0=3/>
+; PF1-NEXT: <INST_PHI op0=0 op1=10 op2=1 op3=10 op4=2/>
+; PF1-NEXT: <INST_PHI op0=0 op1=10 op2=1 op3=4 op4=2/>
+; PF1-NEXT: <INST_RET/>
+; PF1: </FUNCTION_BLOCK>
+
+; TD2: define void @PhiMergeCast(i1) {
+; TD2-NEXT: %2 = alloca i8, i32 4, align 8
+; TD2-NEXT: %3 = alloca i8, i32 4, align 8
+; TD2-NEXT: br i1 %0, label %true, label %false
+; TD2: true:
+; TD2-NEXT: %4 = bitcast i8* %2 to i32*
+; TD2-NEXT: %5 = load i32* %4
+; TD2-NEXT: %6 = ptrtoint i8* %3 to i32
+; TD2-NEXT: %7 = add i32 %5, %6
+; TD2-NEXT: br label %merge
+; TD2: false:
+; TD2-NEXT: %8 = bitcast i8* %2 to i32*
+; TD2-NEXT: %9 = load i32* %8
+; TD2-NEXT: %10 = ptrtoint i8* %3 to i32
+; TD2-NEXT: br label %merge
+; TD2: merge:
+; TD2-NEXT: %11 = phi i32 [ %6, %true ], [ %10, %false ]
+; TD2-NEXT: %12 = phi i32 [ %5, %true ], [ %9, %false ]
+; TD2-NEXT: ret void
+; TD2-NEXT: }
+
+; PF2: <FUNCTION_BLOCK>
+; PF2: </CONSTANTS_BLOCK>
+; PF2-NEXT: <INST_ALLOCA op0=1 op1=4/>
+; PF2-NEXT: <INST_ALLOCA op0=2 op1=4/>
+; PF2-NEXT: <INST_BR op0=1 op1=2 op2=4/>
+; PF2-NEXT: <INST_LOAD op0=2 op1=0 op2=0/>
+; PF2-NEXT: <INST_BINOP op0=1 op1=2 op2=0/>
+; PF2-NEXT: <INST_BR op0=3/>
+; PF2-NEXT: <INST_LOAD op0=4 op1=0 op2=0/>
+; PF2-NEXT: <INST_BR op0=3/>
+; PF2-NEXT: <INST_PHI op0=0 op1=8 op2=1 op3=8 op4=2/>
+; PF2-NEXT: <INST_PHI op0=0 op1=8 op2=1 op3=4 op4=2/>
+; PF2-NEXT: <INST_RET/>
+; PF2: </FUNCTION_BLOCK>
+
+; ------------------------------------------------------
+
+; Show that we must introduce a cast reference in each
+; reachable block that uses it, but one per block is sufficient.
+define void @LongReachingCasts(i1) {
+ %2 = alloca i8, i32 4, align 8
+ %3 = ptrtoint i8* %2 to i32
+ %4 = bitcast [4 x i8]* @bytes to i32*
+ br i1 %0, label %Split1, label %Split2
+
+Split1:
+ br i1 %0, label %b1, label %b2
+
+Split2:
+ br i1 %0, label %b3, label %b4
+
+b1:
+ store i32 %3, i32* %4, align 1
+ store i32 %3, i32* %4, align 1
+ ret void
+
+b2:
+ store i32 %3, i32* %4, align 1
+ store i32 %3, i32* %4, align 1
+ ret void
+
+b3:
+ store i32 %3, i32* %4, align 1
+ store i32 %3, i32* %4, align 1
+ ret void
+
+b4:
+ store i32 %3, i32* %4, align 1
+ store i32 %3, i32* %4, align 1
+ ret void
+}
+
+; TD1: define void @LongReachingCasts(i1) {
+; TD1-NEXT: %2 = alloca i8, i32 4, align 8
+; TD1-NEXT: %3 = ptrtoint i8* %2 to i32
+; TD1-NEXT: %4 = bitcast [4 x i8]* @bytes to i32*
+; TD1-NEXT: br i1 %0, label %Split1, label %Split2
+; TD1: Split1:
+; TD1-NEXT: br i1 %0, label %b1, label %b2
+; TD1: Split2:
+; TD1-NEXT: br i1 %0, label %b3, label %b4
+; TD1: b1:
+; TD1-NEXT: store i32 %3, i32* %4, align 1
+; TD1-NEXT: store i32 %3, i32* %4, align 1
+; TD1-NEXT: ret void
+; TD1: b2:
+; TD1-NEXT: store i32 %3, i32* %4, align 1
+; TD1-NEXT: store i32 %3, i32* %4, align 1
+; TD1-NEXT: ret void
+; TD1: b3:
+; TD1-NEXT: store i32 %3, i32* %4, align 1
+; TD1-NEXT: store i32 %3, i32* %4, align 1
+; TD1-NEXT: ret void
+; TD1: b4:
+; TD1-NEXT: store i32 %3, i32* %4, align 1
+; TD1-NEXT: store i32 %3, i32* %4, align 1
+; TD1-NEXT: ret void
+; TD1-NEXT: }
+
+; PF1: <FUNCTION_BLOCK>
+; PF1: </CONSTANTS_BLOCK>
+; PF1-NEXT: <INST_ALLOCA op0=1 op1=4/>
+; PF1-NEXT: <INST_CAST op0=1 op1=0 op2=9/>
+; PF1-NEXT: <INST_CAST op0=5 op1=4 op2=11/>
+; PF1-NEXT: <INST_BR op0=1 op1=2 op2=5/>
+; PF1-NEXT: <INST_BR op0=3 op1=4 op2=5/>
+; PF1-NEXT: <INST_BR op0=5 op1=6 op2=5/>
+; PF1-NEXT: <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT: <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT: <INST_RET/>
+; PF1-NEXT: <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT: <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT: <INST_RET/>
+; PF1-NEXT: <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT: <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT: <INST_RET/>
+; PF1-NEXT: <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT: <INST_STORE op0=1 op1=2 op2=1 op3=0/>
+; PF1-NEXT: <INST_RET/>
+; PF1: </FUNCTION_BLOCK>
+
+; TD2: define void @LongReachingCasts(i1) {
+; TD2-NEXT: %2 = alloca i8, i32 4, align 8
+; TD2-NEXT: br i1 %0, label %Split1, label %Split2
+; TD2: Split1:
+; TD2-NEXT: br i1 %0, label %b1, label %b2
+; TD2: Split2:
+; TD2-NEXT: br i1 %0, label %b3, label %b4
+; TD2: b1:
+; TD2-NEXT: %3 = ptrtoint i8* %2 to i32
+; TD2-NEXT: %4 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT: store i32 %3, i32* %4, align 1
+; TD2-NEXT: store i32 %3, i32* %4, align 1
+; TD2-NEXT: ret void
+; TD2: b2:
+; TD2-NEXT: %5 = ptrtoint i8* %2 to i32
+; TD2-NEXT: %6 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT: store i32 %5, i32* %6, align 1
+; TD2-NEXT: store i32 %5, i32* %6, align 1
+; TD2-NEXT: ret void
+; TD2: b3:
+; TD2-NEXT: %7 = ptrtoint i8* %2 to i32
+; TD2-NEXT: %8 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT: store i32 %7, i32* %8, align 1
+; TD2-NEXT: store i32 %7, i32* %8, align 1
+; TD2-NEXT: ret void
+; TD2: b4:
+; TD2-NEXT: %9 = ptrtoint i8* %2 to i32
+; TD2-NEXT: %10 = bitcast [4 x i8]* @bytes to i32*
+; TD2-NEXT: store i32 %9, i32* %10, align 1
+; TD2-NEXT: store i32 %9, i32* %10, align 1
+; TD2-NEXT: ret void
+; TD2-NEXT: }
+
+; PF2: <FUNCTION_BLOCK>
+; PF2: </CONSTANTS_BLOCK>
+; PF2-NEXT: <INST_ALLOCA op0=1 op1=4/>
+; PF2-NEXT: <INST_BR op0=1 op1=2 op2=3/>
+; PF2-NEXT: <INST_BR op0=3 op1=4 op2=3/>
+; PF2-NEXT: <INST_BR op0=5 op1=6 op2=3/>
+; PF2-NEXT: <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT: <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT: <INST_RET/>
+; PF2-NEXT: <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT: <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT: <INST_RET/>
+; PF2-NEXT: <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT: <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT: <INST_RET/>
+; PF2-NEXT: <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT: <INST_STORE op0=4 op1=1 op2=1/>
+; PF2-NEXT: <INST_RET/>
+; PF2: </FUNCTION_BLOCK>