diff options
-rw-r--r-- | include/llvm/InitializePasses.h | 1 | ||||
-rw-r--r-- | include/llvm/Transforms/Scalar.h | 6 | ||||
-rw-r--r-- | lib/Transforms/IPO/PassManagerBuilder.cpp | 14 | ||||
-rw-r--r-- | lib/Transforms/Scalar/CMakeLists.txt | 1 | ||||
-rw-r--r-- | lib/Transforms/Scalar/SROA.cpp | 2628 | ||||
-rw-r--r-- | lib/Transforms/Scalar/Scalar.cpp | 1 | ||||
-rw-r--r-- | test/Transforms/SROA/basictest.ll | 741 | ||||
-rw-r--r-- | test/Transforms/SROA/lit.local.cfg | 1 | ||||
-rw-r--r-- | test/Transforms/SROA/phi-and-select.ll | 325 | ||||
-rw-r--r-- | test/Transforms/SROA/vector-promotion.ll | 191 |
10 files changed, 3907 insertions, 2 deletions
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index c1c833d949..84c2b65f97 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -221,6 +221,7 @@ void initializeRegionOnlyViewerPass(PassRegistry&); void initializeRegionPrinterPass(PassRegistry&); void initializeRegionViewerPass(PassRegistry&); void initializeSCCPPass(PassRegistry&); +void initializeSROAPass(PassRegistry&); void initializeSROA_DTPass(PassRegistry&); void initializeSROA_SSAUpPass(PassRegistry&); void initializeScalarEvolutionAliasAnalysisPass(PassRegistry&); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 3dce6fe37f..18114ce293 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -70,6 +70,12 @@ FunctionPass *createAggressiveDCEPass(); //===----------------------------------------------------------------------===// // +// SROA - Replace aggregates or pieces of aggregates with scalar SSA values. +// +FunctionPass *createSROAPass(); + +//===----------------------------------------------------------------------===// +// // ScalarReplAggregates - Break up alloca's of aggregates into multiple allocas // if possible. // diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 43b4ab5efa..66d979cf2a 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -40,6 +40,10 @@ UseGVNAfterVectorization("use-gvn-after-vectorization", cl::init(false), cl::Hidden, cl::desc("Run GVN instead of Early CSE after vectorization passes")); +static cl::opt<bool> UseNewSROA("use-new-sroa", + cl::init(true), cl::Hidden, + cl::desc("Enable the new, experimental SROA pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -100,7 +104,10 @@ void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) { addInitialAliasAnalysisPasses(FPM); FPM.add(createCFGSimplificationPass()); - FPM.add(createScalarReplAggregatesPass()); + if (UseNewSROA) + FPM.add(createSROAPass()); + else + FPM.add(createScalarReplAggregatesPass()); FPM.add(createEarlyCSEPass()); FPM.add(createLowerExpectIntrinsicPass()); } @@ -265,7 +272,10 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createInstructionCombiningPass()); PM.add(createJumpThreadingPass()); // Break up allocas - PM.add(createScalarReplAggregatesPass()); + if (UseNewSROA) + PM.add(createSROAPass()); + else + PM.add(createScalarReplAggregatesPass()); // Run a few AA driven optimizations here and now, to cleanup the code. PM.add(createFunctionAttrsPass()); // Add nocapture. diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index a01e0661b1..b3fc6e338c 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMScalarOpts Reassociate.cpp Reg2Mem.cpp SCCP.cpp + SROA.cpp Scalar.cpp ScalarReplAggregates.cpp SimplifyCFGPass.cpp diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp new file mode 100644 index 0000000000..9ecefa6199 --- /dev/null +++ b/lib/Transforms/Scalar/SROA.cpp @@ -0,0 +1,2628 @@ +//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This transformation implements the well known scalar replacement of +/// aggregates transformation. It tries to identify promotable elements of an +/// aggregate alloca, and promote them to registers. It will also try to +/// convert uses of an element (or set of elements) of an alloca into a vector +/// or bitfield-style integer scalar if appropriate. +/// +/// It works to do this with minimal slicing of the alloca so that regions +/// which are merely transferred in and out of external memory remain unchanged +/// and are not decomposed to scalar code. +/// +/// Because this also performs alloca promotion, it can be thought of as also +/// serving the purpose of SSA formation. The algorithm iterates on the +/// function until all opportunities for promotion have been realized. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sroa" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Operator.h" +#include "llvm/Pass.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/InstVisitor.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +using namespace llvm; + +STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); +STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); +STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); +STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); +STATISTIC(NumDeleted, "Number of instructions deleted"); +STATISTIC(NumVectorized, "Number of vectorized aggregates"); + +namespace { +/// \brief Alloca partitioning representation. +/// +/// This class represents a partitioning of an alloca into slices, and +/// information about the nature of uses of each slice of the alloca. The goal +/// is that this information is sufficient to decide if and how to split the +/// alloca apart and replace slices with scalars. It is also intended that this +/// structure can capture the relevant information needed both due decide about +/// and to enact these transformations. +class AllocaPartitioning { +public: + /// \brief A common base class for representing a half-open byte range. + struct ByteRange { + /// \brief The beginning offset of the range. + uint64_t BeginOffset; + + /// \brief The ending offset, not included in the range. + uint64_t EndOffset; + + ByteRange() : BeginOffset(), EndOffset() {} + ByteRange(uint64_t BeginOffset, uint64_t EndOffset) + : BeginOffset(BeginOffset), EndOffset(EndOffset) {} + + /// \brief Support for ordering ranges. + /// + /// This provides an ordering over ranges such that start offsets are + /// always increasing, and within equal start offsets, the end offsets are + /// decreasing. Thus the spanning range comes first in in cluster with the + /// same start position. + bool operator<(const ByteRange &RHS) const { + if (BeginOffset < RHS.BeginOffset) return true; + if (BeginOffset > RHS.BeginOffset) return false; + if (EndOffset > RHS.EndOffset) return true; + return false; + } + + /// \brief Support comparison with a single offset to allow binary searches. + bool operator<(uint64_t RHSOffset) const { + return BeginOffset < RHSOffset; + } + + bool operator==(const ByteRange &RHS) const { + return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; + } + bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } + }; + + /// \brief A partition of an alloca. + /// + /// This structure represents a contiguous partition of the alloca. These are + /// formed by examining the uses of the alloca. During formation, they may + /// overlap but once an AllocaPartitioning is built, the Partitions within it + /// are all disjoint. + struct Partition : public ByteRange { + /// \brief Whether this partition is splittable into smaller partitions. + /// + /// We flag partitions as splittable when they are formed entirely due to + /// accesses by trivially split operations such as memset and memcpy. + /// + /// FIXME: At some point we should consider loads and stores of FCAs to be + /// splittable and eagerly split them into scalar values. + bool IsSplittable; + + Partition() : ByteRange(), IsSplittable() {} + Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) + : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} + }; + + /// \brief A particular use of a partition of the alloca. + /// + /// This structure is used to associate uses of a partition with it. They + /// mark the range of bytes which are referenced by a particular instruction, + /// and includes a handle to the user itself and the pointer value in use. + /// The bounds of these uses are determined by intersecting the bounds of the + /// memory use itself with a particular partition. As a consequence there is + /// intentionally overlap between various usues of the same partition. + struct PartitionUse : public ByteRange { + /// \brief The user of this range of the alloca. + AssertingVH<Instruction> User; + + /// \brief The particular pointer value derived from this alloca in use. + AssertingVH<Instruction> Ptr; + + PartitionUse() : ByteRange(), User(), Ptr() {} + PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, + Instruction *User, Instruction *Ptr) + : ByteRange(BeginOffset, EndOffset), User(User), Ptr(Ptr) {} + }; + + /// \brief Construct a partitioning of a particular alloca. + /// + /// Construction does most of the work for partitioning the alloca. This + /// performs the necessary walks of users and builds a partitioning from it. + AllocaPartitioning(const TargetData &TD, AllocaInst &AI); + + /// \brief Test whether a pointer to the allocation escapes our analysis. + /// + /// If this is true, the partitioning is never fully built and should be + /// ignored. + bool isEscaped() const { return PointerEscapingInstr; } + + /// \brief Support for iterating over the partitions. + /// @{ + typedef SmallVectorImpl<Partition>::iterator iterator; + iterator begin() { return Partitions.begin(); } + iterator end() { return Partitions.end(); } + + typedef SmallVectorImpl<Partition>::const_iterator const_iterator; + const_iterator begin() const { return Partitions.begin(); } + const_iterator end() const { return Partitions.end(); } + /// @} + + /// \brief Support for iterating over and manipulating a particular + /// partition's uses. + /// + /// The iteration support provided for uses is more limited, but also + /// includes some manipulation routines to support rewriting the uses of + /// partitions during SROA. + /// @{ + typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; + use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } + use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } + use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } + use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } + void use_insert(unsigned Idx, use_iterator UI, const PartitionUse &U) { + Uses[Idx].insert(UI, U); + } + void use_insert(const_iterator I, use_iterator UI, const PartitionUse &U) { + Uses[I - begin()].insert(UI, U); + } + void use_erase(unsigned Idx, use_iterator UI) { Uses[Idx].erase(UI); } + void use_erase(const_iterator I, use_iterator UI) { + Uses[I - begin()].erase(UI); + } + + typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; + const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } + const_use_iterator use_begin(const_iterator I) const { + return Uses[I - begin()].begin(); + } + const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } + const_use_iterator use_end(const_iterator I) const { + return Uses[I - begin()].end(); + } + /// @} + + /// \brief Allow iterating the dead users for this alloca. + /// + /// These are instructions which will never actually use the alloca as they + /// are outside the allocated range. They are safe to replace with undef and + /// delete. + /// @{ + typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; + dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } + dead_user_iterator dead_user_end() const { return DeadUsers.end(); } + /// @} + + /// \brief Allow iterating the dead operands referring to this alloca. + /// + /// These are operands which have cannot actually be used to refer to the + /// alloca as they are outside its range and the user doesn't correct for + /// that. These mostly consist of PHI node inputs and the like which we just + /// need to replace with undef. + /// @{ + typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; + dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } + dead_op_iterator dead_op_end() const { return DeadOperands.end(); } + /// @} + + /// \brief MemTransferInst auxiliary data. + /// This struct provides some auxiliary data about memory transfer + /// intrinsics such as memcpy and memmove. These intrinsics can use two + /// different ranges within the same alloca, and provide other challenges to + /// correctly represent. We stash extra data to help us untangle this + /// after the partitioning is complete. + struct MemTransferOffsets { + uint64_t DestBegin, DestEnd; + uint64_t SourceBegin, SourceEnd; + bool IsSplittable; + }; + MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { + return MemTransferInstData.lookup(&II); + } + + /// \brief Map from a PHI or select operand back to a partition. + /// + /// When manipulating PHI nodes or selects, they can use more than one + /// partition of an alloca. We store a special mapping to allow finding the + /// partition referenced by each of these operands, if any. + iterator findPartitionForPHIOrSelectOperand(Instruction &I, Value *Op) { + SmallDenseMap<std::pair<Instruction *, Value *>, + std::pair<unsigned, unsigned> >::const_iterator MapIt + = PHIOrSelectOpMap.find(std::make_pair(&I, Op)); + if (MapIt == PHIOrSelectOpMap.end()) + return end(); + + return begin() + MapIt->second.first; + } + + /// \brief Map from a PHI or select operand back to the specific use of + /// a partition. + /// + /// Similar to mapping these operands back to the partitions, this maps + /// directly to the use structure of that partition. + use_iterator findPartitionUseForPHIOrSelectOperand(Instruction &I, + Value *Op) { + SmallDenseMap<std::pair<Instruction *, Value *>, + std::pair<unsigned, unsigned> >::const_iterator MapIt + = PHIOrSelectOpMap.find(std::make_pair(&I, Op)); + assert(MapIt != PHIOrSelectOpMap.end()); + return Uses[MapIt->second.first].begin() + MapIt->second.second; + } + + /// \brief Compute a common type among the uses of a particular partition. + /// + /// This routines walks all of the uses of a particular partition and tries + /// to find a common type between them. Untyped operations such as memset and + /// memcpy are ignored. + Type *getCommonType(iterator I) const; + + void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; + void printUsers(raw_ostream &OS, const_iterator I, + StringRef Indent = " ") const; + void print(raw_ostream &OS) const; + void dump(const_iterator I) const LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED; + void dump() const LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED; + +private: + template <typename DerivedT, typename RetT = void> class BuilderBase; + class PartitionBuilder; + friend class AllocaPartitioning::PartitionBuilder; + class UseBuilder; + friend class AllocaPartitioning::UseBuilder; + + /// \brief Handle to alloca instruction to simplify method interfaces. + AllocaInst &AI; + + /// \brief The instruction responsible for this alloca having no partitioning. + /// + /// When an instruction (potentially) escapes the pointer to the alloca, we + /// store a pointer to that here and abort trying to partition the alloca. + /// This will be null if the alloca is partitioned successfully. + Instruction *PointerEscapingInstr; + + /// \brief The partitions of the alloca. + /// + /// We store a vector of the partitions over the alloca here. This vector is + /// sorted by increasing begin offset, and then by decreasing end offset. See + /// the Partition inner class for more details. Initially there are overlaps, + /// be during construction we form a disjoint sequence toward the end. + SmallVector<Partition, 8> Partitions; + + /// \brief The uses of the partitions. + /// + /// This is essentially a mapping from each partition to a list of uses of + /// that partition. The mapping is done with a Uses vector that has the exact + /// same number of entries as the partition vector. Each entry is itself + /// a vector of the uses. + SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; + + /// \brief Instructions which will become dead if we rewrite the alloca. + /// + /// Note that these are not separated by partition. This is because we expect + /// a partitioned alloca to be completely rewritten or not rewritten at all. + /// If rewritten, all these instructions can simply be removed and replaced + /// with undef as they come from outside of the allocated space. + SmallVector<Instruction *, 8> DeadUsers; + + /// \brief Operands which will become dead if we rewrite the alloca. + /// + /// These are operands that in their particular use can be replaced with + /// undef when we rewrite the alloca. These show up in out-of-bounds inputs + /// to PHI nodes and the like. They aren't entirely dead (there might be + /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we + /// want to swap this particular input for undef to simplify the use lists of + /// the alloca. + SmallVector<Use *, 8> DeadOperands; + + /// \brief The underlying storage for auxiliary memcpy and memset info. + SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; + + /// \brief A side datastructure used when building up the partitions and uses. + /// + /// This mapping is only really used during the initial building of the + /// partitioning so that we can retain information about PHI and select nodes + /// processed. + SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; + + /// \brief Auxiliary information for particular PHI or select operands. + SmallDenseMap<std::pair<Instruction *, Value *>, + std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; + + /// \brief A utility routine called from the constructor. + /// + /// This does what it says on the tin. It is the key of the alloca partition + /// splitting and merging. After it is called we have the desired disjoint + /// collection of partitions. + void splitAndMergePartitions(); +}; +} + +template <typename DerivedT, typename RetT> +class AllocaPartitioning::BuilderBase + : public InstVisitor<DerivedT, RetT> { +public: + BuilderBase(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P) + : TD(TD), + AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), + P(P) { + enqueueUsers(AI, 0); + } + +protected: + const TargetData &TD; + const uint64_t AllocSize; + AllocaPartitioning &P; + + struct OffsetUse { + Use *U; + uint64_t Offset; + }; + SmallVector<OffsetUse, 8> Queue; + + // The active offset and use while visiting. + Use *U; + uint64_t Offset; + + void enqueueUsers(Instruction &I, uint64_t UserOffset) { + SmallPtrSet<User *, 8> UserSet; + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); + UI != UE; ++UI) { + if (!UserSet.insert(*UI)) + continue; + + OffsetUse OU = { &UI.getUse(), UserOffset }; + Queue.push_back(OU); + } + } + + bool computeConstantGEPOffset(GetElementPtrInst &GEPI, uint64_t &GEPOffset) { + GEPOffset = Offset; + for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI); + GTI != GTE; ++GTI) { + ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); + if (!OpC) + return false; + if (OpC->isZero()) + continue; + + // Handle a struct index, which adds its field offset to the pointer. + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + unsigned ElementIdx = OpC->getZExtValue(); + const StructLayout *SL = TD.getStructLayout(STy); + GEPOffset += SL->getElementOffset(ElementIdx); + continue; + } + + GEPOffset + += OpC->getZExtValue() * TD.getTypeAllocSize(GTI.getIndexedType()); + } + return true; + } + + Value *foldSelectInst(SelectInst &SI) { + // If the condition being selected on is a constant or the same value is + // being selected between, fold the select. Yes this does (rarely) happen + // early on. + if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) + return SI.getOperand(1+CI->isZero()); + if (SI.getOperand(1) == SI.getOperand(2)) { + assert(*U == SI.getOperand(1)); + return SI.getOperand(1); + } + return 0; + } +}; + +/// \brief Builder for the alloca partitioning. +/// +/// This class builds an alloca partitioning by recursively visiting the uses +/// of an alloca and splitting the partitions for each load and store at each +/// offset. +class AllocaPartitioning::PartitionBuilder + : public BuilderBase<PartitionBuilder, bool> { + friend class InstVisitor<PartitionBuilder, bool>; + + SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; + +public: + PartitionBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P) + : BuilderBase(TD, AI, P) {} + + /// \brief Run the builder over the allocation. + bool operator()() { + // Note that we have to re-evaluate size on each trip through the loop as + // the queue grows at the tail. + for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) { + U = Queue[Idx].U; + Offset = Queue[Idx].Offset; + if (!visit(cast<Instruction>(U->getUser()))) + return false; + } + return true; + } + +private: + bool markAsEscaping(Instruction &I) { + P.PointerEscapingInstr = &I; + return false; + } + + void insertUse(Instruction &I, uint64_t Size, bool IsSplittable = false) { + uint64_t BeginOffset = Offset, EndOffset = Offset + Size; + + // Completely skip uses which start outside of the allocation. + if (BeginOffset >= AllocSize) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset + << " which starts past the end of the " << AllocSize + << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + return; + } + + // Clamp the size to the allocation. + if (EndOffset > AllocSize) { + DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset + << " to remain within the " << AllocSize << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + EndOffset = AllocSize; + } + + // See if we can just add a user onto the last slot currently occupied. + if (!P.Partitions.empty() && + P.Partitions.back().BeginOffset == BeginOffset && + P.Partitions.back().EndOffset == EndOffset) { + P.Partitions.back().IsSplittable &= IsSplittable; + return; + } + + Partition New(BeginOffset, EndOffset, IsSplittable); + P.Partitions.push_back(New); + } + + bool handleLoadOrStore(Type *Ty, Instruction &I) { + uint64_t Size = TD.getTypeStoreSize(Ty); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. We also try to handle cases which might run the + // risk of overflow. + // FIXME: We should instead consider the pointer to have escaped if this + // function is being instrumented for addressing bugs or race conditions. + if (Offset >= AllocSize || Size > AllocSize || Offset + Size > AllocSize) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " + << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset + << " which extends past the end of the " << AllocSize + << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + return true; + } + + insertUse(I, Size); + return true; + } + + bool visitBitCastInst(BitCastInst &BC) { + enqueueUsers(BC, Offset); + return true; + } + + bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { + //unsigned IntPtrWidth = TD->getPointerSizeInBits(); + //assert(IntPtrWidth == Offset.getBitWidth()); + uint64_t GEPOffset; + if (!computeConstantGEPOffset(GEPI, GEPOffset)) + return markAsEscaping(GEPI); + + enqueueUsers(GEPI, GEPOffset); + return true; + } + + bool visitLoadInst(LoadInst &LI) { + return handleLoadOrStore(LI.getType(), LI); + } + + bool visitStoreInst(StoreInst &SI) { + if (SI.getOperand(0) == *U) + return markAsEscaping(SI); + + return handleLoadOrStore(SI.getOperand(0)->getType(), SI); + } + + + bool visitMemSetInst(MemSetInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + insertUse(II, Length ? Length->getZExtValue() : AllocSize - Offset, Length); + return true; + } + + bool visitMemTransferInst(MemTransferInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; + if (!Size) + // Zero-length mem transfer intrinsics can be ignored entirely. + return true; + + MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; + + // Only intrinsics with a constant length can be split. + Offsets.IsSplittable = Length; + + if (*U != II.getRawDest()) { + assert(*U == II.getRawSource()); + Offsets.SourceBegin = Offset; + Offsets.SourceEnd = Offset + Size; + } else { + Offsets.DestBegin = Offset; + Offsets.DestEnd = Offset + Size; + } + + insertUse(II, Size, Offsets.IsSplittable); + unsigned NewIdx = P.Partitions.size() - 1; + + SmallDenseMap<Instruction *, unsigned>::const_iterator PMI; + bool Inserted = false; + llvm::tie(PMI, Inserted) + = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)); + if (!Inserted && Offsets.IsSplittable) { + // We've found a memory transfer intrinsic which refers to the alloca as + // both a source and dest. We refuse to split these to simplify splitting + // logic. If possible, SROA will still split them into separate allocas + // and then re-analyze. + Offsets.IsSplittable = false; + P.Partitions[PMI->second].IsSplittable = false; + P.Partitions[NewIdx].IsSplittable = false; + } + + return true; + } + + // Disable SRoA for any intrinsics except for lifetime invariants. + bool visitIntrinsicInst(IntrinsicInst &II) { + if (II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end) { + ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); + uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue()); + insertUse(II, Size, true); + return true; + } + + return markAsEscaping(II); + } + + Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { + // We consider any PHI or select that results in a direct load or store of + // the same offset to be a viable use for partitioning purposes. These uses + // are considered unsplittable and the size is the maximum loaded or stored + // size. + SmallPtrSet<Instruction *, 4> Visited; + SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; + Visited.insert(Root); + Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); + do { + Instruction *I, *UsedI; + llvm::tie(UsedI, I) = Uses.pop_back_val(); + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Size = std::max(Size, TD.getTypeStoreSize(LI->getType())); + continue; + } + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Value *Op = SI->getOperand(0); + if (Op == UsedI) + return SI; + Size = std::max(Size, TD.getTypeStoreSize(Op->getType())); + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (!GEP->hasAllZeroIndices()) + return GEP; + } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && + !isa<SelectInst>(I)) { + return I; + } + + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; + ++UI) + if (Visited.insert(cast<Instruction>(*UI))) + Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); + } while (!Uses.empty()); + + return 0; + } + + bool visitPHINode(PHINode &PN) { + // See if we already have computed info on this node. + std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; + if (PHIInfo.first) { + PHIInfo.second = true; + insertUse(PN, PHIInfo.first); + return true; + } + + // Check for an unsafe use of the PHI node. + if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) + return markAsEscaping(*EscapingI); + + insertUse(PN, PHIInfo.first); + return true; + } + + bool visitSelectInst(SelectInst &SI) { + if (Value *Result = foldSelectInst(SI)) { + if (Result == *U) + // If the result of the constant fold will be the pointer, recurse + // through the select as if we had RAUW'ed it. + enqueueUsers(SI, Offset); + + return true; + } + + // See if we already have computed info on this node. + std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; + if (SelectInfo.first) { + SelectInfo.second = true; + insertUse(SI, SelectInfo.first); + return true; + } + + // Check for an unsafe use of the PHI node. + if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) + return markAsEscaping(*EscapingI); + + insertUse(SI, SelectInfo.first); + return true; + } + + /// \brief Disable SROA entirely if there are unhandled users of the alloca. + bool visitInstruction(Instruction &I) { return markAsEscaping(I); } +}; + + +/// \brief Use adder for the alloca partitioning. +/// +/// This class adds the uses of an alloca to all of the partitions which it +/// uses. For splittable partitions, this can end up doing essentially a linear +/// walk of the partitions, but the number of steps remains bounded by the +/// total result instruction size: +/// - The number of partitions is a result of the number unsplittable +/// instructions using the alloca. +/// - The number of users of each partition is at worst the total number of +/// splittable instructions using the alloca. +/// Thus we will produce N * M instructions in the end, where N are the number +/// of unsplittable uses and M are the number of splittable. This visitor does +/// the exact same number of updates to the partitioning. +/// +/// In the more common case, this visitor will leverage the fact that the +/// partition space is pre-sorted, and do a logarithmic search for the +/// partition needed, making the total visit a classical ((N + M) * log(N)) +/// complexity operation. +class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> { + friend class InstVisitor<UseBuilder>; + + /// \brief Set to de-duplicate dead instructions found in the use walk. + SmallPtrSet<Instruction *, 4> VisitedDeadInsts; + +public: + UseBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P) + : BuilderBase(TD, AI, P) {} + + /// \brief Run the builder over the allocation. + void operator()() { + // Note that we have to re-evaluate size on each trip through the loop as + // the queue grows at the tail. + for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) { + U = Queue[Idx].U; + Offset = Queue[Idx].Offset; + this->visit(cast<Instruction>(U->getUser())); + } + } + +private: + void markAsDead(Instruction &I) { + if (VisitedDeadInsts.insert(&I)) + P.DeadUsers.push_back(&I); + } + + void insertUse(uint64_t Size, Instruction &User) { + uint64_t BeginOffset = Offset, EndOffset = Offset + Size; + + // If the use extends outside of the allocation, record it as a dead use + // for elimination later. + if (BeginOffset >= AllocSize || Size == 0) + return markAsDead(User); + + // Bound the use by the size of the allocation. + if (EndOffset > AllocSize) + EndOffset = AllocSize; + + // NB: This only works if we have zero overlapping partitions. + iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset); + if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset) + B = llvm::prior(B); + for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset; + ++I) { + PartitionUse NewUse(std::max(I->BeginOffset, BeginOffset), + std::min(I->EndOffset, EndOffset), + &User, cast<Instruction>(*U)); + P.Uses[I - P.begin()].push_back(NewUse); + if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) + P.PHIOrSelectOpMap[std::make_pair(&User, U->get())] + = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); + } + } + + void handleLoadOrStore(Type *Ty, Instruction &I) { + uint64_t Size = TD.getTypeStoreSize(Ty); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. + if (Offset >= AllocSize || Size > AllocSize || Offset + Size > AllocSize) + return markAsDead(I); + + insertUse(Size, I); + } + + void visitBitCastInst(BitCastInst &BC) { + if (BC.use_empty()) + return markAsDead(BC); + + |