Diffstat (limited to 'lib/Transforms')
-rw-r--r--  lib/Transforms/IPO/BarrierNoopPass.cpp              |  47
-rw-r--r--  lib/Transforms/IPO/CMakeLists.txt                   |   1
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp           |  14
-rw-r--r--  lib/Transforms/Instrumentation/AddressSanitizer.cpp |   8
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp        | 134
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp                      | 167
-rw-r--r--  lib/Transforms/Scalar/SimplifyLibCalls.cpp          |  47
-rw-r--r--  lib/Transforms/Utils/LowerInvoke.cpp                |  32
-rw-r--r--  lib/Transforms/Utils/SimplifyLibCalls.cpp           |  55
-rw-r--r--  lib/Transforms/Vectorize/CMakeLists.txt             |   1
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp          | 885
-rw-r--r--  lib/Transforms/Vectorize/Vectorize.cpp              |   8
12 files changed, 1178 insertions, 221 deletions
diff --git a/lib/Transforms/IPO/BarrierNoopPass.cpp b/lib/Transforms/IPO/BarrierNoopPass.cpp
new file mode 100644
index 0000000000..2e32240621
--- /dev/null
+++ b/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -0,0 +1,47 @@
+//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: DO NOT USE THIS IF AVOIDABLE
+//
+// This pass is a nonce pass intended to allow manipulation of the implicitly
+// nesting pass manager. For example, it can be used to cause a CGSCC pass
+// manager to be closed prior to running a new collection of function passes.
+//
+// FIXME: This is a huge HACK. This should be removed when the pass manager's
+// nesting is made explicit instead of implicit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+namespace {
+/// \brief A nonce module pass used to place a barrier in a pass manager.
+///
+/// There is no mechanism for ending a CGSCC pass manager once one is started.
+/// This prevents extension points from having clear deterministic ordering
+/// when they are phrased as non-module passes.
+class BarrierNoop : public ModulePass {
+public:
+  static char ID; // Pass identification.
+
+  BarrierNoop() : ModulePass(ID) {
+    initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) { return false; }
+};
+}
+
+ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); }
+
+char BarrierNoop::ID = 0;
+INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass",
+                false, false)
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 3f6b1de614..90c1c33e6d 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(LLVMipo
   ArgumentPromotion.cpp
+  BarrierNoopPass.cpp
   ConstantMerge.cpp
   DeadArgumentElimination.cpp
   ExtractGV.cpp
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 9e328b9ac9..04163f751f 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -119,6 +119,14 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
       MPM.add(Inliner);
       Inliner = 0;
     }
+
+    // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
+    // pass manager, but we don't want to add extensions into that pass manager.
+    // To prevent this we must insert a no-op module pass to reset the pass
+    // manager to get the same behavior as EP_OptimizerLast in non-O0 builds.
+    if (!GlobalExtensions->empty() || !Extensions.empty())
+      MPM.add(createBarrierNoopPass());
+
     addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
     return;
   }
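An editorial aside, not part of the commit: the barrier only matters when someone has registered extensions, because anything added at EP_EnabledOnOptLevel0 lands right after the inliner and would otherwise be pulled into the CGSCC pass manager the inliner implicitly opened. A minimal sketch against the legacy PassManagerBuilder API of this era; the driver function name is made up.

```cpp
#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;

// Extension callback appended at EP_EnabledOnOptLevel0. Thanks to the barrier
// above it runs in the top-level module pass manager, not inside the CGSCC
// pass manager created for the inliner.
static void addMem2Reg(const PassManagerBuilder &, PassManagerBase &PM) {
  PM.add(createPromoteMemoryToRegisterPass());
}

void buildO0Pipeline(PassManager &MPM) { // hypothetical driver helper
  PassManagerBuilder Builder;
  Builder.OptLevel = 0;
  Builder.Inliner = createAlwaysInlinerPass();
  Builder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, addMem2Reg);
  Builder.populateModulePassManager(MPM); // barrier precedes the extensions
}
```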
@@ -176,6 +184,12 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   MPM.add(createIndVarSimplifyPass());  // Canonicalize indvars
   MPM.add(createLoopIdiomPass());       // Recognize idioms like memset.
   MPM.add(createLoopDeletionPass());    // Delete dead loops
+
+  if (Vectorize) {
+    MPM.add(createLoopVectorizePass());
+    MPM.add(createLICMPass());
+  }
+
   if (!DisableUnrollLoops)
     MPM.add(createLoopUnrollPass());    // Unroll small loops
   addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b566994edf..75f42f30fa 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -534,7 +534,7 @@ void AddressSanitizer::createInitializerPoisonCalls(Module &M,
 bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
   Type *Ty = cast<PointerType>(G->getType())->getElementType();
-  DEBUG(dbgs() << "GLOBAL: " << *G);
+  DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
   if (BL->isIn(*G)) return false;
   if (!Ty->isSized()) return false;
@@ -682,7 +682,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
     FirstDynamic = LastDynamic;
   }
-    DEBUG(dbgs() << "NEW GLOBAL:\n" << *NewGlobal);
+    DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
   }
   ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n);
@@ -851,6 +851,7 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
 bool AddressSanitizer::runOnFunction(Function &F) {
   if (BL->isIn(F)) return false;
   if (&F == AsanCtorFunction) return false;
+  DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
   // If needed, insert __asan_init before checking for AddressSafety attr.
   maybeInsertAsanInitAtFunctionEntry(F);
@@ -914,8 +915,6 @@ bool AddressSanitizer::runOnFunction(Function &F) {
     NumInstrumented++;
   }
-  DEBUG(dbgs() << F);
-
   bool ChangedStack = poisonStackInFunction(F);
   // We must unpoison the stack before every NoReturn call (throw, _exit, etc).
@@ -925,6 +924,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
     IRBuilder<> IRB(CI);
     IRB.CreateCall(AsanHandleNoReturnFunc);
   }
+  DEBUG(dbgs() << "ASAN done instrumenting:\n" << F << "\n");
   return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
 }
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 99a62dbe62..958348d9fa 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -37,7 +37,7 @@
 //
 // TODO: Handle multiple loops at a time.
 //
-// TODO: Should AddrMode::BaseGV be changed to a ConstantExpr
+// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
 // instead of a GlobalValue?
 //
 // TODO: When truncation is free, truncate ICmp users' operands to make it a
@@ -67,7 +67,6 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/TargetTransformInfo.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/DenseSet.h"
@@ -75,6 +74,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
 #include <algorithm>
 using namespace llvm;
@@ -1118,7 +1118,7 @@ public:
   enum KindType {
     Basic,   ///< A normal use, with no folding.
     Special, ///< A special case of basic, allowing -1 scales.
-    Address, ///< An address use; folding according to ScalarTargetTransformInfo.
+    Address, ///< An address use; folding according to TargetLowering
     ICmpZero ///< An equality icmp with both operands folded into one.
              // TODO: Add a generic icmp too?
   };
@@ -1272,12 +1272,12 @@ void LSRUse::dump() const {
 /// address-mode folding and special icmp tricks.
 static bool isLegalUse(const AddrMode &AM,
                        LSRUse::KindType Kind, Type *AccessTy,
-                       const ScalarTargetTransformInfo *STTI) {
+                       const TargetLowering *TLI) {
   switch (Kind) {
   case LSRUse::Address:
     // If we have low-level target information, ask the target if it can
    // completely fold this address.
-    if (STTI) return STTI->isLegalAddressingMode(AM, AccessTy);
+    if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
     // Otherwise, just guess that reg+reg addressing is legal.
     return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
@@ -1300,7 +1300,7 @@ static bool isLegalUse(const AddrMode &AM,
   // If we have low-level target information, ask the target if it can fold an
   // integer immediate on an icmp.
   if (AM.BaseOffs != 0) {
-    if (!STTI)
+    if (!TLI)
       return false;
     // We have one of:
     // ICmpZero     BaseReg + Offset => ICmp BaseReg, -Offset
     // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset
     int64_t Offs = AM.BaseOffs;
     if (AM.Scale == 0)
       Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN.
-    return STTI->isLegalICmpImmediate(Offs);
+    return TLI->isLegalICmpImmediate(Offs);
   }
   // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
@@ -1330,20 +1330,20 @@ static bool isLegalUse(AddrMode AM,
 static bool isLegalUse(AddrMode AM,
                        int64_t MinOffset, int64_t MaxOffset,
                        LSRUse::KindType Kind, Type *AccessTy,
-                       const ScalarTargetTransformInfo *LTTI) {
+                       const TargetLowering *TLI) {
   // Check for overflow.
   if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
       (MinOffset > 0))
     return false;
   AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
-  if (isLegalUse(AM, Kind, AccessTy, LTTI)) {
+  if (isLegalUse(AM, Kind, AccessTy, TLI)) {
     AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
     // Check for overflow.
     if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
         (MaxOffset > 0))
       return false;
     AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
-    return isLegalUse(AM, Kind, AccessTy, LTTI);
+    return isLegalUse(AM, Kind, AccessTy, TLI);
   }
   return false;
 }
@@ -1352,7 +1352,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
                              GlobalValue *BaseGV,
                              bool HasBaseReg,
                              LSRUse::KindType Kind, Type *AccessTy,
-                             const ScalarTargetTransformInfo *LTTI) {
+                             const TargetLowering *TLI) {
   // Fast-path: zero is always foldable.
   if (BaseOffs == 0 && !BaseGV) return true;
@@ -1371,14 +1371,14 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
     AM.HasBaseReg = true;
   }
-  return isLegalUse(AM, Kind, AccessTy, LTTI);
+  return isLegalUse(AM, Kind, AccessTy, TLI);
 }
 static bool isAlwaysFoldable(const SCEV *S,
                              int64_t MinOffset, int64_t MaxOffset,
                              bool HasBaseReg,
                              LSRUse::KindType Kind, Type *AccessTy,
-                             const ScalarTargetTransformInfo *LTTI,
+                             const TargetLowering *TLI,
                              ScalarEvolution &SE) {
   // Fast-path: zero is always foldable.
   if (S->isZero()) return true;
@@ -1402,7 +1402,7 @@ static bool isAlwaysFoldable(const SCEV *S,
   AM.HasBaseReg = HasBaseReg;
   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
-  return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, LTTI);
+  return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
 }
 namespace {
@@ -1502,7 +1502,7 @@ class LSRInstance {
   ScalarEvolution &SE;
   DominatorTree &DT;
   LoopInfo &LI;
-  const ScalarTargetTransformInfo *const STTI;
+  const TargetLowering *const TLI;
   Loop *const L;
   bool Changed;
@@ -1638,7 +1638,7 @@ class LSRInstance {
                          Pass *P);
 public:
-  LSRInstance(const ScalarTargetTransformInfo *ltti, Loop *l, Pass *P);
+  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
   bool getChanged() const { return Changed; }
@@ -1688,10 +1688,11 @@ void LSRInstance::OptimizeShadowIV() {
     }
     if (!DestTy) continue;
-    if (STTI) {
+    if (TLI) {
       // If target does not support DestTy natively then do not apply
       // this transformation.
-      if (!STTI->isTypeLegal(DestTy)) continue;
+      EVT DVT = TLI->getValueType(DestTy);
+      if (!TLI->isTypeLegal(DVT)) continue;
     }
     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
@@ -2014,18 +2015,18 @@ LSRInstance::OptimizeLoopTermCond() {
         if (C->getValue().getMinSignedBits() >= 64 ||
             C->getValue().isMinSignedValue())
           goto decline_post_inc;
-        // Without STTI, assume that any stride might be valid, and so any
+        // Without TLI, assume that any stride might be valid, and so any
         // use might be shared.
-        if (!STTI)
+        if (!TLI)
           goto decline_post_inc;
         // Check for possible scaled-address reuse.
         Type *AccessTy = getAccessType(UI->getUser());
         AddrMode AM;
         AM.Scale = C->getSExtValue();
-        if (STTI->isLegalAddressingMode(AM, AccessTy))
+        if (TLI->isLegalAddressingMode(AM, AccessTy))
          goto decline_post_inc;
         AM.Scale = -AM.Scale;
-        if (STTI->isLegalAddressingMode(AM, AccessTy))
+        if (TLI->isLegalAddressingMode(AM, AccessTy))
           goto decline_post_inc;
       }
   }
@@ -2096,12 +2097,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
   // Conservatively assume HasBaseReg is true for now.
   if (NewOffset < LU.MinOffset) {
     if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
-                          Kind, AccessTy, STTI))
+                          Kind, AccessTy, TLI))
       return false;
     NewMinOffset = NewOffset;
   } else if (NewOffset > LU.MaxOffset) {
     if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
-                          Kind, AccessTy, STTI))
+                          Kind, AccessTy, TLI))
       return false;
     NewMaxOffset = NewOffset;
   }
@@ -2130,7 +2131,7 @@ LSRInstance::getUse(const SCEV *&Expr,
   int64_t Offset = ExtractImmediate(Expr, SE);
   // Basic uses can't accept any offset, for example.
-  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, STTI)) {
+  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
     Expr = Copy;
     Offset = 0;
   }
@@ -2395,7 +2396,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
 /// TODO: Consider IVInc free if it's already used in another chains.
 static bool
 isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
-                  ScalarEvolution &SE, const ScalarTargetTransformInfo *STTI) {
+                  ScalarEvolution &SE, const TargetLowering *TLI) {
   if (StressIVChain)
     return true;
@@ -2653,7 +2654,7 @@ void LSRInstance::CollectChains() {
   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
        UsersIdx < NChains; ++UsersIdx) {
     if (!isProfitableChain(IVChainVec[UsersIdx],
-                           ChainUsersVec[UsersIdx].FarUsers, SE, STTI))
+                           ChainUsersVec[UsersIdx].FarUsers, SE, TLI))
       continue;
     // Preserve the chain at UsesIdx.
     if (ChainIdx != UsersIdx)
@@ -2680,8 +2681,7 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
 /// Return true if the IVInc can be folded into an addressing mode.
 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
-                             Value *Operand,
-                             const ScalarTargetTransformInfo *STTI) {
+                             Value *Operand, const TargetLowering *TLI) {
   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
   if (!IncConst || !isAddressUse(UserInst, Operand))
     return false;
@@ -2691,7 +2691,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
   int64_t IncOffset = IncConst->getValue()->getSExtValue();
   if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false,
-                        LSRUse::Address, getAccessType(UserInst), STTI))
+                        LSRUse::Address, getAccessType(UserInst), TLI))
     return false;
   return true;
@@ -2762,7 +2762,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
       // If an IV increment can't be folded, use it as the next IV value.
       if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand,
-                            STTI)) {
+                            TLI)) {
         assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
         IVSrc = IVOper;
         LeftOverExpr = 0;
@@ -3108,7 +3108,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
     // into an immediate field.
     if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
                          Base.getNumRegs() > 1,
-                         LU.Kind, LU.AccessTy, STTI, SE))
+                         LU.Kind, LU.AccessTy, TLI, SE))
       continue;
     // Collect all operands except *J.
@@ -3122,7 +3122,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
     if (InnerAddOps.size() == 1 &&
         isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
                          Base.getNumRegs() > 1,
-                         LU.Kind, LU.AccessTy, STTI, SE))
+                         LU.Kind, LU.AccessTy, TLI, SE))
       continue;
     const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
@@ -3132,9 +3132,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
     // Add the remaining pieces of the add back into the new formula.
     const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
-    if (STTI && InnerSumSC &&
+    if (TLI && InnerSumSC &&
         SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
-        STTI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+        TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
                                   InnerSumSC->getValue()->getZExtValue())) {
       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
                          InnerSumSC->getValue()->getZExtValue();
@@ -3144,8 +3144,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
     // Add J as its own register, or an unfolded immediate.
     const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
-    if (STTI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
-        STTI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+    if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+        TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
                                   SC->getValue()->getZExtValue()))
       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
                          SC->getValue()->getZExtValue();
@@ -3205,7 +3205,7 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
     Formula F = Base;
     F.AM.BaseGV = GV;
     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
-                    LU.Kind, LU.AccessTy, STTI))
+                    LU.Kind, LU.AccessTy, TLI))
       continue;
     F.BaseRegs[i] = G;
     (void)InsertFormula(LU, LUIdx, F);
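For readers unfamiliar with the AddrMode bundle threaded through all of these calls, here is an editorial illustration (not code from the patch; the helper name is made up) of how LSR phrases an addressing-mode query to TargetLowering, including the no-target-info fallback the pass itself uses.

```cpp
#include "llvm/Target/TargetLowering.h"
#include "llvm/Type.h"
using namespace llvm;

// Hypothetical helper: can the target fold "BaseReg + Scale*IndexReg + Offset"
// into a single addressing mode for an access of type AccessTy?
static bool canFoldAddress(const TargetLowering *TLI, Type *AccessTy,
                           int64_t Offset, int64_t Scale) {
  TargetLowering::AddrMode AM;
  AM.BaseGV = 0;          // no global symbol as the base
  AM.BaseOffs = Offset;   // constant displacement
  AM.HasBaseReg = true;   // a base register is present
  AM.Scale = Scale;       // scaled index register (0 means none)
  if (!TLI)               // no target info: guess that reg+reg is legal
    return Offset == 0 && Scale <= 1;
  return TLI->isLegalAddressingMode(AM, AccessTy);
}
```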
@@ -3230,7 +3230,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
     Formula F = Base;
     F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
     if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
-                   LU.Kind, LU.AccessTy, STTI)) {
+                   LU.Kind, LU.AccessTy, TLI)) {
       // Add the offset to the base register.
       const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
       // If it cancelled out, drop the base register, otherwise update it.
@@ -3250,7 +3250,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
     Formula F = Base;
     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
-                    LU.Kind, LU.AccessTy, STTI))
+                    LU.Kind, LU.AccessTy, TLI))
       continue;
     F.BaseRegs[i] = G;
     (void)InsertFormula(LU, LUIdx, F);
@@ -3297,7 +3297,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
       F.AM.BaseOffs = NewBaseOffs;
       // Check that this scale is legal.
-      if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, STTI))
+      if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
         continue;
       // Compensate for the use having MinOffset built into it.
@@ -3353,12 +3353,12 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
   Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
   // Check whether this scale is going to be legal.
   if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
-                  LU.Kind, LU.AccessTy, STTI)) {
+                  LU.Kind, LU.AccessTy, TLI)) {
     // As a special-case, handle special out-of-loop Basic users specially.
     // TODO: Reconsider this special case.
     if (LU.Kind == LSRUse::Basic &&
         isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
-                   LSRUse::Special, LU.AccessTy, STTI) &&
+                   LSRUse::Special, LU.AccessTy, TLI) &&
         LU.AllFixupsOutsideLoop)
       LU.Kind = LSRUse::Special;
     else
@@ -3391,8 +3391,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
 /// GenerateTruncates - Generate reuse formulae from different IV types.
 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
-  // This requires ScalarTargetTransformInfo to tell us which truncates are free.
-  if (!STTI) return;
+  // This requires TargetLowering to tell us which truncates are free.
+  if (!TLI) return;
   // Don't bother truncating symbolic values.
   if (Base.AM.BaseGV) return;
@@ -3405,7 +3405,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
   for (SmallSetVector<Type *, 4>::const_iterator
        I = Types.begin(), E = Types.end(); I != E; ++I) {
     Type *SrcTy = *I;
-    if (SrcTy != DstTy && STTI->isTruncateFree(SrcTy, DstTy)) {
+    if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
       Formula F = Base;
       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
@@ -3561,7 +3561,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
         Formula NewF = F;
         NewF.AM.BaseOffs = Offs;
         if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
-                        LU.Kind, LU.AccessTy, STTI))
+                        LU.Kind, LU.AccessTy, TLI))
           continue;
         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
@@ -3586,9 +3586,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
           Formula NewF = F;
           NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
           if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
-                          LU.Kind, LU.AccessTy, STTI)) {
-            if (!STTI ||
-                !STTI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+                          LU.Kind, LU.AccessTy, TLI)) {
+            if (!TLI ||
+                !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
               continue;
             NewF = F;
             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
@@ -3900,7 +3900,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
         Formula &F = LUThatHas->Formulae[i];
         if (!isLegalUse(F.AM, LUThatHas->MinOffset, LUThatHas->MaxOffset,
-                        LUThatHas->Kind, LUThatHas->AccessTy, STTI)) {
+                        LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
           DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
           LUThatHas->DeleteFormula(F);
@@ -4589,12 +4589,12 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
 }
-LSRInstance::LSRInstance(const ScalarTargetTransformInfo *stti, Loop *l, Pass *P)
+LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
   : IU(P->getAnalysis<IVUsers>()),
     SE(P->getAnalysis<ScalarEvolution>()),
     DT(P->getAnalysis<DominatorTree>()),
     LI(P->getAnalysis<LoopInfo>()),
-    STTI(stti), L(l), Changed(false), IVIncInsertPos(0) {
+    TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {
   // If LoopSimplify form is not available, stay out of trouble.
   if (!L->isLoopSimplifyForm())
@@ -4684,7 +4684,7 @@ LSRInstance::LSRInstance(const ScalarTargetTransformInfo *stti, Loop *l, Pass *P
     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
          JE = LU.Formulae.end(); J != JE; ++J)
       assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
-                        LU.Kind, LU.AccessTy, STTI) &&
+                        LU.Kind, LU.AccessTy, TLI) &&
              "Illegal formula generated!");
   };
 #endif
@@ -4757,13 +4757,13 @@ void LSRInstance::dump() const {
 namespace {
 class LoopStrengthReduce : public LoopPass {
-  /// ScalarTargetTransformInfo provides target information that is needed
-  /// for strength reducing loops.
-  const ScalarTargetTransformInfo *STTI;
+  /// TLI - Keep a pointer of a TargetLowering to consult for determining
+  /// transformation profitability.
+  const TargetLowering *const TLI;
 public:
   static char ID; // Pass ID, replacement for typeid
-  LoopStrengthReduce();
+  explicit LoopStrengthReduce(const TargetLowering *tli = 0);
 private:
   bool runOnLoop(Loop *L, LPPassManager &LPM);
@@ -4783,12 +4783,13 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
                     "Loop Strength Reduction", false, false)
-Pass *llvm::createLoopStrengthReducePass() {
-  return new LoopStrengthReduce();
+
+Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
+  return new LoopStrengthReduce(TLI);
 }
-LoopStrengthReduce::LoopStrengthReduce()
-  : LoopPass(ID), STTI(0) {
+LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
+  : LoopPass(ID), TLI(tli) {
   initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
 }
@@ -4814,13 +4815,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
   bool Changed = false;
-  TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>();
-
-  if (TTI)
-    STTI = TTI->getScalarTargetTransformInfo();
-
   // Run the main LSR transformation.
-  Changed |= LSRInstance(STTI, L, this).getChanged();
+  Changed |= LSRInstance(TLI, L, this).getChanged();
   // Remove any extra phis created by processing inner loops.
   Changed |= DeleteDeadPHIs(L->getHeader());
@@ -4831,7 +4827,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
     Rewriter.setDebugType(DEBUG_TYPE);
 #endif
     unsigned numFolded = Rewriter.
-      replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, STTI);
+      replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI);
     if (numFolded) {
       Changed = true;
       DeleteTriviallyDeadInstructions(DeadInsts);
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 3e84a91c1d..377a6250de 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -132,9 +132,6 @@ public:
   ///
   /// We flag partitions as splittable when they are formed entirely due to
   /// accesses by trivially splittable operations such as memset and memcpy.
-  ///
-  /// FIXME: At some point we should consider loads and stores of FCAs to be
-  /// splittable and eagerly split them into scalar values.
   bool IsSplittable;
   /// \brief Test whether a partition has been marked as dead.
@@ -1785,9 +1782,9 @@ static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD,
       break;
     if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
       ElementTy = SeqTy->getElementType();
-      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(
-          ElementTy->isPointerTy() ?
-          cast<PointerType>(ElementTy)->getAddressSpace(): 0), 0)));
+      // Note that we use the default address space as this index is over an
+      // array or a vector, not a pointer.
+      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0)));
     } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
       if (STy->element_begin() == STy->element_end())
         break; // Nothing left to descend into.
@@ -1828,7 +1825,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
     if (ElementSizeInBits % 8)
       return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
     APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
-    APInt NumSkippedElements = Offset.udiv(ElementSize);
+    APInt NumSkippedElements = Offset.sdiv(ElementSize);
     if (NumSkippedElements.ugt(VecTy->getNumElements()))
       return 0;
     Offset -= NumSkippedElements * ElementSize;
@@ -1840,7 +1837,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
     Type *ElementTy = ArrTy->getElementType();
     APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
-    APInt NumSkippedElements = Offset.udiv(ElementSize);
+    APInt NumSkippedElements = Offset.sdiv(ElementSize);
     if (NumSkippedElements.ugt(ArrTy->getNumElements()))
       return 0;
@@ -1896,7 +1893,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
   APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
   if (ElementSize == 0)
     return 0; // Zero-length arrays can't help us build a natural GEP.
-  APInt NumSkippedElements = Offset.udiv(ElementSize);
+  APInt NumSkippedElements = Offset.sdiv(ElementSize);
   Offset -= NumSkippedElements * ElementSize;
   Indices.push_back(IRB.getInt(NumSkippedElements));
@@ -2211,6 +2208,48 @@ static bool isIntegerWideningViable(const DataLayout &TD,
   return WholeAllocaOp;
 }
+static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+                             IntegerType *Ty, uint64_t Offset,
+                             const Twine &Name) {
+  IntegerType *IntTy = cast<IntegerType>(V->getType());
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element extends past full value");
+  uint64_t ShAmt = 8*Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt)
+    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot extract to a larger integer!");
+  if (Ty != IntTy)
+    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+  return V;
+}
+
+static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old,
+                            Value *V, uint64_t Offset, const Twine &Name) {
+  IntegerType *IntTy = cast<IntegerType>(Old->getType());
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot insert a larger integer!");
+  if (Ty != IntTy)
+    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element store outside of alloca store");
+  uint64_t ShAmt = 8*Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt)
+    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+
+  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+    APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+    Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+    V = IRB.CreateOr(Old, V, Name + ".insert");
+  }
+  return V;
+}
+
 namespace {
 /// \brief Visitor to rewrite instructions using a partition of an alloca to
 /// use a new alloca.
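The two helpers just added do their work with plain shift/mask arithmetic. A worked little-endian example (editorial sketch in ordinary C++, not part of the patch) of what insertInteger() computes when a 16-bit element is stored at byte offset 2 of a 64-bit alloca:

```cpp
#include <cassert>
#include <cstdint>

// Mirror of insertInteger() for IntTy = i64, Ty = i16, little-endian.
uint64_t insertU16At(uint64_t Old, uint16_t V, unsigned ByteOffset) {
  unsigned ShAmt = 8 * ByteOffset;        // ShAmt = 8*Offset
  uint64_t Wide = (uint64_t)V << ShAmt;   // CreateZExt + CreateShl
  uint64_t Mask = ~(0xFFFFull << ShAmt);  // ~zext(i16 ones).shl(ShAmt)
  return (Old & Mask) | Wide;             // CreateAnd + CreateOr
}

int main() {
  // Bytes 2..3 (0x3333) are replaced; all other bytes are preserved.
  assert(insertU16At(0x1111222233334444ull, 0xBEEF, 2) ==
         0x11112222BEEF4444ull);
  return 0;
}
```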
@@ -2371,60 +2410,6 @@ private:
     return IRB.getInt32(Index);
   }
-  Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy,
-                        uint64_t Offset) {
-    assert(IntTy && "We cannot extract an integer from the alloca");
-    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                     getName(".load"));
-    V = convertValue(TD, IRB, V, IntTy);
-    assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset");
-    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
-    assert(TD.getTypeStoreSize(TargetTy) + RelOffset <=
-           TD.getTypeStoreSize(IntTy) &&
-           "Element load outside of alloca store");
-    uint64_t ShAmt = 8*RelOffset;
-    if (TD.isBigEndian())
-      ShAmt = 8*(TD.getTypeStoreSize(IntTy) -
-                 TD.getTypeStoreSize(TargetTy) - RelOffset);
-    if (ShAmt)
-      V = IRB.CreateLShr(V, ShAmt, getName(".shift"));
-    assert(TargetTy->getBitWidth() <= IntTy->getBitWidth() &&
-           "Cannot extract to a larger integer!");
-    if (TargetTy != IntTy)
-      V = IRB.CreateTrunc(V, TargetTy, getName(".trunc"));
-    return V;
-  }
-
-  StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) {
-    IntegerType *Ty = cast<IntegerType>(V->getType());
-    assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
-           "Cannot insert a larger integer!");
-    if (Ty != IntTy)
-      V = IRB.CreateZExt(V, IntTy, getName(".ext"));
-    assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset");
-    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
-    assert(TD.getTypeStoreSize(Ty) + RelOffset <=
-           TD.getTypeStoreSize(IntTy) &&
-           "Element store outside of alloca store");
-    uint64_t ShAmt = 8*RelOffset;
-    if (TD.isBigEndian())
-      ShAmt = 8*(TD.getTypeStoreSize(IntTy) - TD.getTypeStoreSize(Ty) -
-                 RelOffset);
-    if (ShAmt)
-      V = IRB.CreateShl(V, ShAmt, getName(".shift"));
-
-    if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
-      APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
-      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         getName(".oldload"));
-      Old = convertValue(TD, IRB, Old, IntTy);
-      Old = IRB.CreateAnd(Old, Mask, getName(".mask"));
-      V = IRB.CreateOr(Old, V, getName(".insert"));
-    }
-    V = convertValue(TD, IRB, V, NewAllocaTy);
-    return IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
-  }
-
   void deleteIfTriviallyDead(Value *V) {
     Instruction *I = cast<Instruction>(V);
     if (isInstructionTriviallyDead(I))
@@ -2452,12 +2437,18 @@ private:
   }
   bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) {
+    assert(IntTy && "We cannot insert an integer to the alloca");
     assert(!LI.isVolatile());
-    Value *Result = extractInteger(IRB, cast<IntegerType>(LI.getType()),
-                                   BeginOffset);
-    LI.replaceAllUsesWith(Result);
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                     getName(".load"));
+    V = convertValue(TD, IRB, V, IntTy);
+    assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+    uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+    V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
+                       getName(".extract"));
+    LI.replaceAllUsesWith(V);
     Pass.DeadInsts.push_back(&LI);
-    DEBUG(dbgs() << "          to: " << *Result << "\n");
+    DEBUG(dbgs() << "          to: " << *V << "\n");
     return true;
   }
@@ -2519,8 +2510,20 @@ private:
   }
   bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) {
+    assert(IntTy && "We cannot extract an integer from the alloca");
     assert(!SI.isVolatile());
-    StoreInst *Store = insertInteger(IRB, SI.getValueOperand(), BeginOffset);
+    Value *V = SI.getValueOperand();
+    if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         getName(".oldload"));
+      Old = convertValue(TD, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
+                        getName(".insert"));
+    }
+    V = convertValue(TD, IRB, V, NewAllocaTy);
+    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
     Pass.DeadInsts.push_back(&SI);
     (void)Store;
     DEBUG(dbgs() << "          to: " << *Store << "\n");
@@ -2652,10 +2655,12 @@ private:
     if (IntTy && (BeginOffset > NewAllocaBeginOffset ||
                   EndOffset < NewAllocaEndOffset)) {
       assert(!II.isVolatile());
-      StoreInst *Store = insertInteger(IRB, V, BeginOffset);
-      (void)Store;
-      DEBUG(dbgs() << "          to: " << *Store << "\n");
-      return true;
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         getName(".oldload"));
+      Old = convertValue(TD, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
     }
     if (V->getType() != AllocaTy)
@@ -2811,17 +2816,25 @@ private:
                                   getIndex(IRB, BeginOffset),
                                   getName(".copyextract"));
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
-      Src = extractInteger(IRB, SubIntTy, BeginOffset);
+      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                  getName(".load"));
+      Src = convertValue(TD, IRB, Src, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract"));
     } else {
       Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
                                   getName(".copyload"));
     }
     if (IntTy && !IsWholeAlloca && IsDest) {
-      StoreInst *Store = insertInteger(IRB, Src, BeginOffset);
-      (void)Store;
-      DEBUG(dbgs() << "          to: " << *Store << "\n");
-      return true;
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         getName(".oldload"));
+      Old = convertValue(TD, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert"));
+      Src = convertValue(TD, IRB, Src, NewAllocaTy);
     }
     if (IsVectorElement && IsDest) {
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index d86c4cbc9f..90efa8ae0e 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -135,47 +135,6 @@ static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) {
 namespace {
 //===---------------------------------------===//
-// 'strcpy' Optimizations
-
-struct StrCpyOpt : public LibCallOptimization {
-  bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall.
-
-  StrCpyOpt(bool c) : OptChkCall(c) {}
-
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // Verify the "strcpy" function prototype.
-    unsigned NumParams = OptChkCall ? 3 : 2;
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != NumParams ||
-        FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != B.getInt8PtrTy())
-      return 0;
-
-    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
-    if (Dst == Src)      // strcpy(x,x)  -> x
-      return Src;
-
-    // These optimizations require DataLayout.
-    if (!TD) return 0;
-
-    // See if we can get the length of the input string.
-    uint64_t Len = GetStringLength(Src);
-    if (Len == 0) return 0;
-
-    // We have enough information to now generate the memcpy call to do the
-    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
-    if (!OptChkCall ||
-        !EmitMemCpyChk(Dst, Src,
-                       ConstantInt::get(TD->getIntPtrType(*Context), Len),
-                       CI->getArgOperand(2), B, TD, TLI))
-      B.CreateMemCpy(Dst, Src,
-                     ConstantInt::get(TD->getIntPtrType(*Context), Len), 1);
-    return Dst;
-  }
-};
-
-//===---------------------------------------===//
 // 'stpcpy' Optimizations
 struct StpCpyOpt: public LibCallOptimization {
@@ -1275,7 +1234,6 @@ namespace {
     StringMap<LibCallOptimization*> Optimizations;
     // String and Memory LibCall Optimizations
-    StrCpyOpt StrCpy; StrCpyOpt StrCpyChk;
     StpCpyOpt StpCpy; StpCpyOpt StpCpyChk;
     StrNCpyOpt StrNCpy;
     StrLenOpt StrLen; StrPBrkOpt StrPBrk;
@@ -1295,8 +1253,7 @@ namespace {
     bool Modified;  // This is only used by doInitialization.
   public:
     static char ID; // Pass identification
-    SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true),
-                         StpCpy(false), StpCpyChk(true),
+    SimplifyLibCalls() : FunctionPass(ID), StpCpy(false), StpCpyChk(true),
                          UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true) {
       initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
     }
@@ -1348,7 +1305,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
 /// we know.
 void SimplifyLibCalls::InitOptimizations() {
   // String and Memory LibCall Optimizations
-  Optimizations["strcpy"] = &StrCpy;
   Optimizations["strncpy"] = &StrNCpy;
   Optimizations["stpcpy"] = &StpCpy;
   Optimizations["strlen"] = &StrLen;
@@ -1369,7 +1325,6 @@ void SimplifyLibCalls::InitOptimizations() {
   AddOpt(LibFunc::memset, &MemSet);
   // _chk variants of String and Memory LibCall Optimizations.
-  Optimizations["__strcpy_chk"] = &StrCpyChk;
   Optimizations["__stpcpy_chk"] = &StpCpyChk;
   // Math Library Optimizations
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index f35cbbdde5..930555424d 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -45,10 +45,10 @@
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/TargetTransformInfo.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
 #include <csetjmp>
 #include <set>
 using namespace llvm;
@@ -70,14 +70,15 @@ namespace {
     Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn;
     bool useExpensiveEHSupport;
-    // We peek in STTI to grab the target's jmp_buf size and alignment
-    const ScalarTargetTransformInfo *STTI;
+    // We peek in TLI to grab the target's jmp_buf size and alignment
+    const TargetLowering *TLI;
   public:
     static char ID; // Pass identification, replacement for typeid
-    explicit LowerInvoke(bool useExpensiveEHSupport = ExpensiveEHSupport)
+    explicit LowerInvoke(const TargetLowering *tli = NULL,
+                         bool useExpensiveEHSupport = ExpensiveEHSupport)
       : FunctionPass(ID), useExpensiveEHSupport(useExpensiveEHSupport),
-        STTI(0) {
+        TLI(tli) {
       initializeLowerInvokePass(*PassRegistry::getPassRegistry());
     }
     bool doInitialization(Module &M);
@@ -107,24 +108,21 @@ INITIALIZE_PASS(LowerInvoke, "lowerinvoke",
 char &llvm::LowerInvokePassID = LowerInvoke::ID;
 // Public Interface To the LowerInvoke pass.
-FunctionPass *llvm::createLowerInvokePass() {
-  return new LowerInvoke(ExpensiveEHSupport);
+FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) {
+  return new LowerInvoke(TLI, ExpensiveEHSupport);
 }
-FunctionPass *llvm::createLowerInvokePass(bool useExpensiveEHSupport) {
-  return new LowerInvoke(useExpensiveEHSupport);
+FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI,
+                                          bool useExpensiveEHSupport) {
+  return new LowerInvoke(TLI, useExpensiveEHSupport);
 }
 // doInitialization - Make sure that there is a prototype for abort in the
 // current module.
 bool LowerInvoke::doInitialization(Module &M) {
-  TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>();
-  if (TTI)
-    STTI = TTI->getScalarTargetTransformInfo();
-
   Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
   if (useExpensiveEHSupport) {
     // Insert a type for the linked list of jump buffers.
-    unsigned JBSize = STTI ? STTI->getJumpBufSize() : 0;
+    unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0;
     JBSize = JBSize ? JBSize : 200;
     Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize);
@@ -432,7 +430,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
   // Create an alloca for the incoming jump buffer ptr and the new jump buffer
   // that needs to be restored on all exits from the function.  This is an
   // alloca because the value needs to be live across invokes.
-  unsigned Align = STTI ? STTI->getJumpBufAlignment() : 0;
+  unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0;
   AllocaInst *JmpBuf =
     new AllocaInst(JBLinkTy, 0, Align, "jblink", F.begin()->begin());
@@ -577,10 +575,6 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
 }
 bool LowerInvoke::runOnFunction(Function &F) {
-  TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>();
-  if (TTI)
-    STTI = TTI->getScalarTargetTransformInfo();
-
   if (useExpensiveEHSupport)
     return insertExpensiveEHSupport(F);
   else
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index bd28ec3527..b15acdff63 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -183,14 +183,30 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization {
         FT->getParamType(2) != TD->getIntPtrType(Context))
       return 0;
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src)      // __strcpy_chk(x,x)  -> x
+      return Src;
+
     // If a) we don't have any length information, or b) we know this will
     // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
     // st[rp]cpy_chk call which may fail at runtime if the size is too long.
     // TODO: It might be nice to get a maximum length out of the possible
     // string lengths for varying.
     if (isFoldable(2, 1, true)) {
-      Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD,
-                              TLI, Name.substr(2, 6));
+      Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6));
       return Ret;
+    } else {
+      // Maybe we can still fold __strcpy_chk to __memcpy_chk.
+      uint64_t Len = GetStringLength(Src);
+      if (Len == 0) return 0;
+
+      // This optimization requires DataLayout.
+      if (!TD) return 0;
+
+      Value *Ret =
+        EmitMemCpyChk(Dst, Src,
+                      ConstantInt::get(TD->getIntPtrType(Context), Len),
+                      CI->getArgOperand(2), B, TD, TLI);
       return Ret;
     }
     return 0;
@@ -497,6 +513,35 @@ struct StrNCmpOpt : public LibCallOptimization {
   }
 };
+struct StrCpyOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcpy" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy())
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src)      // strcpy(x,x)  -> x
+      return Src;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+
+    // We have enough information to now generate the memcpy call to do the
+    // copy for us.  Make a memcpy to copy the nul byte with align = 1.
+    B.CreateMemCpy(Dst, Src,
+                   ConstantInt::get(TD->getIntPtrType(*Context), Len), 1);
+    return Dst;
+  }
+};
+
 } // End anonymous namespace.
 namespace llvm {
@@ -520,6 +565,7 @@ class LibCallSimplifierImpl {
   StrRChrOpt StrRChr;
   StrCmpOpt StrCmp;
   StrNCmpOpt StrNCmp;
+  StrCpyOpt StrCpy;
   void initOptimizations();
 public:
@@ -540,14 +586,15 @@ void LibCallSimplifierImpl::initOptimizations() {
   Optimizations["__stpcpy_chk"] = &StrCpyChk;
   Optimizations["__strncpy_chk"] = &StrNCpyChk;
   Optimizations["__stpncpy_chk"] = &StrNCpyChk;
-  Optimizations["strcmp"] = &StrCmp;
-  Optimizations["strncmp"] = &StrNCmp;
   // String and memory library call optimizations.
   Optimizations["strcat"] = &StrCat;
   Optimizations["strncat"] = &StrNCat;
   Optimizations["strchr"] = &StrChr;
   Optimizations["strrchr"] = &StrRChr;
+  Optimizations["strcmp"] = &StrCmp;
+  Optimizations["strncmp"] = &StrNCmp;
+  Optimizations["strcpy"] = &StrCpy;
 }
 Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
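In source-level terms, the StrCpyOpt fold just introduced (and the __strcpy_chk variant above it) amounts to the following rewrite whenever the source string's length is statically known. An editorial illustration, not code from the patch:

```cpp
#include <cstring>

// Before: the generic library call.
void copyBefore(char *Dst) { strcpy(Dst, "hello"); }

// After: a fixed-size memcpy covering the five characters plus the nul byte,
// which later passes can lower or inline far more aggressively.
void copyAfter(char *Dst) { memcpy(Dst, "hello", 6); }
```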
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 06cf1e4e53..e64034ab26 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_library(LLVMVectorize
   BBVectorize.cpp
   Vectorize.cpp
+  LoopVectorize.cpp
   )
 add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
new file mode 100644
index 0000000000..9bbd9ab60b
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -0,0 +1,885 @@
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple loop vectorizer. We currently only support single block
+// loops. We have a very simple and restrictive legality check: we need to read
+// and write from disjoint memory locations. We still don't have a cost model.
+// This pass has three parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A helper class that checks for the legality
+//    of the vectorization.
+// 3. SingleBlockLoopVectorizer - A helper class that performs the actual
+//    widening of instructions.
+//
+//===----------------------------------------------------------------------===//
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Value.h"
+#include "llvm/Function.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+static cl::opt<unsigned>
+DefaultVectorizationFactor("default-loop-vectorize-width",
+                           cl::init(4), cl::Hidden,
+                           cl::desc("Set the default loop vectorization width"));
+
+namespace {
+
+/// Vectorize a simple loop. This class performs the widening of simple single
+/// basic block loops into vectors. It does not perform any
+/// vectorization-legality checks, and just does it. It widens the vectors
+/// to a given vectorization factor (VF).
+class SingleBlockLoopVectorizer {
+public:
+  /// Ctor.
+  SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
+                            LPPassManager *Lpm, unsigned VecWidth):
+    Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+    Builder(0), Induction(0), OldInduction(0) { }
+
+  ~SingleBlockLoopVectorizer() {
+    delete Builder;
+  }
+
+  // Perform the actual loop widening (vectorization).
+  void vectorize() {
+    /// Create a new empty loop. Unlink the old loop and connect the new one.
+    createEmptyLoop();
+    /// Widen each instruction in the old loop to a new one in the new loop.
+    vectorizeLoop();
+    // Register the new loop.
+    cleanup();
+  }
+
+private:
+  /// Create an empty loop, based on the loop ranges of the old loop.
+  void createEmptyLoop();
+  /// Copy and widen the instructions from the old loop.
+  void vectorizeLoop();
+  /// Insert the new loop to the loop hierarchy and pass manager.
+  void cleanup();
+
+  /// This instruction is un-vectorizable. Implement it as a sequence
+  /// of scalars.
+  void scalarizeInstruction(Instruction *Instr);
+
+  /// Create a broadcast instruction. This method generates a broadcast
+  /// instruction (shuffle) for loop invariant values and for the induction
+  /// value. If this is the induction variable then we extend it to N, N+1, ...
+  /// this is needed because each iteration in the loop corresponds to a SIMD
+  /// element.
+  Value *getBroadcastInstrs(Value *V);
+
+  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
+  /// for each element in the vector. Starting from zero.
+  Value *getConsecutiveVector(Value* Val);
+
+  /// Check that the GEP operands are all uniform except for the last index
+  /// which has to be the induction variable.
+  bool isConsecutiveGep(GetElementPtrInst *Gep);
+
+  /// When we go over instructions in the basic block we rely on previous
+  /// values within the current basic block or on loop invariant values.
+  /// When we widen (vectorize) values we place them in the map. If the values
+  /// are not within the map, they have to be loop invariant, so we simply
+  /// broadcast them into a vector.
+  Value *getVectorValue(Value *V);
+
+  typedef DenseMap<Value*, Value*> ValueMap;
+
+  /// The original loop.
+  Loop *Orig;
+  // Scev analysis to use.
+  ScalarEvolution *SE;
+  // Loop Info.
+  LoopInfo *LI;
+  // Loop Pass Manager;
+  LPPassManager *LPM;
+  // The vectorization factor to use.
+  unsigned VF;
+
+  // The builder that we use
+  IRBuilder<> *Builder;
+
+  // --- Vectorization state ---
+
+  /// The new Induction variable which was added to the new block.
+  PHINode *Induction;
+  /// The induction variable of the old basic block.
+  PHINode *OldInduction;
+  // Maps scalars to widened vectors.
+  ValueMap WidenMap;
+};
+
+/// Perform the vectorization legality check. This class does not look at the
+/// profitability of vectorization, only the legality. At the moment the checks
+/// are very simple and focus on single basic block loops with a constant
+/// iteration count and no reductions.
+class LoopVectorizationLegality {
+public:
+  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
+    TheLoop(Lp), SE(Se), DL(Dl) { }
+
+  /// Returns the maximum vectorization factor that we *can* use to vectorize
+  /// this loop. This does not mean that it is profitable to vectorize this
+  /// loop, only that it is legal to do so. This may be a large number. We
+  /// can vectorize to any SIMD width below this number.
+  unsigned getLoopMaxVF();
+
+private:
+  /// Check if a single basic block loop is vectorizable.
+  /// At this point we know that this is a loop with a constant trip count
+  /// and we only need to check individual instructions.
+  bool canVectorizeBlock(BasicBlock &BB);
+
+  // Check if a pointer value is known to be disjoint.
+  // Example: Alloca, Global, NoAlias.
+  bool isIdentifiedSafeObject(Value* Val);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+  /// DataLayout analysis.
+  DataLayout *DL;
+};
+
+struct LoopVectorize : public LoopPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  LoopVectorize() : LoopPass(ID) {
+    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+  }
+
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  LoopInfo *LI;
+
+  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+    // Only vectorize innermost loops.
+    if (!L->empty())
+      return false;
+
+    SE = &getAnalysis<ScalarEvolution>();
+    DL = getAnalysisIfAvailable<DataLayout>();
+    LI = &getAnalysis<LoopInfo>();
+
+    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
+          L->getHeader()->getParent()->getName() << "\"\n");
+
+    // Check if it is legal to vectorize the loop.
+    LoopVectorizationLegality LVL(L, SE, DL);
+    unsigned MaxVF = LVL.getLoopMaxVF();
+
+    // Check that we can vectorize using the chosen vectorization width.
+    if (MaxVF < DefaultVectorizationFactor) {
+      DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
+      return false;
+    }
+
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
+
+    // If we decided that it is *legal* to vectorize the loop, do it.
+    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
+    LB.vectorize();
+
+    DEBUG(verifyFunction(*L->getHeader()->getParent()));
+    return true;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    LoopPass::getAnalysisUsage(AU);
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequired<LoopInfo>();
+    AU.addRequired<ScalarEvolution>();
+  }
+
+};
+
+Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
+  // Instructions that access the old induction variable
+  // actually want to get the new one.
+  if (V == OldInduction)
+    V = Induction;
+  // Create the types.
+  LLVMContext &C = V->getContext();
+  Type *VTy = VectorType::get(V->getType(), VF);
+  Type *I32 = IntegerType::getInt32Ty(C);
+  Constant *Zero = ConstantInt::get(I32, 0);
+  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
+  Value *UndefVal = UndefValue::get(VTy);
+  // Insert the value into a new vector.
+  Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero);
+  // Broadcast the scalar into all locations in the vector.
+  Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros,
+                                             "broadcast");
+  // We are accessing the induction variable. Make sure to promote the
+  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
+  if (V == Induction)
+    return getConsecutiveVector(Shuf);
+  return Shuf;
+}
+
+Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
+  assert(Val->getType()->isVectorTy() && "Must be a vector");
+  assert(Val->getType()->getScalarType()->isIntegerTy() &&
+         "Elem must be an integer");
+  // Create the types.
+  Type *ITy = Val->getType()->getScalarType();
+  VectorType *Ty = cast<VectorType>(Val->getType());
+  unsigned VLen = Ty->getNumElements();
+  SmallVector<Constant*, 8> Indices;
+
+  // Create a vector of consecutive numbers from zero to VF.
+  for (unsigned i = 0; i < VLen; ++i)
+    Indices.push_back(ConstantInt::get(ITy, i));
+
+  // Add the consecutive indices to the vector value.
+  Constant *Cv = ConstantVector::get(Indices);
+  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+  return Builder->CreateAdd(Val, Cv, "induction");
+}
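A quick numeric illustration (editorial, not from the patch) of what the two functions above produce for the induction variable with VF = 4: the scalar index is broadcast to every lane and then offset by the constant vector <0,1,2,3>.

```cpp
#include <cassert>

int main() {
  const unsigned VF = 4;
  long Induction = 8;          // current scalar induction value
  long Lanes[VF];
  for (unsigned l = 0; l < VF; ++l)
    Lanes[l] = Induction + l;  // broadcast, then add <0,1,2,3>
  assert(Lanes[0] == 8 && Lanes[3] == 11);
  return 0;
}
```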
+
+bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
+  if (!Gep)
+    return false;
+
+  unsigned NumOperands = Gep->getNumOperands();
+  Value *LastIndex = Gep->getOperand(NumOperands - 1);
+
+  // Check that all of the gep indices are uniform except for the last.
+  for (unsigned i = 0; i < NumOperands - 1; ++i)
+    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig))
+      return false;
+
+  // We can emit wide load/stores only if the last index is the induction
+  // variable.
+  const SCEV *Last = SE->getSCEV(LastIndex);
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
+    const SCEV *Step = AR->getStepRecurrence(*SE);
+
+    // The memory is consecutive because the last index is consecutive
+    // and all other indices are loop invariant.
+    if (Step->isOne())
+      return true;
+  }
+
+  return false;
+}
+
+Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+  // If we saved a vectorized copy of V, use it.
+  ValueMap::iterator it = WidenMap.find(V);
+  if (it != WidenMap.end())
+    return it->second;
+
+  // Broadcast V and save the value for future uses.
+  Value *B = getBroadcastInstrs(V);
+  WidenMap[V] = B;
+  return B;
+}
+
+void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  // Holds vector parameters or scalars, in case of uniform vals.
+  SmallVector<Value*, 8> Params;
+
+  // Find all of the vectorized parameters.
+  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+    Value *SrcOp = Instr->getOperand(op);
+
+    // If we are accessing the old induction variable, use the new one.
+    if (SrcOp == OldInduction) {
+      Params.push_back(getBroadcastInstrs(Induction));
+      continue;
+    }
+
+    // Try using previously calculated values.
+    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+    // If the src is an instruction that appeared earlier in the basic block
+    // then it should already be vectorized.
+    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
+      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
+      // The parameter is a vector value from earlier.
+      Params.push_back(WidenMap[SrcInst]);
+    } else {
+      // The parameter is a scalar from outside the loop. Maybe even a constant.
+      Params.push_back(SrcOp);
+    }
+  }
+
+  assert(Params.size() == Instr->getNumOperands() &&
+         "Invalid number of operands");
+
+  // Does this instruction return a value ?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+  Value *VecResults = 0;
+
+  // If we have a return value, create an empty vector. We place the scalarized
+  // instructions in this vector.
+  if (!IsVoidRetTy)
+    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
+
+  // For each scalar that we create.
+  for (unsigned i = 0; i < VF; ++i) {
+    Instruction *Cloned = Instr->clone();
+    if (!IsVoidRetTy)
+      Cloned->setName(Instr->getName() + ".cloned");
+    // Replace the operands of the cloned instructions with extracted scalars.
+    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+      Value *Op = Params[op];
+      // Param is a vector. Need to extract the right lane.
+      if (Op->getType()->isVectorTy())
+        Op = Builder->CreateExtractElement(Op, Builder->getInt32(i));
+      Cloned->setOperand(op, Op);
+    }
+
+    // Place the cloned scalar in the new loop.
+    Builder->Insert(Cloned);
+
+    // If the original scalar returns a value we need to place it in a vector
+    // so that future users will be able to use it.
+    if (!IsVoidRetTy)
+      VecResults = Builder->CreateInsertElement(VecResults, Cloned,
+                                                Builder->getInt32(i));
+  }
+
+  if (!IsVoidRetTy)
+    WidenMap[Instr] = VecResults;
+}
+
+void SingleBlockLoopVectorizer::createEmptyLoop() {
+  /*
+   In this function we generate a new loop. The new loop will contain
+   the vectorized instructions while the old loop will continue to run the
+   scalar remainder.
+
+     [  ]  <-- vector loop bypass.
+    /  |
+   /   v
+  |   [ ]    <-- vector pre header.
+  |    |
+  |    v
+  |   [ ] \
+  |   [ ]_|  <-- vector loop.
+  |    |
+   \   v
+    >[ ]     <--- middle-block.
+   /   |
+  /    v
+  |   [ ]    <--- new preheader.
+  |    |
+  |    v
+  |   [ ] \
+  |   [ ]_|  <-- old scalar loop to handle remainder.
+   \   |
+    \  v
+     >[ ]    <-- exit block.
+     ...
+  */
+
+  // This is the original scalar-loop preheader.
+  // This is the original scalar-loop preheader.
+  BasicBlock *BypassBlock = Orig->getLoopPreheader();
+  BasicBlock *ExitBlock = Orig->getExitBlock();
+  assert(ExitBlock && "Must have an exit block");
+
+  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
+  assert(BypassBlock && "Invalid loop structure");
+
+  BasicBlock *VectorPH =
+    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
+                                                  "vector.body");
+  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
+                                                     "middle.block");
+  BasicBlock *ScalarPH =
+    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+                                 "scalar.preheader");
+
+  // Find the induction variable.
+  BasicBlock *OldBasicBlock = Orig->getHeader();
+  OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
+  assert(OldInduction && "We must have a single phi node.");
+  Type *IdxTy = OldInduction->getType();
+
+  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
+  // inside the loop.
+  Builder = new IRBuilder<>(VecBody);
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+
+  // Generate the induction variable.
+  Induction = Builder->CreatePHI(IdxTy, 2, "index");
+  Constant *Zero = ConstantInt::get(IdxTy, 0);
+  Constant *Step = ConstantInt::get(IdxTy, VF);
+
+  // Find the loop boundaries. getExitCount returns the backedge-taken count,
+  // so we add one to obtain the total trip count (N).
+  const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
+  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+  ExitCount = SE->getAddExpr(ExitCount,
+                             SE->getConstant(ExitCount->getType(), 1));
+
+  // Expand the trip count and place the new instructions in the preheader.
+  // Notice that the pre-header does not change; only the loop body does.
+  SCEVExpander Exp(*SE, "induction");
+  Instruction *Loc = BypassBlock->getTerminator();
+
+  // We may need to extend the index in case there is a type mismatch.
+  // We know that the count starts at zero and does not overflow, so we
+  // use zext because it should be less expensive.
+  if (ExitCount->getType() != Induction->getType())
+    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+
+  // Now generate the expression for N - (N % VF), which is the number of
+  // iterations that the vectorized body will execute.
+  Constant *CIVF = ConstantInt::get(IdxTy, VF);
+  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
+  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+
+  // Compare the rounded-down count to zero. If it is zero, skip the vector
+  // body and jump to the middle block, which then dispatches to the scalar
+  // loop.
+  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                               CountRoundDown, ConstantInt::getNullValue(IdxTy),
+                               "cmp.zero", Loc);
+  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
+  // Remove the old terminator.
+  Loc->eraseFromParent();
+
+  // Add a check in the middle block to see if we have completed all of the
+  // iterations in the vector loop.
+  // If (N - N%VF) == N, then we *don't* need to run the remainder.
+  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                                CountRoundDown, "cmp.n",
+                                MiddleBlock->getTerminator());
+  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
+  // Remove the old terminator.
+  MiddleBlock->getTerminator()->eraseFromParent();
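+  // Worked example (illustrative): with N = 13 and VF = 4, n.mod.vf is 1 and
+  // n.vec is 12, so the vector body runs three times (covering 12 elements)
+  // and the scalar loop executes the final iteration. With N = 12, cmp.n is
+  // true and the middle block branches directly to the exit block.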
+  // Create i+1 ("index.next") and fill in the PHI node.
+  Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
+  Induction->addIncoming(Zero, VectorPH);
+  Induction->addIncoming(NextIdx, VecBody);
+  // Create the compare.
+  Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
+  Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);
+
+  // Now we have two terminators. Remove the old one from the block.
+  VecBody->getTerminator()->eraseFromParent();
+
+  // Fix the starting value of the scalar loop: the remainder loop must
+  // resume the induction variable where the vector loop stopped.
+  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
+  OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
+
+  // Get ready to start creating new instructions in the vectorized body.
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+
+  // Register the new loop.
+  Loop *Lp = new Loop();
+  LPM->insertLoop(Lp, Orig->getParentLoop());
+
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
+  Loop *ParentLoop = Orig->getParentLoop();
+  if (ParentLoop) {
+    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  }
+}
+
+void SingleBlockLoopVectorizer::vectorizeLoop() {
+  BasicBlock &BB = *Orig->getHeader();
+
+  // For each instruction in the old loop:
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+    Instruction *Inst = it;
+
+    switch (Inst->getOpcode()) {
+    case Instruction::PHI:
+    case Instruction::Br:
+      // Nothing to do for PHIs and branches, since we already took care of
+      // the loop control flow instructions.
+      continue;
+
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      // Just widen binops.
+      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+      Value *A = getVectorValue(Inst->getOperand(0));
+      Value *B = getVectorValue(Inst->getOperand(1));
+      // Use this vector value for all users of the original instruction.
+      WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B);
+      break;
+    }
+    case Instruction::Select: {
+      // Widen selects.
+      Value *A = getVectorValue(Inst->getOperand(0));
+      Value *B = getVectorValue(Inst->getOperand(1));
+      Value *C = getVectorValue(Inst->getOperand(2));
+      WidenMap[Inst] = Builder->CreateSelect(A, B, C);
+      break;
+    }
+
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      // Widen compares into vector compares.
+      bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
+      CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
+      Value *A = getVectorValue(Inst->getOperand(0));
+      Value *B = getVectorValue(Inst->getOperand(1));
+      if (FCmp)
+        WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B);
+      else
+        WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B);
+      break;
+    }
+
+    case Instruction::Store: {
+      // Attempt to issue a wide store.
+      StoreInst *SI = dyn_cast<StoreInst>(Inst);
+      Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+      Value *Ptr = SI->getPointerOperand();
+      unsigned Alignment = SI->getAlignment();
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
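+      // Illustrative sketch (not verbatim output): for a consecutive store
+      // such as a[i] = x, the last GEP index is replaced by the new induction
+      // variable and the pointer is bitcast to a vector pointer, so with
+      // VF = 4 the emitted IR resembles
+      //   %gep = getelementptr i32* %a, i64 %index
+      //   %vp  = bitcast i32* %gep to <4 x i32>*
+      //   store <4 x i32> %x.vec, <4 x i32>* %vp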
+      // If the pointer is not a consecutive GEP, we cannot issue a wide
+      // store; scalarize it instead.
+      if (!isConsecutiveGep(Gep)) {
+        scalarizeInstruction(Inst);
+        break;
+      }
+
+      // Create the new GEP with the new induction variable.
+      GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+      unsigned NumOperands = Gep->getNumOperands();
+      Gep2->setOperand(NumOperands - 1, Induction);
+      Ptr = Builder->Insert(Gep2);
+      Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo());
+      Value *Val = getVectorValue(SI->getValueOperand());
+      Builder->CreateStore(Val, Ptr)->setAlignment(Alignment);
+      break;
+    }
+    case Instruction::Load: {
+      // Attempt to issue a wide load.
+      LoadInst *LI = dyn_cast<LoadInst>(Inst);
+      Type *RetTy = VectorType::get(LI->getType(), VF);
+      Value *Ptr = LI->getPointerOperand();
+      unsigned Alignment = LI->getAlignment();
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+      // If the pointer is not a consecutive GEP, scalarize the load.
+      if (!isConsecutiveGep(Gep)) {
+        scalarizeInstruction(Inst);
+        break;
+      }
+
+      // Create the new GEP with the new induction variable.
+      GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+      unsigned NumOperands = Gep->getNumOperands();
+      Gep2->setOperand(NumOperands - 1, Induction);
+      Ptr = Builder->Insert(Gep2);
+      Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo());
+      LI = Builder->CreateLoad(Ptr);
+      LI->setAlignment(Alignment);
+      // Use this vector value for all users of the load.
+      WidenMap[Inst] = LI;
+      break;
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      // Widen casts to operate on the vector type.
+      CastInst *CI = dyn_cast<CastInst>(Inst);
+      Value *A = getVectorValue(Inst->getOperand(0));
+      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+      WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy);
+      break;
+    }
+
+    default:
+      // All other instructions are unsupported. Scalarize them.
+      scalarizeInstruction(Inst);
+      break;
+    } // end of switch.
+  } // end of for_each instr.
+}
+
+void SingleBlockLoopVectorizer::cleanup() {
+  // Tell ScalarEvolution to forget the original loop, since its trip count
+  // has changed.
+  SE->forgetLoop(Orig);
+}
+
+unsigned LoopVectorizationLegality::getLoopMaxVF() {
+  if (!TheLoop->getLoopPreheader()) {
+    assert(false && "No preheader!");
+    DEBUG(dbgs() << "LV: Loop not normalized.\n");
+    return 1;
+  }
+
+  // We can only vectorize single-basic-block loops.
+  unsigned NumBlocks = TheLoop->getNumBlocks();
+  if (NumBlocks != 1) {
+    DEBUG(dbgs() << "LV: Too many blocks: " << NumBlocks << "\n");
+    return 1;
+  }
+
+  // Examine the loop body, which is the single block of the loop.
+  BasicBlock *BB = TheLoop->getHeader();
+  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
+
+  // Go over each instruction and look at memory dependencies.
+  if (!canVectorizeBlock(*BB)) {
+    DEBUG(dbgs() << "LV: Can't vectorize this loop header.\n");
+    return 1;
+  }
+
+  // ScalarEvolution needs to be able to find the exit count.
+  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+    return 1;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+
+  // Okay! We can vectorize. At this point we don't have any other memory
+  // analysis which may limit our maximum vectorization factor, so we just
+  // return the maximum SIMD size.
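+  // For example (illustrative): a single-block loop such as
+  //   for (i = 0; i < n; ++i) a[i] = b[i] * k;
+  // with noalias arrays reaches this point, while multi-block loops, loops
+  // with calls, and loops with reduction PHIs (e.g. sum += a[i]) have
+  // already been rejected by the checks above.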
+  return DefaultVectorizationFactor;
+}
+
+bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+  // Holds the read and write pointers that we find.
+  typedef SmallVector<Value*, 10> ValueVector;
+  ValueVector Reads;
+  ValueVector Writes;
+
+  unsigned NumPhis = 0;
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+    Instruction *I = it;
+
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (Phi) {
+      NumPhis++;
+      // We only look at integer phi nodes.
+      if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found a non-int PHI.\n");
+        return false;
+      }
+
+      // We allow only a single PHI node: the induction variable.
+      if (NumPhis > 1) {
+        DEBUG(dbgs() << "LV: Found more than one PHI.\n");
+        return false;
+      }
+
+      // This should not happen because the loop should be normalized.
+      if (Phi->getNumIncomingValues() != 2) {
+        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+        return false;
+      }
+
+      // Check that the PHI starts at zero and is consecutive.
+      const SCEV *PhiScev = SE->getSCEV(Phi);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+      if (!AR) {
+        DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+        return false;
+      }
+
+      const SCEV *Step = AR->getStepRecurrence(*SE);
+      const SCEV *Start = AR->getStart();
+
+      if (!Step->isOne() || !Start->isZero()) {
+        DEBUG(dbgs() << "LV: PHI does not start at zero or does not step by one.\n");
+        return false;
+      }
+    }
+
+    // If this instruction may read from memory, it must be a simple load;
+    // record its pointer. Notice that we don't handle function calls that
+    // read or write.
+    if (I->mayReadFromMemory()) {
+      LoadInst *Ld = dyn_cast<LoadInst>(I);
+      if (!Ld) return false;
+      if (!Ld->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+        return false;
+      }
+      GetUnderlyingObjects(Ld->getPointerOperand(), Reads, DL);
+    }
+
+    // Record store pointers. Abort on all other instructions that write to
+    // memory.
+    if (I->mayWriteToMemory()) {
+      StoreInst *St = dyn_cast<StoreInst>(I);
+      if (!St) return false;
+      if (!St->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+        return false;
+      }
+      GetUnderlyingObjects(St->getPointerOperand(), Writes, DL);
+    }
+
+    // We don't handle function calls yet.
+    CallInst *CI = dyn_cast<CallInst>(I);
+    if (CI) {
+      DEBUG(dbgs() << "LV: Found a call site: " << *CI << "\n");
+      return false;
+    }
+
+    // We do not re-vectorize vectors.
+    if (!VectorType::isValidElementType(I->getType()) &&
+        !I->getType()->isVoidTy()) {
+      DEBUG(dbgs() << "LV: Found an unvectorizable type.\n");
+      return false;
+    }
+
+    // Check that all of the users of the instruction are inside the BB.
+    for (Value::use_iterator it = I->use_begin(), e = I->use_end();
+         it != e; ++it) {
+      Instruction *U = cast<Instruction>(*it);
+      BasicBlock *Parent = U->getParent();
+      if (Parent != &BB) {
+        DEBUG(dbgs() << "LV: Found an outside user for: " << *U << "\n");
+        return false;
+      }
+    }
+  } // next instr.
+
+  if (NumPhis != 1) {
+    DEBUG(dbgs() << "LV: Did not find a PHI node.\n");
+    return false;
+  }
+
+  // Check that the underlying objects of the reads and writes are either
+  // disjoint memory locations, or no-alias arguments.
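+  // For example (illustrative): with 'int *a' and 'int *b' both marked
+  // noalias, the body a[i] = b[i] + 1 leaves {b} in Reads and {a} in Writes,
+  // and the checks below succeed. If the same underlying object appeared in
+  // both sets, as in a[i] = a[i + 1], the read/write disjointness test below
+  // would reject the loop.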
+  ValueVector::iterator r, re, w, we;
+  for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
+    if (!isidentifiedSafeObject(*r)) {
+      DEBUG(dbgs() << "LV: Found a bad read Ptr: " << **r << "\n");
+      return false;
+    }
+  }
+
+  for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
+    if (!isidentifiedSafeObject(*w)) {
+      DEBUG(dbgs() << "LV: Found a bad write Ptr: " << **w << "\n");
+      return false;
+    }
+  }
+
+  // Check that there is no more than one write to each pointer.
+  SmallPtrSet<Value*, 8> WritePointerSet;
+  for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
+    if (!WritePointerSet.insert(*w)) {
+      DEBUG(dbgs() << "LV: Multiple writes to the same pointer: " << **w << "\n");
+      return false;
+    }
+  }
+
+  // Check that the reads and the writes are disjoint.
+  for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
+    if (WritePointerSet.count(*r)) {
+      DEBUG(dbgs() << "LV: Found a pointer that is both read and written: "
+            << **r << "\n");
+      return false;
+    }
+  }
+
+  // All is okay.
+  return true;
+}
+
+/// Checks if the value is a global variable, an alloca, or an argument
+/// marked with the NoAlias attribute.
+bool LoopVectorizationLegality::isidentifiedSafeObject(Value *Val) {
+  assert(Val && "Invalid value");
+  if (isa<GlobalValue>(Val))
+    return true;
+  if (isa<AllocaInst>(Val))
+    return true;
+  Argument *A = dyn_cast<Argument>(Val);
+  if (!A)
+    return false;
+  return A->hasNoAliasAttr();
+}
+
+} // namespace
+
+char LoopVectorize::ID = 0;
+static const char lv_name[] = "Loop Vectorization";
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+  Pass *createLoopVectorizePass() {
+    return new LoopVectorize();
+  }
+}
+
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index 1ef60029bc..d26973a7b3 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
 // implements several vectorization transformations over the LLVM intermediate
 // representation, including the C bindings for that library.
 //
@@ -23,10 +23,11 @@
 
 using namespace llvm;
 
-/// initializeVectorizationPasses - Initialize all passes linked into the
+/// initializeVectorizationPasses - Initialize all passes linked into the
 /// Vectorization library.
 void llvm::initializeVectorization(PassRegistry &Registry) {
   initializeBBVectorizePass(Registry);
+  initializeLoopVectorizePass(Registry);
 }
 
 void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
@@ -37,3 +38,6 @@
 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createBBVectorizePass());
 }
+
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopVectorizePass());
+}
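A minimal sketch of driving the new pass through the C binding added above
(illustrative; it assumes the LLVMAddLoopVectorizePass declaration is exposed
via llvm-c/Transforms/Vectorize.h, a header change not shown in this diff, and
omits error handling):

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Vectorize.h"

    void vectorizeModule(LLVMModuleRef Mod) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddLoopVectorizePass(PM);  /* the binding added in this patch */
      LLVMRunPassManager(PM, Mod);   /* run the pass over the whole module */
      LLVMDisposePassManager(PM);
    }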