author | Nadav Rotem <nrotem@apple.com> | 2013-04-09 19:44:35 +0000
---|---|---
committer | Nadav Rotem <nrotem@apple.com> | 2013-04-09 19:44:35 +0000
commit | 8383b539ff4c039108ee0c202a27b787621d96cf (patch) |
tree | a94c718adf657b35e9c1581987a588bac83242f1 /lib/Transforms/Vectorize/VecUtils.cpp |
parent | 376e05fd7ba37b76ea26fa7604671c9abd32307e (diff) |
Add support for bottom-up SLP vectorization infrastructure.
This commit adds the infrastructure for performing bottom-up SLP vectorization (and other optimizations) on parallel computations.
The infrastructure has three potential users:
1. The loop vectorizer needs to be able to vectorize AOS data structures such as (sum += A[i] + A[i+1]).
2. The BB-vectorizer needs this infrastructure for bottom-up SLP vectorization, because bottom-up vectorization is faster to compute.
3. A loop-roller needs to be able to analyze consecutive chains and roll them into a loop in order to reduce code size. A loop roller does not need to create vector instructions, and this infrastructure separates the chain analysis from the vectorization (a sketch of the rolled form follows this list).
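As an illustrative sketch (not part of the original commit message), a loop roller applied to the SAXPY function shown below would reuse the same chain analysis but emit a compact loop instead of vector instructions; the name SAXPY_rolled is hypothetical:

/* Hypothetical loop-rolled form of the SAXPY function below: the chain of
   four consecutive statements becomes a single loop body. */
void SAXPY_rolled(int *x, int *y, int a, int i) {
  for (int j = 0; j < 4; ++j)
    x[i + j] = a * x[i + j] + y[i + j];
}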
This patch also includes a simple (100 LOC) bottom-up SLP vectorizer that uses the infrastructure and can vectorize this code:
void SAXPY(int *x, int *y, int a, int i) {
  x[i] = a * x[i] + y[i];
  x[i+1] = a * x[i+1] + y[i+1];
  x[i+2] = a * x[i+2] + y[i+2];
  x[i+3] = a * x[i+3] + y[i+3];
}
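For comparison, a minimal sketch of what the SLP vectorizer conceptually produces for SAXPY, written here with the GCC/Clang vector_size extension rather than the LLVM IR the pass actually emits; the v4si typedef and the function name are illustrative assumptions:

typedef int v4si __attribute__((vector_size(16))); /* four 32-bit ints */

void SAXPY_vectorized(int *x, int *y, int a, int i) {
  v4si xv, yv;
  v4si av = { a, a, a, a };                /* broadcast (splat) the scalar a */
  __builtin_memcpy(&xv, &x[i], sizeof xv); /* one wide load of x[i..i+3] */
  __builtin_memcpy(&yv, &y[i], sizeof yv); /* one wide load of y[i..i+3] */
  xv = av * xv + yv;                       /* one vector mul, one vector add */
  __builtin_memcpy(&x[i], &xv, sizeof xv); /* one wide store to x[i..i+3] */
}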
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179117 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Transforms/Vectorize/VecUtils.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/VecUtils.cpp | 439
1 file changed, 439 insertions, 0 deletions
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
new file mode 100644
index 0000000000..7e9f12d43c
--- /dev/null
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -0,0 +1,439 @@
+//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "VecUtils"
+
+#include "VecUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
+
+namespace llvm {
+
+BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
+                 TargetTransformInfo *Tti, AliasAnalysis *Aa) :
+  BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa) {
+  numberInstructions();
+}
+
+void BoUpSLP::numberInstructions() {
+  int Loc = 0;
+  InstrIdx.clear();
+  InstrVec.clear();
+  // Number the instructions in the block.
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    InstrIdx[it] = Loc++;
+    InstrVec.push_back(it);
+    assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
+  }
+}
+
+Value *BoUpSLP::getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+  return 0;
+}
+
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L = dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+  if (StoreInst *S = dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+  return -1;
+}
+
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
+
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB)) return false;
+
+  // Check that A and B are of the same type.
+  if (PtrA->getType() != PtrB->getType()) return false;
+
+  // Calculate the distance.
+  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
+  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+
+  // Non-constant distance.
+  if (!ConstOffSCEV) return false;
+
+  unsigned Offset = ConstOffSCEV->getValue()->getSExtValue();
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  // The instructions are consecutive if the size of the first load/store is
+  // the same as the offset.
+  unsigned Sz = (DL ? DL->getTypeStoreSize(Ty) : Ty->getScalarSizeInBits()/8);
+  return ((-Offset) == Sz);
+}
+
+bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
+  ValueSet Heads, Tails;
+  SmallDenseMap<Value*, Value*> ConsecutiveChain;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of stores that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j) continue;
+      if (isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (ValueSet::iterator it = Heads.begin(), e = Heads.end(); it != e; ++it) {
+    if (Tails.count(*it)) continue;
+
+    // We found a store instr that starts a chain. Now follow the chain and try
+    // to vectorize it.
+    ValueList Operands;
+    Value *I = *it;
+    int MinCost = 0, MinVF = 0;
+    while (Tails.count(I) || Heads.count(I)) {
+      Operands.push_back(I);
+      unsigned VF = Operands.size();
+      if (isPowerOf2_32(VF) && VF > 1) {
+        int cost = getTreeRollCost(Operands, 0);
+        DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n");
+        if (cost < MinCost) { MinCost = cost; MinVF = VF; }
+      }
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    if (MinCost <= costThreshold && MinVF > 1) {
+      DEBUG(dbgs() << "Decided to vectorize cost=" << MinCost << "\n");
+      vectorizeTree(Operands, MinVF);
+      Stores.clear();
+      // The current numbering is invalid because we added and removed instrs.
+      numberInstructions();
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+int BoUpSLP::getScalarizationCost(Type *Ty) {
+  int Cost = 0;
+  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  return Cost;
+}
+
+AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
+  return AliasAnalysis::Location();
+}
+
+Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+  BasicBlock::iterator I = Src, E = Dst;
+  // Scan all of the instructions from SRC to DST and check if
+  // the source may alias.
+  for (++I; I != E; ++I) {
+    // Ignore store instructions that are marked as 'ignore'.
+    if (MemBarrierIgnoreList.count(I)) continue;
+    if (Src->mayWriteToMemory()) /* Write */ {
+      if (!I->mayReadOrWriteMemory()) continue;
+    } else /* Read */ {
+      if (!I->mayWriteToMemory()) continue;
+    }
+    AliasAnalysis::Location A = getLocation(&*I);
+    AliasAnalysis::Location B = getLocation(Src);
+
+    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
+      return I;
+  }
+  return 0;
+}
+
+int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
+  if (Depth == 6) return max_cost; // Limit the recursion depth.
+  Type *ScalarTy = VL[0]->getType();
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+
+  // Don't mess with vectors.
+  if (ScalarTy->isVectorTy()) return max_cost;
+
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  // Check if all of the operands are constants.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // Must have a single use.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Need to scalarize instructions with multiple users or from other BBs.
+    if (I && ((I->getNumUses() > 1) || (I->getParent() != BB)))
+      return getScalarizationCost(VecTy);
+  }
+
+  // Is this a simple vector constant?
+  if (AllConst) return 0;
+
+  // If all of the operands are identical we can broadcast them.
+  if (AllSameScalar)
+    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return getScalarizationCost(VecTy);
+  assert(VL0->getParent() == BB && "Wrong BB");
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
+  }
+
+  // Check if it is safe to sink the loads or the stores.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    int MaxIdx = InstrIdx[VL0];
+    for (unsigned i = 1, e = VL.size(); i < e; ++i)
+      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+
+    Instruction *Last = InstrVec[MaxIdx];
+    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+      if (VL[i] == Last) continue;
+      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
+      if (Barrier) {
+        DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " <<
+              *Last << "\n because of " << *Barrier << "\n");
+        return max_cost;
+      }
+    }
+  }
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList Operands;
+    int Cost = 0;
+    // Calculate the cost of all of the operands.
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+      Cost += getTreeRollCost(Operands, Depth+1);
+      Operands.clear();
+    }
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = VecTy->getNumElements() *
+      TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+    Cost += (VecCost - ScalarCost);
+    return Cost;
+  }
+  case Instruction::Load: {
+    // If we need to scalarize the loads, add the cost of forming the vector.
+    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i], VL[i+1]))
+        return getScalarizationCost(VecTy);
+
+    // Cost of wide load - cost of scalar loads.
+    int ScalarLdCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+    return VecLdCost - ScalarLdCost;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    int ScalarStCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+    int StoreCost = VecStCost - ScalarStCost;
+
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      MemBarrierIgnoreList.insert(VL[j]);
+    }
+
+    int TotalCost = StoreCost + getTreeRollCost(Operands, Depth + 1);
+    MemBarrierIgnoreList.clear();
+    return TotalCost;
+  }
+  default:
+    // Unable to vectorize unknown instructions.
+    return getScalarizationCost(VecTy);
+  }
+}
+
+Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) {
+  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
+  for (unsigned i = 0; i < VF; ++i)
+    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+  return InstrVec[MaxIdx + 1];
+}
+
+Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
+  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
+  Value *Vec = UndefValue::get(Ty);
+  for (unsigned i = 0; i < Ty->getNumElements(); ++i)
+    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+  return Vec;
+}
+
+Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VF);
+
+  // Check if all of the operands are constants or identical.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // Must have a single use.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (I && (I->getNumUses() > 1 || I->getParent() != BB))
+      return Scalarize(VL, VecTy);
+  }
+
+  // Is this a simple vector constant?
+  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return Scalarize(VL, VecTy);
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
+  }
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0; i < VF; ++i) {
+      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    }
+
+    Value *LHS = vectorizeTree(LHSVL, VF);
+    Value *RHS = vectorizeTree(RHSVL, VF);
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+    return Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+  }
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(VL0);
+    unsigned Alignment = LI->getAlignment();
+
+    // Check if all of the loads are consecutive.
+    for (unsigned i = 1, e = VF; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i-1], VL[i]))
+        return Scalarize(VL, VecTy);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    LI = Builder.CreateLoad(VecPtr);
+    LI->setAlignment(Alignment);
+    return LI;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+
+    ValueList ValueOp;
+    for (int i = 0; i < VF; ++i)
+      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+
+    Value *VecValue = vectorizeTree(ValueOp, VF);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
+
+    // The scalar stores are now dead; remove them.
+    for (int i = 0; i < VF; ++i)
+      cast<Instruction>(VL[i])->eraseFromParent();
+    return 0;
+  }
+  default:
+    return Scalarize(VL, VecTy);
+  }
+}
+
+} // end of namespace
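To make the consecutiveness test in isConsecutiveAccess above concrete: the SCEV subtraction computes Offset = PtrA - PtrB, and the two accesses are consecutive exactly when Offset is a known constant and -Offset equals the store size Sz of the element type, i.e. PtrB = PtrA + Sz. Below is a small standalone sketch of the same arithmetic in plain C (illustrative only; it stands in for the ScalarEvolution query and is not LLVM API):

#include <assert.h>
#include <stddef.h>

int main(void) {
  int x[8];
  /* A accesses x[2], B accesses x[3]: Offset = PtrA - PtrB = -4 bytes. */
  ptrdiff_t Offset = (char *)&x[2] - (char *)&x[3];
  size_t Sz = sizeof x[0];         /* store size of a 32-bit element */
  assert((size_t)(-Offset) == Sz); /* -(-4) == 4: a consecutive pair */
  return 0;
}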