From 8383b539ff4c039108ee0c202a27b787621d96cf Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 9 Apr 2013 19:44:35 +0000 Subject: Add support for bottom-up SLP vectorization infrastructure. This commit adds the infrastructure for performing bottom-up SLP vectorization (and other optimizations) on parallel computations. The infrastructure has three potential users: 1. The loop vectorizer needs to be able to vectorize AOS data structures such as (sum += A[i] + A[i+1]). 2. The BB-vectorizer needs this infrastructure for bottom-up SLP vectorization, because bottom-up vectorization is faster to compute. 3. A loop-roller needs to be able to analyze consecutive chains and roll them into a loop, in order to reduce code size. A loop roller does not need to create vector instructions, and this infrastructure separates the chain analysis from the vectorization. This patch also includes a simple (100 LOC) bottom up SLP vectorizer that uses the infrastructure, and can vectorize this code: void SAXPY(int *x, int *y, int a, int i) { x[i] = a * x[i] + y[i]; x[i+1] = a * x[i+1] + y[i+1]; x[i+2] = a * x[i+2] + y[i+2]; x[i+3] = a * x[i+3] + y[i+3]; } git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179117 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 153 +++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 lib/Transforms/Vectorize/SLPVectorizer.cpp (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp') diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp new file mode 100644 index 0000000000..4b61dc9120 --- /dev/null +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -0,0 +1,153 @@ +//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass implements the Bottom Up SLP vectorizer. It detects consecutive +// stores that can be put together into vector-stores. Next, it attempts to +// construct vectorizable tree using the use-def chains. If a profitable tree +// was found, the SLP vectorizer performs vectorization on the tree. +// +// The pass is inspired by the work described in the paper: +// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. +// +//===----------------------------------------------------------------------===// +#define SV_NAME "slp-vectorizer" +#define DEBUG_TYPE SV_NAME + +#include "VecUtils.h" +#include "llvm/Transforms/Vectorize.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +static cl::opt +SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden, + cl::desc("Only vectorize trees if the gain is above this " + "number. (gain = -cost of vectorization)")); +namespace { + +/// The SLPVectorizer Pass. +struct SLPVectorizer : public BasicBlockPass { + typedef std::map StoreListMap; + + /// Pass identification, replacement for typeid + static char ID; + + explicit SLPVectorizer() : BasicBlockPass(ID) { + initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); + } + + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + + /// \brief Collect memory references and sort them according to their base + /// object. We sort the stores to their base objects to reduce the cost of the + /// quadratic search on the stores. TODO: We can further reduce this cost + /// if we flush the chain creation every time we run into a memory barrier. + bool CollectStores(BasicBlock *BB, BoUpSLP &R) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Can't vectorize instructions with side effects. + if (it->mayThrow()) + return false; + + StoreInst *SI = dyn_cast(it); + if (!SI) + continue; + + // Check that the pointer points to scalars. + if (SI->getValueOperand()->getType()->isAggregateType()) + return false; + + // Find the base of the GEP. + Value *Ptr = SI->getPointerOperand(); + if (GetElementPtrInst *GEP = dyn_cast(Ptr)) + Ptr = GEP->getPointerOperand(); + + // Save the store locations. + StoreRefs[Ptr].push_back(SI); + } + return true; + } + + bool RollStoreChains(BoUpSLP &R) { + bool Changed = false; + // Attempt to sort and vectorize each of the store-groups. + for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); + it != e; ++it) { + if (it->second.size() < 2) + continue; + Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + } + return Changed; + } + + virtual bool runOnBasicBlock(BasicBlock &BB) { + SE = &getAnalysis(); + DL = getAnalysisIfAvailable(); + TTI = &getAnalysis(); + AA = &getAnalysis(); + StoreRefs.clear(); + + // Use the bollom up slp vectorizer to construct chains that start with + // he store instructions. + BoUpSLP R(&BB, SE, DL, TTI, AA); + + if (!CollectStores(&BB, R)) + return false; + + bool Changed = RollStoreChains(R); + if (Changed) { + DEBUG(dbgs()<<"Rolled chains in \""<getName()<<"\"\n"); + DEBUG(verifyFunction(*BB.getParent())); + } + + return Changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + BasicBlockPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + +private: + StoreListMap StoreRefs; +}; + +} // end anonymous namespace + +char SLPVectorizer::ID = 0; +static const char lv_name[] = "SLP Vectorizer"; +INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) + +namespace llvm { + Pass *createSLPVectorizerPass() { + return new SLPVectorizer(); + } +} + -- cgit v1.2.3-70-g09d2 From 20cd5e68626ff1af698201fb34f86a59e15c2ff8 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 10 Apr 2013 18:57:27 +0000 Subject: We require DataLayout for analyzing the size of stores. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179206 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 5 +++++ lib/Transforms/Vectorize/VecUtils.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp') diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4b61dc9120..01b2b92870 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -107,6 +107,11 @@ struct SLPVectorizer : public BasicBlockPass { AA = &getAnalysis(); StoreRefs.clear(); + // Must have DataLayout. We can't require it because some tests run w/o + // triple. + if (!DL) + return false; + // Use the bollom up slp vectorizer to construct chains that start with // he store instructions. BoUpSLP R(&BB, SE, DL, TTI, AA); diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index 7e9f12d43c..3efaaf6edd 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -94,7 +94,7 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { Type *Ty = cast(PtrA->getType())->getElementType(); // The Instructions are connsecutive if the size of the first load/store is // the same as the offset. - unsigned Sz = (DL ? DL->getTypeStoreSize(Ty) : Ty->getScalarSizeInBits()/8); + unsigned Sz = DL->getTypeStoreSize(Ty); return ((-Offset) == Sz); } -- cgit v1.2.3-70-g09d2 From 4b924d3a61442fb70773057d40789ed1e3187a77 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 10 Apr 2013 19:41:36 +0000 Subject: Make the SLP store-merger less paranoid about function calls. We check for function calls when we check if it is safe to sink instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179207 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 --- test/Transforms/SLPVectorizer/X86/barriercall.ll | 40 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/X86/barriercall.ll (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp') diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 01b2b92870..21bdec83f0 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -65,10 +65,6 @@ struct SLPVectorizer : public BasicBlockPass { /// if we flush the chain creation every time we run into a memory barrier. bool CollectStores(BasicBlock *BB, BoUpSLP &R) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - // Can't vectorize instructions with side effects. - if (it->mayThrow()) - return false; - StoreInst *SI = dyn_cast(it); if (!SI) continue; diff --git a/test/Transforms/SLPVectorizer/X86/barriercall.ll b/test/Transforms/SLPVectorizer/X86/barriercall.ll new file mode 100644 index 0000000000..f520e129b6 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/barriercall.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @foo +;CHECK: store <4 x i32> +;CHECK: ret +define i32 @foo(i32* nocapture %A, i32 %n) #0 { +entry: + %call = tail call i32 (...)* @bar() #2 + %mul = mul nsw i32 %n, 5 + %add = add nsw i32 %mul, 9 + store i32 %add, i32* %A, align 4, !tbaa !0 + %mul1 = mul nsw i32 %n, 9 + %add2 = add nsw i32 %mul1, 9 + %arrayidx3 = getelementptr inbounds i32* %A, i64 1 + store i32 %add2, i32* %arrayidx3, align 4, !tbaa !0 + %mul4 = shl i32 %n, 3 + %add5 = add nsw i32 %mul4, 9 + %arrayidx6 = getelementptr inbounds i32* %A, i64 2 + store i32 %add5, i32* %arrayidx6, align 4, !tbaa !0 + %mul7 = mul nsw i32 %n, 10 + %add8 = add nsw i32 %mul7, 9 + %arrayidx9 = getelementptr inbounds i32* %A, i64 3 + store i32 %add8, i32* %arrayidx9, align 4, !tbaa !0 + ret i32 undef +} + + ; We can still vectorize the stores below. + +declare i32 @bar(...) #1 + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} -- cgit v1.2.3-70-g09d2 From 196ee11f85ce0148d2c2e33fbe1f1171ac5a8828 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 12 Apr 2013 21:11:14 +0000 Subject: Add debug prints. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179412 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp') diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 21bdec83f0..209d287743 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -91,6 +91,10 @@ struct SLPVectorizer : public BasicBlockPass { it != e; ++it) { if (it->second.size() < 2) continue; + + DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " << + it->second.size() << ".\n"); + Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); } return Changed; @@ -117,7 +121,7 @@ struct SLPVectorizer : public BasicBlockPass { bool Changed = RollStoreChains(R); if (Changed) { - DEBUG(dbgs()<<"Rolled chains in \""<getName()<<"\"\n"); + DEBUG(dbgs()<<"SLP: vectorized in \""<getName()<<"\"\n"); DEBUG(verifyFunction(*BB.getParent())); } -- cgit v1.2.3-70-g09d2 From f7eaf29cf70a545f5b717c638db83ba6e8b6b3c5 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 14 Apr 2013 03:22:20 +0000 Subject: SLPVectorizer: add initial support for reduction variable vectorization. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179470 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 89 ++++++++++++++++++++++++-- lib/Transforms/Vectorize/VecUtils.cpp | 10 +++ lib/Transforms/Vectorize/VecUtils.h | 3 + test/Transforms/SLPVectorizer/X86/reduction.ll | 52 +++++++++++++++ 4 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/X86/reduction.ll (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp') diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 209d287743..2f55a007f2 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/Verifier.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -38,7 +39,7 @@ using namespace llvm; static cl::opt -SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden, +SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize trees if the gain is above this " "number. (gain = -cost of vectorization)")); namespace { @@ -63,7 +64,7 @@ struct SLPVectorizer : public BasicBlockPass { /// object. We sort the stores to their base objects to reduce the cost of the /// quadratic search on the stores. TODO: We can further reduce this cost /// if we flush the chain creation every time we run into a memory barrier. - bool CollectStores(BasicBlock *BB, BoUpSLP &R) { + bool collectStores(BasicBlock *BB, BoUpSLP &R) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { StoreInst *SI = dyn_cast(it); if (!SI) @@ -84,7 +85,79 @@ struct SLPVectorizer : public BasicBlockPass { return true; } - bool RollStoreChains(BoUpSLP &R) { + bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B, BoUpSLP &R) { + if (!A || !B) return false; + BoUpSLP::ValueList VL; + VL.push_back(A); + VL.push_back(B); + int Cost = R.getTreeCost(VL); + DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n"); + if (Cost >= -SLPCostThreshold) return false; + DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); + R.vectorizeArith(VL); + return true; + } + + bool tryToVectorizeCandidate(BinaryOperator *V, BoUpSLP &R) { + if (!V) return false; + BinaryOperator *A = dyn_cast(V->getOperand(0)); + BinaryOperator *B = dyn_cast(V->getOperand(1)); + // Try to vectorize V. + if (tryToVectorizePair(A, B, R)) return true; + + // Try to skip B. + if (B && B->hasOneUse()) { + BinaryOperator *B0 = dyn_cast(B->getOperand(0)); + BinaryOperator *B1 = dyn_cast(B->getOperand(1)); + if (tryToVectorizePair(A, B0, R)) { + B->moveBefore(V); + return true; + } + if (tryToVectorizePair(A, B1, R)) { + B->moveBefore(V); + return true; + } + } + + // Try to slip A. + if (A && A->hasOneUse()) { + BinaryOperator *A0 = dyn_cast(A->getOperand(0)); + BinaryOperator *A1 = dyn_cast(A->getOperand(1)); + if (tryToVectorizePair(A0, B, R)) { + A->moveBefore(V); + return true; + } + if (tryToVectorizePair(A1, B, R)) { + A->moveBefore(V); + return true; + } + } + return 0; + } + + bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { + bool Changed = false; + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + if (isa(it)) continue; + PHINode *P = dyn_cast(it); + if (!P) return Changed; + // Check that the PHI is a reduction PHI. + if (P->getNumIncomingValues() != 2) return Changed; + Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) : + (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0)); + // Check if this is a Binary Operator. + BinaryOperator *BI = dyn_cast_or_null(Rdx); + if (!BI) continue; + + Value *Inst = BI->getOperand(0); + if (Inst == P) Inst = BI->getOperand(1); + Changed |= tryToVectorizeCandidate(dyn_cast(Inst), R); + } + + return Changed; + } + + bool rollStoreChains(BoUpSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); @@ -116,13 +189,15 @@ struct SLPVectorizer : public BasicBlockPass { // he store instructions. BoUpSLP R(&BB, SE, DL, TTI, AA); - if (!CollectStores(&BB, R)) - return false; + bool Changed = vectorizeReductions(&BB, R); + + if (!collectStores(&BB, R)) + return Changed; - bool Changed = RollStoreChains(R); - if (Changed) { + if (rollStoreChains(R)) { DEBUG(dbgs()<<"SLP: vectorized in \""<getName()<<"\"\n"); DEBUG(verifyFunction(*BB.getParent())); + Changed |= true; } return Changed; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index d8be0ae721..4d075c505d 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -208,6 +208,16 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { return 0; } +void BoUpSLP::vectorizeArith(ValueList &Operands) { + Value *Vec = vectorizeTree(Operands, Operands.size()); + BasicBlock::iterator Loc = cast(Vec); + IRBuilder<> Builder(++Loc); + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); + Operands[i]->replaceAllUsesWith(S); + } +} + int BoUpSLP::getTreeCost(ValueList &VL) { // Get rid of the list of stores that were removed, and from the // lists of instructions with multiple users. diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index 808fb10f9f..f865236ff8 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -66,6 +66,9 @@ struct BoUpSLP { /// \returns true if the basic block was modified. bool vectorizeStores(StoreList &Stores, int costThreshold); + /// \brief Vectorize a group of scalars into a vector tree. + void vectorizeArith(ValueList &Operands); + private: /// \returns This method contains the recursive part of getTreeCost. int getTreeCost_rec(ValueList &VL, unsigned Depth); diff --git a/test/Transforms/SLPVectorizer/X86/reduction.ll b/test/Transforms/SLPVectorizer/X86/reduction.ll new file mode 100644 index 0000000000..ced9f15783 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/reduction.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.8.0" + +; int foo(double *A, int n, int m) { +; double sum = 0, v1 = 2, v0 = 3; +; for (int i=0; i < n; ++i) +; sum += 7*A[i*2] + 7*A[i*2+1]; +; return sum; +; } + +;CHECK: reduce +;CHECK: load <2 x double> +;CHECK: fmul <2 x double> +;CHECK: ret +define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) #0 { +entry: + %cmp13 = icmp sgt i32 %n, 0 + br i1 %cmp13, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ] + %mul = shl nsw i32 %i.015, 1 + %arrayidx = getelementptr inbounds double* %A, i32 %mul + %0 = load double* %arrayidx, align 4, !tbaa !0 + %mul1 = fmul double %0, 7.000000e+00 + %add12 = or i32 %mul, 1 + %arrayidx3 = getelementptr inbounds double* %A, i32 %add12 + %1 = load double* %arrayidx3, align 4, !tbaa !0 + %mul4 = fmul double %1, 7.000000e+00 + %add5 = fadd double %mul1, %mul4 + %add6 = fadd double %sum.014, %add5 + %inc = add nsw i32 %i.015, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: ; preds = %for.body + %phitmp = fptosi double %add6 to i32 + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + ret i32 %sum.0.lcssa +} + +attributes #0 = { nounwind readonly ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"double", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} -- cgit v1.2.3-70-g09d2 From ab105ae95fc473c19d9f0b019fc7c7a16d17b1a5 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Sun, 14 Apr 2013 05:15:53 +0000 Subject: SLPVectorizer: Add support for trees that don't start at binary operators, and add the cost of extracting values from the roots of the tree. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179475 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 15 ++++++---- lib/Transforms/Vectorize/VecUtils.cpp | 10 +++++++ lib/Transforms/Vectorize/VecUtils.h | 7 ++++- test/Transforms/SLPVectorizer/X86/reduction2.ll | 37 +++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 7 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/X86/reduction2.ll (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp') diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2f55a007f2..d94b2b2a0e 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -85,14 +85,16 @@ struct SLPVectorizer : public BasicBlockPass { return true; } - bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B, BoUpSLP &R) { + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; BoUpSLP::ValueList VL; VL.push_back(A); VL.push_back(B); int Cost = R.getTreeCost(VL); - DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n"); - if (Cost >= -SLPCostThreshold) return false; + int ExtrCost = R.getScalarizationCost(VL); + DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << + " Cost of extract:" << ExtrCost << ".\n"); + if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); R.vectorizeArith(VL); return true; @@ -100,11 +102,12 @@ struct SLPVectorizer : public BasicBlockPass { bool tryToVectorizeCandidate(BinaryOperator *V, BoUpSLP &R) { if (!V) return false; - BinaryOperator *A = dyn_cast(V->getOperand(0)); - BinaryOperator *B = dyn_cast(V->getOperand(1)); // Try to vectorize V. - if (tryToVectorizePair(A, B, R)) return true; + if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) + return true; + BinaryOperator *A = dyn_cast(V->getOperand(0)); + BinaryOperator *B = dyn_cast(V->getOperand(1)); // Try to skip B. if (B && B->hasOneUse()) { BinaryOperator *B0 = dyn_cast(B->getOperand(0)); diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index 4d075c505d..584f3d9778 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -173,6 +173,16 @@ bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) { return Changed; } +int BoUpSLP::getScalarizationCost(ValueList &VL) { + Type *ScalarTy = VL[0]->getType(); + + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + return getScalarizationCost(VecTy); +} + int BoUpSLP::getScalarizationCost(Type *Ty) { int Cost = 0; for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index f865236ff8..edebcb3e27 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -61,6 +61,11 @@ struct BoUpSLP { /// A negative number means that this is profitable. int getTreeCost(ValueList &VL); + /// \returns the scalarization cost for this ValueList. Assuming that this + /// subtree gets vectorized, we may need to extract the values from the + /// roots. This method calculates the cost of extracting the values. + int getScalarizationCost(ValueList &VL); + /// \brief Attempts to order and vectorize a sequence of stores. This /// function does a quadratic scan of the given stores. /// \returns true if the basic block was modified. @@ -118,7 +123,7 @@ private: /// by multiple lanes, or by users outside the tree. /// NOTICE: The vectorization methods also use this set. ValueSet MustScalarize; - + // Contains a list of values that are used outside the current tree. This // set must be reset between runs. ValueSet MultiUserVals; diff --git a/test/Transforms/SLPVectorizer/X86/reduction2.ll b/test/Transforms/SLPVectorizer/X86/reduction2.ll new file mode 100644 index 0000000000..9b5d5f701d --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.8.0" + +;CHECK: @foo +;CHECK: load <2 x double> +;CHECK: ret +define double @foo(double* nocapture %D) #0 { + br label %1 + +;