From 8383b539ff4c039108ee0c202a27b787621d96cf Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 9 Apr 2013 19:44:35 +0000
Subject: Add support for bottom-up SLP vectorization infrastructure.

This commit adds the infrastructure for performing bottom-up SLP vectorization (and other optimizations) on parallel computations.
The infrastructure has three potential users:

  1. The loop vectorizer needs to be able to vectorize array-of-structures (AoS) access patterns such as (sum += A[i] + A[i+1]).

  2. The BB-vectorizer needs this infrastructure for bottom-up SLP vectorization, because bottom-up vectorization is faster to compute.

  3. A loop-roller needs to be able to analyze consecutive chains and roll them into a loop, in order to reduce code size. A loop roller does not need to create vector instructions, and this infrastructure separates the chain analysis from the vectorization.

This patch also includes a simple (100 LOC) bottom-up SLP vectorizer that uses the infrastructure and can vectorize this code:

void SAXPY(int *x, int *y, int a, int i) {
  x[i]   = a * x[i]   + y[i];
  x[i+1] = a * x[i+1] + y[i+1];
  x[i+2] = a * x[i+2] + y[i+2];
  x[i+3] = a * x[i+3] + y[i+3];
}



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179117 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 153 +++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 lib/Transforms/Vectorize/SLPVectorizer.cpp

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
new file mode 100644
index 0000000000..4b61dc9120
--- /dev/null
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -0,0 +1,153 @@
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct vectorizable tree using the use-def chains. If a profitable tree
+// was found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE SV_NAME
+
+#include "VecUtils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+using namespace llvm;
+
+static cl::opt<int>
+SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden,
+                 cl::desc("Only vectorize trees if the gain is above this "
+                          "number. (gain = -cost of vectorization)"));
+namespace {
+
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public BasicBlockPass {
+  typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap;
+
+  /// Pass identification, replacement for typeid
+  static char ID;
+
+  explicit SLPVectorizer() : BasicBlockPass(ID) {
+    initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  TargetTransformInfo *TTI;
+  AliasAnalysis *AA;
+
+  /// \brief Collect memory references and sort them according to their base
+  /// object. We sort the stores to their base objects to reduce the cost of the
+  /// quadratic search on the stores. TODO: We can further reduce this cost
+  /// if we flush the chain creation every time we run into a memory barrier.
+  bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      // Can't vectorize instructions with side effects.
+      if (it->mayThrow())
+        return false;
+
+      StoreInst *SI = dyn_cast<StoreInst>(it);
+      if (!SI)
+        continue;
+
+      // Check that the pointer points to scalars.
+      if (SI->getValueOperand()->getType()->isAggregateType())
+        return false;
+
+      // Find the base of the GEP.
+      Value *Ptr = SI->getPointerOperand();
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+        Ptr = GEP->getPointerOperand();
+
+      // Save the store locations.
+      StoreRefs[Ptr].push_back(SI);
+    }
+    return true;
+  }
+
+  bool RollStoreChains(BoUpSLP &R) {
+    bool Changed = false;
+    // Attempt to sort and vectorize each of the store-groups.
+    for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
+         it != e; ++it) {
+      if (it->second.size() < 2)
+        continue;
+      Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+    }
+    return Changed;
+  }
+
+  virtual bool runOnBasicBlock(BasicBlock &BB) {
+    SE = &getAnalysis<ScalarEvolution>();
+    DL = getAnalysisIfAvailable<DataLayout>();
+    TTI = &getAnalysis<TargetTransformInfo>();
+    AA = &getAnalysis<AliasAnalysis>();
+    StoreRefs.clear();
+
+    // Use the bollom up slp vectorizer to construct chains that start with
+    // he store instructions.
+    BoUpSLP R(&BB, SE, DL, TTI, AA);
+
+    if (!CollectStores(&BB, R))
+      return false;
+
+    bool Changed = RollStoreChains(R);
+    if (Changed) {
+      DEBUG(dbgs()<<"Rolled chains in \""<<BB.getParent()->getName()<<"\"\n");
+      DEBUG(verifyFunction(*BB.getParent()));
+    }
+
+    return Changed;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    BasicBlockPass::getAnalysisUsage(AU);
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<TargetTransformInfo>();
+  }
+
+private:
+  StoreListMap StoreRefs;
+};
+
+} // end anonymous namespace
+
+char SLPVectorizer::ID = 0;
+static const char lv_name[] = "SLP Vectorizer";
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+namespace llvm {
+  Pass *createSLPVectorizerPass() {
+    return new SLPVectorizer();
+  }
+}
+
-- 
cgit v1.2.3-70-g09d2


From 20cd5e68626ff1af698201fb34f86a59e15c2ff8 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 10 Apr 2013 18:57:27 +0000
Subject: We require DataLayout for analyzing the size of stores.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179206 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 5 +++++
 lib/Transforms/Vectorize/VecUtils.cpp      | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4b61dc9120..01b2b92870 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -107,6 +107,11 @@ struct SLPVectorizer : public BasicBlockPass {
     AA = &getAnalysis<AliasAnalysis>();
     StoreRefs.clear();
 
+    // Must have DataLayout. We can't require it because some tests run w/o
+    // triple.
+    if (!DL)
+      return false;
+
     // Use the bollom up slp vectorizer to construct chains that start with
     // he store instructions.
     BoUpSLP R(&BB, SE, DL, TTI, AA);
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index 7e9f12d43c..3efaaf6edd 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -94,7 +94,7 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
   // The Instructions are connsecutive if the size of the first load/store is
   // the same as the offset.
-  unsigned Sz = (DL ? DL->getTypeStoreSize(Ty) : Ty->getScalarSizeInBits()/8);
+  unsigned Sz = DL->getTypeStoreSize(Ty);
   return ((-Offset) == Sz);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4b924d3a61442fb70773057d40789ed1e3187a77 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 10 Apr 2013 19:41:36 +0000
Subject: Make the SLP store-merger less paranoid about function calls. We
 check for function calls when we check if it is safe to sink instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179207 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp       |  4 ---
 test/Transforms/SLPVectorizer/X86/barriercall.ll | 40 ++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 4 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/barriercall.ll

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 01b2b92870..21bdec83f0 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -65,10 +65,6 @@ struct SLPVectorizer : public BasicBlockPass {
   /// if we flush the chain creation every time we run into a memory barrier.
   bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-      // Can't vectorize instructions with side effects.
-      if (it->mayThrow())
-        return false;
-
       StoreInst *SI = dyn_cast<StoreInst>(it);
       if (!SI)
         continue;
diff --git a/test/Transforms/SLPVectorizer/X86/barriercall.ll b/test/Transforms/SLPVectorizer/X86/barriercall.ll
new file mode 100644
index 0000000000..f520e129b6
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @foo
+;CHECK: store <4 x i32>
+;CHECK: ret
+define i32 @foo(i32* nocapture %A, i32 %n) #0 {
+entry:
+  %call = tail call i32 (...)* @bar() #2
+  %mul = mul nsw i32 %n, 5
+  %add = add nsw i32 %mul, 9
+  store i32 %add, i32* %A, align 4, !tbaa !0
+  %mul1 = mul nsw i32 %n, 9
+  %add2 = add nsw i32 %mul1, 9
+  %arrayidx3 = getelementptr inbounds i32* %A, i64 1
+  store i32 %add2, i32* %arrayidx3, align 4, !tbaa !0
+  %mul4 = shl i32 %n, 3
+  %add5 = add nsw i32 %mul4, 9
+  %arrayidx6 = getelementptr inbounds i32* %A, i64 2
+  store i32 %add5, i32* %arrayidx6, align 4, !tbaa !0
+  %mul7 = mul nsw i32 %n, 10
+  %add8 = add nsw i32 %mul7, 9
+  %arrayidx9 = getelementptr inbounds i32* %A, i64 3
+  store i32 %add8, i32* %arrayidx9, align 4, !tbaa !0
+  ret i32 undef
+}
+
+  ; The call to @bar in @foo above must not prevent vectorizing the stores that follow it.
+
+declare i32 @bar(...) #1
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-- 
cgit v1.2.3-70-g09d2


From 196ee11f85ce0148d2c2e33fbe1f1171ac5a8828 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 12 Apr 2013 21:11:14 +0000
Subject: Add debug prints.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179412 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 21bdec83f0..209d287743 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -91,6 +91,10 @@ struct SLPVectorizer : public BasicBlockPass {
          it != e; ++it) {
       if (it->second.size() < 2)
         continue;
+
+      DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " <<
+            it->second.size() << ".\n");
+
       Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
     }
     return Changed;
@@ -117,7 +121,7 @@ struct SLPVectorizer : public BasicBlockPass {
 
     bool Changed = RollStoreChains(R);
     if (Changed) {
-      DEBUG(dbgs()<<"Rolled chains in \""<<BB.getParent()->getName()<<"\"\n");
+      DEBUG(dbgs()<<"SLP: vectorized in \""<<BB.getParent()->getName()<<"\"\n");
       DEBUG(verifyFunction(*BB.getParent()));
     }
 
-- 
cgit v1.2.3-70-g09d2


From f7eaf29cf70a545f5b717c638db83ba6e8b6b3c5 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 14 Apr 2013 03:22:20 +0000
Subject: SLPVectorizer: add initial support for reduction variable
 vectorization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179470 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp     | 89 ++++++++++++++++++++++++--
 lib/Transforms/Vectorize/VecUtils.cpp          | 10 +++
 lib/Transforms/Vectorize/VecUtils.h            |  3 +
 test/Transforms/SLPVectorizer/X86/reduction.ll | 52 +++++++++++++++
 4 files changed, 147 insertions(+), 7 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/reduction.ll

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 209d287743..2f55a007f2 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -38,7 +39,7 @@
 using namespace llvm;
 
 static cl::opt<int>
-SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden,
+SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                  cl::desc("Only vectorize trees if the gain is above this "
                           "number. (gain = -cost of vectorization)"));
 namespace {
@@ -63,7 +64,7 @@ struct SLPVectorizer : public BasicBlockPass {
   /// object. We sort the stores to their base objects to reduce the cost of the
   /// quadratic search on the stores. TODO: We can further reduce this cost
   /// if we flush the chain creation every time we run into a memory barrier.
-  bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
+  bool collectStores(BasicBlock *BB, BoUpSLP &R) {
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
       StoreInst *SI = dyn_cast<StoreInst>(it);
       if (!SI)
@@ -84,7 +85,79 @@ struct SLPVectorizer : public BasicBlockPass {
     return true;
   }
 
-  bool RollStoreChains(BoUpSLP &R) {
+  bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B,  BoUpSLP &R) {
+    if (!A || !B) return false;
+    BoUpSLP::ValueList VL;
+    VL.push_back(A);
+    VL.push_back(B);
+    int Cost = R.getTreeCost(VL);
+    DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n");
+    if (Cost >= -SLPCostThreshold) return false;
+    DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
+    R.vectorizeArith(VL);
+    return true;
+  }
+
+  bool tryToVectorizeCandidate(BinaryOperator *V,  BoUpSLP &R) {
+    if (!V) return false;
+    BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+    BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+    // Try to vectorize V.
+    if (tryToVectorizePair(A, B, R)) return true;
+
+    // Try to skip B.
+    if (B && B->hasOneUse()) {
+      BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+      BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+      if (tryToVectorizePair(A, B0, R)) {
+        B->moveBefore(V);
+        return true;
+      }
+      if (tryToVectorizePair(A, B1, R)) {
+        B->moveBefore(V);
+        return true;
+      }
+    }
+
+    // Try to slip A.
+    if (A && A->hasOneUse()) {
+      BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+      BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+      if (tryToVectorizePair(A0, B, R)) {
+        A->moveBefore(V);
+        return true;
+      }
+      if (tryToVectorizePair(A1, B, R)) {
+        A->moveBefore(V);
+        return true;
+      }
+    }
+    return 0;
+  }
+
+  bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+    bool Changed = false;
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      if (isa<DbgInfoIntrinsic>(it)) continue;
+      PHINode *P = dyn_cast<PHINode>(it);
+      if (!P) return Changed;
+      // Check that the PHI is a reduction PHI.
+      if (P->getNumIncomingValues() != 2) return Changed;
+      Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
+                   (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
+      // Check if this is a Binary Operator.
+      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+      if (!BI) continue;
+
+      Value *Inst = BI->getOperand(0);
+      if (Inst == P) Inst = BI->getOperand(1);
+      Changed |= tryToVectorizeCandidate(dyn_cast<BinaryOperator>(Inst), R);
+    }
+
+    return Changed;
+  }
+
+  bool rollStoreChains(BoUpSLP &R) {
     bool Changed = false;
     // Attempt to sort and vectorize each of the store-groups.
     for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@@ -116,13 +189,15 @@ struct SLPVectorizer : public BasicBlockPass {
     // he store instructions.
     BoUpSLP R(&BB, SE, DL, TTI, AA);
 
-    if (!CollectStores(&BB, R))
-      return false;
+    bool Changed = vectorizeReductions(&BB, R);
+
+    if (!collectStores(&BB, R))
+      return Changed;
 
-    bool Changed = RollStoreChains(R);
-    if (Changed) {
+    if (rollStoreChains(R)) {
       DEBUG(dbgs()<<"SLP: vectorized in \""<<BB.getParent()->getName()<<"\"\n");
       DEBUG(verifyFunction(*BB.getParent()));
+      Changed |= true;
     }
 
     return Changed;
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index d8be0ae721..4d075c505d 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -208,6 +208,16 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
   return 0;
 }
 
+void BoUpSLP::vectorizeArith(ValueList &Operands) {
+  Value *Vec = vectorizeTree(Operands, Operands.size());
+  BasicBlock::iterator Loc = cast<Instruction>(Vec);
+  IRBuilder<> Builder(++Loc);
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
+    Operands[i]->replaceAllUsesWith(S);
+  }
+}
+
 int BoUpSLP::getTreeCost(ValueList &VL) {
   // Get rid of the list of stores that were removed, and from the
   // lists of instructions with multiple users.
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
index 808fb10f9f..f865236ff8 100644
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -66,6 +66,9 @@ struct BoUpSLP  {
   /// \returns true if the basic block was modified.
   bool vectorizeStores(StoreList &Stores, int costThreshold);
 
+  /// \brief Vectorize a group of scalars into a vector tree.
+  void vectorizeArith(ValueList &Operands);
+
 private:
   /// \returns This method contains the recursive part of getTreeCost.
   int getTreeCost_rec(ValueList &VL, unsigned Depth);
diff --git a/test/Transforms/SLPVectorizer/X86/reduction.ll b/test/Transforms/SLPVectorizer/X86/reduction.ll
new file mode 100644
index 0000000000..ced9f15783
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/reduction.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+; int foo(double *A, int n, int m) {
+;   double sum = 0, v1 = 2, v0 = 3;
+;   for (int i=0; i < n; ++i)
+;     sum += 7*A[i*2] + 7*A[i*2+1];
+;   return sum;
+; }
+
+;CHECK: reduce
+;CHECK: load <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: ret
+define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) #0 {
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ]
+  %mul = shl nsw i32 %i.015, 1
+  %arrayidx = getelementptr inbounds double* %A, i32 %mul
+  %0 = load double* %arrayidx, align 4, !tbaa !0
+  %mul1 = fmul double %0, 7.000000e+00
+  %add12 = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds double* %A, i32 %add12
+  %1 = load double* %arrayidx3, align 4, !tbaa !0
+  %mul4 = fmul double %1, 7.000000e+00
+  %add5 = fadd double %mul1, %mul4
+  %add6 = fadd double %sum.014, %add5
+  %inc = add nsw i32 %i.015, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  %phitmp = fptosi double %add6 to i32
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+attributes #0 = { nounwind readonly ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-- 
cgit v1.2.3-70-g09d2


From ab105ae95fc473c19d9f0b019fc7c7a16d17b1a5 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 14 Apr 2013 05:15:53 +0000
Subject: SLPVectorizer: Add support for trees that don't start at binary
 operators, and add the cost of extracting values from the roots of the tree.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179475 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp      | 15 ++++++----
 lib/Transforms/Vectorize/VecUtils.cpp           | 10 +++++++
 lib/Transforms/Vectorize/VecUtils.h             |  7 ++++-
 test/Transforms/SLPVectorizer/X86/reduction2.ll | 37 +++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 7 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/reduction2.ll

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2f55a007f2..d94b2b2a0e 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -85,14 +85,16 @@ struct SLPVectorizer : public BasicBlockPass {
     return true;
   }
 
-  bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B,  BoUpSLP &R) {
+  bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
     if (!A || !B) return false;
     BoUpSLP::ValueList VL;
     VL.push_back(A);
     VL.push_back(B);
     int Cost = R.getTreeCost(VL);
-    DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n");
-    if (Cost >= -SLPCostThreshold) return false;
+    int ExtrCost = R.getScalarizationCost(VL);
+    DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
+                  " Cost of extract:" << ExtrCost << ".\n");
+    if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
     DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
     R.vectorizeArith(VL);
     return true;
@@ -100,11 +102,12 @@ struct SLPVectorizer : public BasicBlockPass {
 
   bool tryToVectorizeCandidate(BinaryOperator *V,  BoUpSLP &R) {
     if (!V) return false;
-    BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
-    BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
     // Try to vectorize V.
-    if (tryToVectorizePair(A, B, R)) return true;
+    if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+      return true;
 
+    BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+    BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
     // Try to skip B.
     if (B && B->hasOneUse()) {
       BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index 4d075c505d..584f3d9778 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -173,6 +173,16 @@ bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
   return Changed;
 }
 
+int BoUpSLP::getScalarizationCost(ValueList &VL) {
+  Type *ScalarTy = VL[0]->getType();
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+  return getScalarizationCost(VecTy);
+}
+
 int BoUpSLP::getScalarizationCost(Type *Ty) {
   int Cost = 0;
   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
index f865236ff8..edebcb3e27 100644
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -61,6 +61,11 @@ struct BoUpSLP  {
   /// A negative number means that this is profitable.
   int getTreeCost(ValueList &VL);
 
+  /// \returns the scalarization cost for this ValueList. Assuming that this
+  /// subtree gets vectorized, we may need to extract the values from the
+  /// roots. This method calculates the cost of extracting the values.
+  int getScalarizationCost(ValueList &VL);
+
   /// \brief Attempts to order and vectorize a sequence of stores. This
   /// function does a quadratic scan of the given stores.
   /// \returns true if the basic block was modified.
@@ -118,7 +123,7 @@ private:
   /// by multiple lanes, or by users outside the tree.
   /// NOTICE: The vectorization methods also use this set.
   ValueSet MustScalarize;
-  
+
   // Contains a list of values that are used outside the current tree. This
   // set must be reset between runs.
   ValueSet MultiUserVals;
diff --git a/test/Transforms/SLPVectorizer/X86/reduction2.ll b/test/Transforms/SLPVectorizer/X86/reduction2.ll
new file mode 100644
index 0000000000..9b5d5f701d
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/reduction2.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+;CHECK: @foo
+;CHECK: load <2 x double>
+;CHECK: ret
+define double @foo(double* nocapture %D) #0 {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %i.02 = phi i32 [ 0, %0 ], [ %10, %1 ]
+  %sum.01 = phi double [ 0.000000e+00, %0 ], [ %9, %1 ]
+  %2 = shl nsw i32 %i.02, 1
+  %3 = getelementptr inbounds double* %D, i32 %2
+  %4 = load double* %3, align 4, !tbaa !0
+  %A4 = fmul double %4, %4
+  %5 = or i32 %2, 1
+  %6 = getelementptr inbounds double* %D, i32 %5
+  %7 = load double* %6, align 4, !tbaa !0
+  %A7 = fmul double %7, %7
+  %8 = fadd double %A4, %A7
+  %9 = fadd double %sum.01, %8
+  %10 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %10, 100
+  br i1 %exitcond, label %11, label %1
+
+; <label>:11                                      ; preds = %1
+  ret double %9
+}
+
+attributes #0 = { nounwind readonly ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-- 
cgit v1.2.3-70-g09d2


From 09616565dd41192c745605828b4f51f787a53bcf Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 15 Apr 2013 04:25:27 +0000
Subject: SLPVectorizer: Add support for vectorizing trees that start at
 compare instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179504 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp         | 61 ++++++++++++++--------
 .../Transforms/SLPVectorizer/X86/compare-reduce.ll | 53 +++++++++++++++++++
 2 files changed, 93 insertions(+), 21 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/compare-reduce.ll

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d94b2b2a0e..ea33801fd2 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -100,7 +100,7 @@ struct SLPVectorizer : public BasicBlockPass {
     return true;
   }
 
-  bool tryToVectorizeCandidate(BinaryOperator *V,  BoUpSLP &R) {
+  bool tryToVectorize(BinaryOperator *V,  BoUpSLP &R) {
     if (!V) return false;
     // Try to vectorize V.
     if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
@@ -142,25 +142,42 @@ struct SLPVectorizer : public BasicBlockPass {
     bool Changed = false;
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
       if (isa<DbgInfoIntrinsic>(it)) continue;
-      PHINode *P = dyn_cast<PHINode>(it);
-      if (!P) return Changed;
-      // Check that the PHI is a reduction PHI.
-      if (P->getNumIncomingValues() != 2) return Changed;
-      Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
-                   (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
-      // Check if this is a Binary Operator.
-      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
-      if (!BI) continue;
-
-      Value *Inst = BI->getOperand(0);
-      if (Inst == P) Inst = BI->getOperand(1);
-      Changed |= tryToVectorizeCandidate(dyn_cast<BinaryOperator>(Inst), R);
+
+      // Try to vectorize reductions that use PHINodes.
+      if (PHINode *P = dyn_cast<PHINode>(it)) {
+        // Check that the PHI is a reduction PHI.
+        if (P->getNumIncomingValues() != 2) return Changed;
+        Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
+                     (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) :
+                      0));
+        // Check if this is a Binary Operator.
+        BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+        if (!BI)
+          continue;
+
+        Value *Inst = BI->getOperand(0);
+        if (Inst == P) Inst = BI->getOperand(1);
+        Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+        continue;
+      }
+
+      // Try to vectorize trees that start at compare instructions.
+      if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
+        if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
+          Changed |= true;
+          continue;
+        }
+        for (int i = 0; i < 2; ++i)
+          if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
+            Changed |= tryToVectorize(BI, R);
+        continue;
+      }
     }
 
     return Changed;
   }
 
-  bool rollStoreChains(BoUpSLP &R) {
+  bool vectorizeStoreChains(BoUpSLP &R) {
     bool Changed = false;
     // Attempt to sort and vectorize each of the store-groups.
     for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@@ -192,17 +209,19 @@ struct SLPVectorizer : public BasicBlockPass {
     // he store instructions.
     BoUpSLP R(&BB, SE, DL, TTI, AA);
 
+    // Vectorize trees that end at reductions.
     bool Changed = vectorizeReductions(&BB, R);
 
-    if (!collectStores(&BB, R))
-      return Changed;
+    // Vectorize trees that end at stores.
+    if (collectStores(&BB, R)) {
+      DEBUG(dbgs()<<"SLP: Found stores to vectorize.\n");
+      Changed |= vectorizeStoreChains(R);
+    }
 
-    if (rollStoreChains(R)) {
-      DEBUG(dbgs()<<"SLP: vectorized in \""<<BB.getParent()->getName()<<"\"\n");
+    if (Changed) {
+      DEBUG(dbgs()<<"SLP: vectorized \""<<BB.getParent()->getName()<<"\"\n");
       DEBUG(verifyFunction(*BB.getParent()));
-      Changed |= true;
     }
-
     return Changed;
   }
 
diff --git a/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
new file mode 100644
index 0000000000..05f8e616bb
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@.str = private unnamed_addr constant [6 x i8] c"bingo\00", align 1
+
+;CHECK: @reduce_compare
+;CHECK: load <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: fadd <2 x double>
+;CHECK: extractelement
+;CHECK: extractelement
+;CHECK: ret
+define void @reduce_compare(double* nocapture %A, i32 %n) {
+entry:
+  %conv = sitofp i32 %n to double
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %0 = shl nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds double* %A, i64 %0
+  %1 = load double* %arrayidx, align 8
+  %mul1 = fmul double %conv, %1
+  %mul2 = fmul double %mul1, 7.000000e+00
+  %add = fadd double %mul2, 5.000000e+00
+  %2 = or i64 %0, 1
+  %arrayidx6 = getelementptr inbounds double* %A, i64 %2
+  %3 = load double* %arrayidx6, align 8
+  %mul8 = fmul double %conv, %3
+  %mul9 = fmul double %mul8, 4.000000e+00
+  %add10 = fadd double %mul9, 9.000000e+00
+  %cmp11 = fcmp ogt double %add, %add10
+  br i1 %cmp11, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0))
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}
+
+declare i32 @printf(i8* nocapture, ...)
+
-- 
cgit v1.2.3-70-g09d2


From e9a4411db4d3a05965630f668daf8071bf2d3513 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 15 Apr 2013 22:00:26 +0000
Subject: SLPVectorizer: Make it a function pass and add code for hoisting the
 vector-gather sequence out of loops.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179562 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/PassManagerBuilder.cpp  |   6 +-
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 367 +++++++++++++++++------------
 lib/Transforms/Vectorize/VecUtils.cpp      |   9 +-
 lib/Transforms/Vectorize/VecUtils.h        |  37 ++-
 test/Transforms/SLPVectorizer/X86/hoist.ll |  59 +++++
 5 files changed, 315 insertions(+), 163 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/hoist.ll

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 60957d2c31..ffd07b6afc 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -213,10 +213,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
 
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
 
-  if (SLPVectorize) {
-    MPM.add(createSLPVectorizerPass());
-    MPM.add(createEarlyCSEPass());
-  }
+  if (SLPVectorize)
+    MPM.add(createSLPVectorizerPass());     // Vectorize parallel scalar chains.
 
   if (BBVectorize) {
     MPM.add(createBBVectorizePass());
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ea33801fd2..6d4c36aacd 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/Verifier.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -45,13 +46,13 @@ SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
 namespace {
 
 /// The SLPVectorizer Pass.
-struct SLPVectorizer : public BasicBlockPass {
+struct SLPVectorizer : public FunctionPass {
   typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap;
 
   /// Pass identification, replacement for typeid
   static char ID;
 
-  explicit SLPVectorizer() : BasicBlockPass(ID) {
+  explicit SLPVectorizer() : FunctionPass(ID) {
     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
   }
 
@@ -59,182 +60,256 @@ struct SLPVectorizer : public BasicBlockPass {
   DataLayout *DL;
   TargetTransformInfo *TTI;
   AliasAnalysis *AA;
+  LoopInfo *LI;
+
+  virtual bool runOnFunction(Function &F) {
+    SE = &getAnalysis<ScalarEvolution>();
+    DL = getAnalysisIfAvailable<DataLayout>();
+    TTI = &getAnalysis<TargetTransformInfo>();
+    AA = &getAnalysis<AliasAnalysis>();
+    LI = &getAnalysis<LoopInfo>();
+
+    StoreRefs.clear();
+    bool Changed = false;
+
+    // Must have DataLayout. We can't require it because some tests run w/o
+    // triple.
+    if (!DL)
+      return false;
+
+    for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) {
+      BasicBlock *BB = it;
+      bool BBChanged = false;
+
+      // Use the bollom up slp vectorizer to construct chains that start with
+      // he store instructions.
+      BoUpSLP R(BB, SE, DL, TTI, AA);
+
+      // Vectorize trees that end at reductions.
+      BBChanged |= vectorizeReductions(BB, R);
+
+      // Vectorize trees that end at stores.
+      if (collectStores(BB, R)) {
+        DEBUG(dbgs()<<"SLP: Found stores to vectorize.\n");
+        BBChanged |= vectorizeStoreChains(R);
+      }
+
+      // Try to hoist some of the scalarization code to the preheader.
+      if (BBChanged) hoistGatherSequence(LI, BB, R);
+
+      Changed |= BBChanged;
+    }
+
+    if (Changed) {
+      DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n");
+      DEBUG(verifyFunction(F));
+    }
+    return Changed;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    FunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<TargetTransformInfo>();
+    AU.addRequired<LoopInfo>();
+  }
+
+private:
 
   /// \brief Collect memory references and sort them according to their base
   /// object. We sort the stores to their base objects to reduce the cost of the
   /// quadratic search on the stores. TODO: We can further reduce this cost
   /// if we flush the chain creation every time we run into a memory barrier.
-  bool collectStores(BasicBlock *BB, BoUpSLP &R) {
-    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-      StoreInst *SI = dyn_cast<StoreInst>(it);
-      if (!SI)
-        continue;
+  bool collectStores(BasicBlock *BB, BoUpSLP &R);
 
-      // Check that the pointer points to scalars.
-      if (SI->getValueOperand()->getType()->isAggregateType())
-        return false;
+  /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
+  bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R);
 
-      // Find the base of the GEP.
-      Value *Ptr = SI->getPointerOperand();
-      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
-        Ptr = GEP->getPointerOperand();
+  /// \brief Try to vectorize a chain that may start at the operands of \p V.
+  bool tryToVectorize(BinaryOperator *V,  BoUpSLP &R);
 
-      // Save the store locations.
-      StoreRefs[Ptr].push_back(SI);
-    }
-    return true;
+  /// \brief Vectorize the stores that were collected in StoreRefs.
+  bool vectorizeStoreChains(BoUpSLP &R);
+
+  /// \brief Try to hoist gather sequences outside of the loop in cases where
+  /// all of the sources are loop invariant.
+  void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
+
+  /// \brief Scan the basic block and look for reductions that may start a
+  /// vectorization chain.
+  bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R);
+
+private:
+  StoreListMap StoreRefs;
+};
+
+bool SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
+  StoreRefs.clear();
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    StoreInst *SI = dyn_cast<StoreInst>(it);
+    if (!SI)
+      continue;
+
+    // Check that the pointer points to scalars.
+    if (SI->getValueOperand()->getType()->isAggregateType())
+      return false;
+
+    // Find the base of the GEP.
+    Value *Ptr = SI->getPointerOperand();
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+      Ptr = GEP->getPointerOperand();
+
+    // Save the store locations.
+    StoreRefs[Ptr].push_back(SI);
   }
+  return true;
+}
+
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
+  if (!A || !B) return false;
+  BoUpSLP::ValueList VL;
+  VL.push_back(A);
+  VL.push_back(B);
+  int Cost = R.getTreeCost(VL);
+  int ExtrCost = R.getScalarizationCost(VL);
+  DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
+        " Cost of extract:" << ExtrCost << ".\n");
+  if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
+  DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
+  R.vectorizeArith(VL);
+  return true;
+}
 
-  bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
-    if (!A || !B) return false;
-    BoUpSLP::ValueList VL;
-    VL.push_back(A);
-    VL.push_back(B);
-    int Cost = R.getTreeCost(VL);
-    int ExtrCost = R.getScalarizationCost(VL);
-    DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
-                  " Cost of extract:" << ExtrCost << ".\n");
-    if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
-    DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
-    R.vectorizeArith(VL);
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V,  BoUpSLP &R) {
+  if (!V) return false;
+  // Try to vectorize V.
+  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
     return true;
-  }
 
-  bool tryToVectorize(BinaryOperator *V,  BoUpSLP &R) {
-    if (!V) return false;
-    // Try to vectorize V.
-    if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+  BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+  BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+  // Try to skip B.
+  if (B && B->hasOneUse()) {
+    BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+    BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+    if (tryToVectorizePair(A, B0, R)) {
+      B->moveBefore(V);
       return true;
-
-    BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
-    BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
-    // Try to skip B.
-    if (B && B->hasOneUse()) {
-      BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
-      BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
-      if (tryToVectorizePair(A, B0, R)) {
-        B->moveBefore(V);
-        return true;
-      }
-      if (tryToVectorizePair(A, B1, R)) {
-        B->moveBefore(V);
-        return true;
-      }
     }
-
-    // Try to slip A.
-    if (A && A->hasOneUse()) {
-      BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
-      BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
-      if (tryToVectorizePair(A0, B, R)) {
-        A->moveBefore(V);
-        return true;
-      }
-      if (tryToVectorizePair(A1, B, R)) {
-        A->moveBefore(V);
-        return true;
-      }
+    if (tryToVectorizePair(A, B1, R)) {
+      B->moveBefore(V);
+      return true;
     }
-    return 0;
   }
 
-  bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
-    bool Changed = false;
-    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-      if (isa<DbgInfoIntrinsic>(it)) continue;
-
-      // Try to vectorize reductions that use PHINodes.
-      if (PHINode *P = dyn_cast<PHINode>(it)) {
-        // Check that the PHI is a reduction PHI.
-        if (P->getNumIncomingValues() != 2) return Changed;
-        Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
-                     (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) :
-                      0));
-        // Check if this is a Binary Operator.
-        BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
-        if (!BI)
-          continue;
-
-        Value *Inst = BI->getOperand(0);
-        if (Inst == P) Inst = BI->getOperand(1);
-        Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
-        continue;
-      }
-
-      // Try to vectorize trees that start at compare instructions.
-      if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
-        if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
-          Changed |= true;
-          continue;
-        }
-        for (int i = 0; i < 2; ++i)
-          if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
-            Changed |= tryToVectorize(BI, R);
-        continue;
-      }
+  // Try to skip A.
+  if (A && A->hasOneUse()) {
+    BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+    BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+    if (tryToVectorizePair(A0, B, R)) {
+      A->moveBefore(V);
+      return true;
+    }
+    if (tryToVectorizePair(A1, B, R)) {
+      A->moveBefore(V);
+      return true;
     }
-
-    return Changed;
   }
+  return 0;
+}
 
-  bool vectorizeStoreChains(BoUpSLP &R) {
-    bool Changed = false;
-    // Attempt to sort and vectorize each of the store-groups.
-    for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
-         it != e; ++it) {
-      if (it->second.size() < 2)
+bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+  bool Changed = false;
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    if (isa<DbgInfoIntrinsic>(it)) continue;
+
+    // Try to vectorize reductions that use PHINodes.
+    if (PHINode *P = dyn_cast<PHINode>(it)) {
+      // Check that the PHI is a reduction PHI.
+      if (P->getNumIncomingValues() != 2) return Changed;
+      Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
+                    (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) :
+                     0));
+      // Check if this is a Binary Operator.
+      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+      if (!BI)
         continue;
 
-      DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " <<
-            it->second.size() << ".\n");
+      Value *Inst = BI->getOperand(0);
+      if (Inst == P) Inst = BI->getOperand(1);
+      Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+      continue;
+    }
 
-      Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+    // Try to vectorize trees that start at compare instructions.
+    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
+      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
+        Changed |= true;
+        continue;
+      }
+      for (int i = 0; i < 2; ++i)
+        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
+          Changed |= tryToVectorize(BI, R);
+      continue;
     }
-    return Changed;
   }
 
-  virtual bool runOnBasicBlock(BasicBlock &BB) {
-    SE = &getAnalysis<ScalarEvolution>();
-    DL = getAnalysisIfAvailable<DataLayout>();
-    TTI = &getAnalysis<TargetTransformInfo>();
-    AA = &getAnalysis<AliasAnalysis>();
-    StoreRefs.clear();
-
-    // Must have DataLayout. We can't require it because some tests run w/o
-    // triple.
-    if (!DL)
-      return false;
-
-    // Use the bollom up slp vectorizer to construct chains that start with
-    // he store instructions.
-    BoUpSLP R(&BB, SE, DL, TTI, AA);
+  return Changed;
+}
 
-    // Vectorize trees that end at reductions.
-    bool Changed = vectorizeReductions(&BB, R);
+bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
+  bool Changed = false;
+  // Attempt to sort and vectorize each of the store-groups.
+  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
+       it != e; ++it) {
+    if (it->second.size() < 2)
+      continue;
 
-    // Vectorize trees that end at stores.
-    if (collectStores(&BB, R)) {
-      DEBUG(dbgs()<<"SLP: Found stores to vectorize.\n");
-      Changed |= vectorizeStoreChains(R);
-    }
+    DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " <<
+          it->second.size() << ".\n");
 
-    if (Changed) {
-      DEBUG(dbgs()<<"SLP: vectorized \""<<BB.getParent()->getName()<<"\"\n");
-      DEBUG(verifyFunction(*BB.getParent()));
-    }
-    return Changed;
+    Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
   }
+  return Changed;
+}
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    BasicBlockPass::getAnalysisUsage(AU);
-    AU.addRequired<ScalarEvolution>();
-    AU.addRequired<AliasAnalysis>();
-    AU.addRequired<TargetTransformInfo>();
+void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB,
+                                        BoUpSLP &R) {
+  // Check if this block is inside a loop.
+  Loop *L = LI->getLoopFor(BB);
+  if (!L)
+    return;
+
+  // Check if it has a preheader.
+  BasicBlock *PreHeader = L->getLoopPreheader();
+  if (!PreHeader)
+    return;
+
+  // Mark the insertion point for the block.
+  Instruction *Location = PreHeader->getTerminator();
+
+  BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions();
+  for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end();
+       it != e; ++it) {
+    InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+
+    // The InsertElement sequence can be simplified into a constant.
+    if (!Insert)
+      continue;
+
+    // If the vector or the element that we insert into it is an
+    // instruction that is defined inside this loop then we can't
+    // hoist this instruction.
+    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
+    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
+    if (CurrVec && L->contains(CurrVec)) continue;
+    if (NewElem && L->contains(NewElem)) continue;
+
+    // We can hoist this instruction. Move it to the pre-header.
+    Insert->moveBefore(Location);
   }
-
-private:
-  StoreListMap StoreRefs;
-};
+}
 
 } // end anonymous namespace
 
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index c85711532d..a69646336c 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -511,8 +511,15 @@ Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) {
 Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
   IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
   Value *Vec = UndefValue::get(Ty);
-  for (unsigned i=0; i < Ty->getNumElements(); ++i)
+  for (unsigned i=0; i < Ty->getNumElements(); ++i) {
+    // Generate the 'InsertElement' instruction.
     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+    // Remember that this instruction is used as part of a 'gather' sequence.
+    // The caller of the bottom-up slp vectorizer can try to hoist the sequence
+    // if the users are outside of the basic block.
+    GatherInstructions.push_back(Vec);
+  }
+
   return Vec;
 }
 
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
index 03512bf8c3..fed5178b80 100644
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -71,6 +71,11 @@ struct BoUpSLP  {
   /// \brief Vectorize a group of scalars into a vector tree.
   void vectorizeArith(ValueList &Operands);
 
+  /// \returns the list of new instructions that were added in order to collect
+  /// scalars into vectors. This list can be used to further optimize the gather
+  /// sequences.
+  ValueList &getGatherSeqInstructions() {return GatherInstructions; }
+
 private:
   /// \brief This method contains the recursive part of getTreeCost.
   int getTreeCost_rec(ValueList &VL, unsigned Depth);
@@ -107,11 +112,11 @@ private:
 
   /// \returns a vector from a collection of scalars in \p VL.
   Value *Scalarize(ValueList &VL, VectorType *Ty);
-
+  
 private:
-  // Maps instructions to numbers and back.
+  /// Maps instructions to numbers and back.
   SmallDenseMap<Value*, int> InstrIdx;
-  // Maps integers to Instructions.
+  /// Maps integers to Instructions.
   std::vector<Instruction*> InstrVec;
 
   // -- containers that are used during getTreeCost -- //
@@ -121,21 +126,29 @@ private:
   /// NOTICE: The vectorization methods also use this set.
   ValueSet MustScalarize;
 
-  // Contains a list of values that are used outside the current tree. This
-  // set must be reset between runs.
+  /// Contains a list of values that are used outside the current tree. This
+  /// set must be reset between runs.
   ValueSet MultiUserVals;
-  // Maps values in the tree to the vector lanes that uses them. This map must
-  // be reset between runs of getCost.
+  /// Maps values in the tree to the vector lanes that uses them. This map must
+  /// be reset between runs of getCost.
   std::map<Value*, int> LaneMap;
-  // A list of instructions to ignore while sinking
-  // memory instructions. This map must be reset between runs of getCost.
+  /// A list of instructions to ignore while sinking
+  /// memory instructions. This map must be reset between runs of getCost.
   SmallPtrSet<Value *, 8> MemBarrierIgnoreList;
 
-  // -- containers that are used during vectorizeTree -- //
-  // Maps between the first scalar to the vector. This map must be reset between
-  // runs.
+  // -- Containers that are used during vectorizeTree -- //
+
+  /// Maps between the first scalar to the vector. This map must be reset
+  /// between runs.
   DenseMap<Value*, Value*> VectorizedValues;
 
+  // -- Containers that are used after vectorization by the caller -- //
+
+  /// A list of instructions that are used when gathering scalars into vectors.
+  /// In many cases these instructions can be hoisted outside of the BB.
+  /// Iterating over this list is faster than calling LICM.
+  ValueList GatherInstructions;
+
   // Analysis and block reference.
   BasicBlock *BB;
   ScalarEvolution *SE;
diff --git a/test/Transforms/SLPVectorizer/X86/hoist.ll b/test/Transforms/SLPVectorizer/X86/hoist.ll
new file mode 100644
index 0000000000..5074ceaaab
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/hoist.ll
@@ -0,0 +1,59 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+;int foo(int *A, int n, int k) {
+;  for (int i=0; i < 10000; i+=4) {
+;    A[i]   += n;
+;    A[i+1] += k;
+;    A[i+2] += n;
+;    A[i+3] += k;
+;  }
+;}
+
+; preheader:
+;CHECK: entry
+;CHECK-NEXT: insertelement
+;CHECK-NEXT: insertelement
+;CHECK-NEXT: insertelement
+;CHECK-NEXT: insertelement
+; loop body:
+;CHECK: phi
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.024 = phi i32 [ 0, %entry ], [ %add10, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.024
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %n
+  store i32 %add, i32* %arrayidx, align 4
+  %add121 = or i32 %i.024, 1
+  %arrayidx2 = getelementptr inbounds i32* %A, i32 %add121
+  %1 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %1, %k
+  store i32 %add3, i32* %arrayidx2, align 4
+  %add422 = or i32 %i.024, 2
+  %arrayidx5 = getelementptr inbounds i32* %A, i32 %add422
+  %2 = load i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %2, %n
+  store i32 %add6, i32* %arrayidx5, align 4
+  %add723 = or i32 %i.024, 3
+  %arrayidx8 = getelementptr inbounds i32* %A, i32 %add723
+  %3 = load i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %3, %k
+  store i32 %add9, i32* %arrayidx8, align 4
+  %add10 = add nsw i32 %i.024, 4
+  %cmp = icmp slt i32 %add10, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 undef
+}
+
-- 
cgit v1.2.3-70-g09d2


From ef332b1ca1721be962c73e76b4c4e0e44ffaf5d9 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Apr 2013 05:23:11 +0000
Subject: Report the number of stores that were found in the debug message.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179929 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6d4c36aacd..a9ec243bc0 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -89,8 +89,8 @@ struct SLPVectorizer : public FunctionPass {
       BBChanged |= vectorizeReductions(BB, R);
 
       // Vectorize trees that end at stores.
-      if (collectStores(BB, R)) {
-        DEBUG(dbgs()<<"SLP: Found stores to vectorize.\n");
+      if (unsigned count = collectStores(BB, R)) {
+        DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n");
         BBChanged |= vectorizeStoreChains(R);
       }
 
@@ -121,7 +121,7 @@ private:
   /// object. We sort the stores to their base objects to reduce the cost of the
   /// quadratic search on the stores. TODO: We can further reduce this cost
   /// if we flush the chain creation every time we run into a memory barrier.
-  bool collectStores(BasicBlock *BB, BoUpSLP &R);
+  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
 
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R);
@@ -144,7 +144,8 @@ private:
   StoreListMap StoreRefs;
 };
 
-bool SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
+  unsigned count = 0;
   StoreRefs.clear();
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     StoreInst *SI = dyn_cast<StoreInst>(it);
@@ -153,7 +154,7 @@ bool SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
 
     // Check that the pointer points to scalars.
     if (SI->getValueOperand()->getType()->isAggregateType())
-      return false;
+      return 0;
 
     // Find the base of the GEP.
     Value *Ptr = SI->getPointerOperand();
@@ -162,8 +163,9 @@ bool SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
 
     // Save the store locations.
     StoreRefs[Ptr].push_back(SI);
+    count++;
   }
-  return true;
+  return count;
 }
 
 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
-- 
cgit v1.2.3-70-g09d2


From 1d2ad834f2ff2bc5620dbba83dce6d2477023429 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Apr 2013 06:13:47 +0000
Subject: SLPVectorizer: Improve the cost model for loop invariant broadcast
 values.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179930 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp         |  2 +-
 lib/Transforms/Vectorize/VecUtils.cpp              | 31 ++++++---
 lib/Transforms/Vectorize/VecUtils.h                |  6 +-
 test/Transforms/SLPVectorizer/X86/loopinvariant.ll | 73 ++++++++++++++++++++++
 4 files changed, 101 insertions(+), 11 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/loopinvariant.ll

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a9ec243bc0..024dd337ac 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -83,7 +83,7 @@ struct SLPVectorizer : public FunctionPass {
 
       // Use the bollom up slp vectorizer to construct chains that start with
       // he store instructions.
-      BoUpSLP R(BB, SE, DL, TTI, AA);
+      BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
 
       // Vectorize trees that end at reductions.
       BBChanged |= vectorizeReductions(BB, R);
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index c08bed7170..244af394b1 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/Verifier.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
@@ -44,8 +45,8 @@ static const unsigned RecursionMaxDepth = 6;
 namespace llvm {
 
 BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
-                             TargetTransformInfo *Tti, AliasAnalysis *Aa) :
-                             BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa) {
+                 TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) :
+  BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp)  {
   numberInstructions();
 }
 
@@ -121,7 +122,7 @@ bool BoUpSLP::vectorizeStoreChain(ValueList &Chain, int CostThreshold) {
     if (Cost < CostThreshold) {
       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
       vectorizeTree(Operands, VF);
-      i += VF;
+      i += VF - 1;
       Changed = true;
     }
   }
@@ -381,13 +382,15 @@ int BoUpSLP::getTreeCost_rec(ValueList &VL, unsigned Depth) {
   // Check if all of the operands are constants.
   bool AllConst = true;
   bool AllSameScalar = true;
+  bool MustScalarizeFlag = false;
   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
     AllConst &= isa<Constant>(VL[i]);
     AllSameScalar &= (VL[0] == VL[i]);
     // Must have a single use.
     Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // This instruction is outside the basic block or if it is a known hazard.
-    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
+    MustScalarizeFlag |= MustScalarize.count(VL[i]);
+    // This instruction is outside the basic block.
+    if (I && I->getParent() != BB)
       return getScalarizationCost(VecTy);
   }
 
@@ -395,11 +398,23 @@ int BoUpSLP::getTreeCost_rec(ValueList &VL, unsigned Depth) {
   if (AllConst) return 0;
 
   // If all of the operands are identical we can broadcast them.
-  if (AllSameScalar)
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (AllSameScalar) {
+    // If we are in a loop, and this is not an instruction (e.g. constant or
+    // argument) or the instruction is defined outside the loop then assume
+    // that the cost is zero.
+    if (L && (!VL0 || !L->contains(VL0)))
+      return 0;
+
+    // We need to broadcast the scalar.
     return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+  }
+
+  // If this is not a constant, or a scalar from outside the loop then we
+  // need to scalarize it.
+  if (MustScalarizeFlag)
+    return getScalarizationCost(VecTy);
 
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
   if (!VL0) return getScalarizationCost(VecTy);
   assert(VL0->getParent() == BB && "Wrong BB");
 
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
index 3f6866407a..c756bd3968 100644
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -27,6 +27,7 @@ class BasicBlock; class Instruction; class Type;
 class VectorType; class StoreInst; class Value;
 class ScalarEvolution; class DataLayout;
 class TargetTransformInfo; class AliasAnalysis;
+class Loop;
 
 /// Bottom Up SLP vectorization utility class.
 struct BoUpSLP  {
@@ -37,7 +38,7 @@ struct BoUpSLP  {
 
   // \brief C'tor.
   BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
-         TargetTransformInfo *Tti, AliasAnalysis *Aa);
+         TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
 
   /// \brief Take the pointer operand from the Load/Store instruction.
   /// \returns NULL if this is not a valid Load/Store instruction.
@@ -112,7 +113,7 @@ private:
 
   /// \returns a vector from a collection of scalars in \p VL.
   Value *Scalarize(ValueList &VL, VectorType *Ty);
-  
+
 private:
   /// Maps instructions to numbers and back.
   SmallDenseMap<Value*, int> InstrIdx;
@@ -155,6 +156,7 @@ private:
   DataLayout *DL;
   TargetTransformInfo *TTI;
   AliasAnalysis *AA;
+  Loop *L;
 };
 
 } // end of namespace
diff --git a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
new file mode 100644
index 0000000000..329a349af8
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @foo
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define i32 @foo(i32* nocapture %A, i32 %n) #0 {
+entry:
+  %cmp62 = icmp sgt i32 %n, 0
+  br i1 %cmp62, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4, !tbaa !0
+  %add1 = add nsw i32 %0, %n
+  store i32 %add1, i32* %arrayidx, align 4, !tbaa !0
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 %1
+  %2 = load i32* %arrayidx4, align 4, !tbaa !0
+  %add5 = add nsw i32 %2, %n
+  store i32 %add5, i32* %arrayidx4, align 4, !tbaa !0
+  %3 = or i64 %indvars.iv, 2
+  %arrayidx8 = getelementptr inbounds i32* %A, i64 %3
+  %4 = load i32* %arrayidx8, align 4, !tbaa !0
+  %add9 = add nsw i32 %4, %n
+  store i32 %add9, i32* %arrayidx8, align 4, !tbaa !0
+  %5 = or i64 %indvars.iv, 3
+  %arrayidx12 = getelementptr inbounds i32* %A, i64 %5
+  %6 = load i32* %arrayidx12, align 4, !tbaa !0
+  %add13 = add nsw i32 %6, %n
+  store i32 %add13, i32* %arrayidx12, align 4, !tbaa !0
+  %7 = or i64 %indvars.iv, 4
+  %arrayidx16 = getelementptr inbounds i32* %A, i64 %7
+  %8 = load i32* %arrayidx16, align 4, !tbaa !0
+  %add17 = add nsw i32 %8, %n
+  store i32 %add17, i32* %arrayidx16, align 4, !tbaa !0
+  %9 = or i64 %indvars.iv, 5
+  %arrayidx20 = getelementptr inbounds i32* %A, i64 %9
+  %10 = load i32* %arrayidx20, align 4, !tbaa !0
+  %add21 = add nsw i32 %10, %n
+  store i32 %add21, i32* %arrayidx20, align 4, !tbaa !0
+  %11 = or i64 %indvars.iv, 6
+  %arrayidx24 = getelementptr inbounds i32* %A, i64 %11
+  %12 = load i32* %arrayidx24, align 4, !tbaa !0
+  %add25 = add nsw i32 %12, %n
+  store i32 %add25, i32* %arrayidx24, align 4, !tbaa !0
+  %13 = or i64 %indvars.iv, 7
+  %arrayidx28 = getelementptr inbounds i32* %A, i64 %13
+  %14 = load i32* %arrayidx28, align 4, !tbaa !0
+  %add29 = add nsw i32 %14, %n
+  store i32 %add29, i32* %arrayidx28, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 8
+  %15 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %15, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 undef
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-- 
cgit v1.2.3-70-g09d2


From d7e8cce287616c1cc4dcbab6a43328b01fbe7be4 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Apr 2013 06:40:28 +0000
Subject: Fix an unused variable warning.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179931 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 024dd337ac..40875c305b 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -90,6 +90,7 @@ struct SLPVectorizer : public FunctionPass {
 
       // Vectorize trees that end at stores.
       if (unsigned count = collectStores(BB, R)) {
+        (void)count;
         DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n");
         BBChanged |= vectorizeStoreChains(R);
       }
-- 
cgit v1.2.3-70-g09d2


From 444e33e8987110c6669bc2d9b8efd768bb17faa1 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Apr 2013 07:22:58 +0000
Subject: refactor tryToVectorizePair to a new method that supports
 vectorization of lists.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179932 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 40875c305b..6f4d69e342 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -127,6 +127,9 @@ private:
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R);
 
+  /// \brief Try to vectorize a list of operands.
+  bool tryToVectorizeList(BoUpSLP::ValueList &VL, BoUpSLP &R);
+
   /// \brief Try to vectorize a chain that may start at the operands of \V;
   bool tryToVectorize(BinaryOperator *V,  BoUpSLP &R);
 
@@ -174,6 +177,11 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
   BoUpSLP::ValueList VL;
   VL.push_back(A);
   VL.push_back(B);
+  return tryToVectorizeList(VL, R);
+}
+
+bool SLPVectorizer::tryToVectorizeList(BoUpSLP::ValueList &VL,  BoUpSLP &R) {
+  DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n");
   int Cost = R.getTreeCost(VL);
   int ExtrCost = R.getScalarizationCost(VL);
   DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
-- 
cgit v1.2.3-70-g09d2


From cd949714ebfab4b6bc54d964a337c23370aab5b2 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Apr 2013 07:29:34 +0000
Subject: SLPVectorizer: Reduce the compile time by eliminating the search for
 some of the more expensive patterns. After this change we will only check basic
 arithmetic trees that start at cmpinstr.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179933 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6f4d69e342..3a0e9abe57 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -261,7 +261,7 @@ bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
       }
       for (int i = 0; i < 2; ++i)
         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
-          Changed |= tryToVectorize(BI, R);
+          Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
       continue;
     }
   }
-- 
cgit v1.2.3-70-g09d2


From 6fe5cc49d88c9dd48a1eefe4c1bdba1567b8eef2 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Sat, 20 Apr 2013 09:49:10 +0000
Subject: SLPVectorizer: Strength reduce SmallVectors to ArrayRefs.

Avoids a couple of copies and allows more flexibility in the clients.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179935 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp |  8 +++-----
 lib/Transforms/Vectorize/VecUtils.cpp      | 24 ++++++++++++------------
 lib/Transforms/Vectorize/VecUtils.h        | 26 +++++++++++++-------------
 3 files changed, 28 insertions(+), 30 deletions(-)

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3a0e9abe57..207d607644 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -128,7 +128,7 @@ private:
   bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R);
 
   /// \brief Try to vectorize a list of operands.
-  bool tryToVectorizeList(BoUpSLP::ValueList &VL, BoUpSLP &R);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
 
   /// \brief Try to vectorize a chain that may start at the operands of \V;
   bool tryToVectorize(BinaryOperator *V,  BoUpSLP &R);
@@ -174,13 +174,11 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
 
 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
   if (!A || !B) return false;
-  BoUpSLP::ValueList VL;
-  VL.push_back(A);
-  VL.push_back(B);
+  Value *VL[] = { A, B };
   return tryToVectorizeList(VL, R);
 }
 
-bool SLPVectorizer::tryToVectorizeList(BoUpSLP::ValueList &VL,  BoUpSLP &R) {
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
   DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n");
   int Cost = R.getTreeCost(VL);
   int ExtrCost = R.getScalarizationCost(VL);
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index 244af394b1..ad9ccf2bbc 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -103,7 +103,7 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
   return ((-Offset) == Sz);
 }
 
-bool BoUpSLP::vectorizeStoreChain(ValueList &Chain, int CostThreshold) {
+bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
   Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
   unsigned Sz = DL->getTypeSizeInBits(StoreTy);
   unsigned VF = MinVecRegSize / Sz;
@@ -115,7 +115,7 @@ bool BoUpSLP::vectorizeStoreChain(ValueList &Chain, int CostThreshold) {
   for (unsigned i = 0, e = Chain.size(); i < e; ++i) {
     if (i + VF > e) return Changed;
     DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
-    ValueList Operands(&Chain[i], &Chain[i] + VF);
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
 
     int Cost = getTreeCost(Operands);
     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
@@ -130,7 +130,7 @@ bool BoUpSLP::vectorizeStoreChain(ValueList &Chain, int CostThreshold) {
   return Changed;
 }
 
-bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
+bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
   ValueSet Heads, Tails;
   SmallDenseMap<Value*, Value*> ConsecutiveChain;
 
@@ -178,7 +178,7 @@ bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
   return Changed;
 }
 
-int BoUpSLP::getScalarizationCost(ValueList &VL) {
+int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
   // Find the type of the operands in VL.
   Type *ScalarTy = VL[0]->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -223,7 +223,7 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
   return 0;
 }
 
-void BoUpSLP::vectorizeArith(ValueList &Operands) {
+void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) {
   Value *Vec = vectorizeTree(Operands, Operands.size());
   BasicBlock::iterator Loc = cast<Instruction>(Vec);
   IRBuilder<> Builder(++Loc);
@@ -236,7 +236,7 @@ void BoUpSLP::vectorizeArith(ValueList &Operands) {
   }
 }
 
-int BoUpSLP::getTreeCost(ValueList &VL) {
+int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
   // Get rid of the list of stores that were removed, and from the
   // lists of instructions with multiple users.
   MemBarrierIgnoreList.clear();
@@ -278,7 +278,7 @@ int BoUpSLP::getTreeCost(ValueList &VL) {
   return getTreeCost_rec(VL, 0);
 }
 
-void BoUpSLP::getTreeUses_rec(ValueList &VL, unsigned Depth) {
+void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
   if (Depth == RecursionMaxDepth) return;
 
   // Don't handle vectors.
@@ -367,7 +367,7 @@ void BoUpSLP::getTreeUses_rec(ValueList &VL, unsigned Depth) {
   }
 }
 
-int BoUpSLP::getTreeCost_rec(ValueList &VL, unsigned Depth) {
+int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
   Type *ScalarTy = VL[0]->getType();
 
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -516,14 +516,14 @@ int BoUpSLP::getTreeCost_rec(ValueList &VL, unsigned Depth) {
   }
 }
 
-Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) {
+Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) {
   int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
   for (unsigned i = 0; i < VF; ++i )
     MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
   return InstrVec[MaxIdx + 1];
 }
 
-Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
+Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
   IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
   Value *Vec = UndefValue::get(Ty);
   for (unsigned i=0; i < Ty->getNumElements(); ++i) {
@@ -538,7 +538,7 @@ Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
   return Vec;
 }
 
-Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
   Value *V = vectorizeTree_rec(VL, VF);
   // We moved some instructions around. We have to number them again
   // before we can do any analysis.
@@ -547,7 +547,7 @@ Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
   return V;
 }
 
-Value *BoUpSLP::vectorizeTree_rec(ValueList &VL, int VF) {
+Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
   Type *ScalarTy = VL[0]->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
     ScalarTy = SI->getValueOperand()->getType();
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
index c756bd3968..5456c6c779 100644
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -53,24 +53,24 @@ struct BoUpSLP  {
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
   /// \returns the vectorized value.
-  Value *vectorizeTree(ValueList &VL, int VF);
+  Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  int getTreeCost(ValueList &VL);
+  int getTreeCost(ArrayRef<Value *> VL);
 
-  /// \returns the scalarization cost for this ValueList. Assuming that this
-  /// subtree gets vectorized, we may need to extract the values from the
+  /// \returns the scalarization cost for this list of values. Assuming that
+  /// this subtree gets vectorized, we may need to extract the values from the
   /// roots. This method calculates the cost of extracting the values.
-  int getScalarizationCost(ValueList &VL);
+  int getScalarizationCost(ArrayRef<Value *> VL);
 
   /// \brief Attempts to order and vectorize a sequence of stores. This
   /// function does a quadratic scan of the given stores.
   /// \returns true if the basic block was modified.
-  bool vectorizeStores(StoreList &Stores, int costThreshold);
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
 
   /// \brief Vectorize a group of scalars into a vector tree.
-  void vectorizeArith(ValueList &Operands);
+  void vectorizeArith(ArrayRef<Value *> Operands);
 
   /// \returns the list of new instructions that were added in order to collect
   /// scalars into vectors. This list can be used to further optimize the gather
@@ -79,21 +79,21 @@ struct BoUpSLP  {
 
 private:
   /// \brief This method contains the recursive part of getTreeCost.
-  int getTreeCost_rec(ValueList &VL, unsigned Depth);
+  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
 
   /// \brief This recursive method looks for vectorization hazards such as
   /// values that are used by multiple users and checks that values are used
   /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
-  void getTreeUses_rec(ValueList &VL, unsigned Depth);
+  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
 
   /// \brief This method contains the recursive part of vectorizeTree.
-  Value *vectorizeTree_rec(ValueList &VL, int VF);
+  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
 
   /// \brief Number all of the instructions in the block.
   void numberInstructions();
 
   ///  \brief Vectorize a sorted sequence of stores.
-  bool vectorizeStoreChain(ValueList &Chain, int CostThreshold);
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
 
   /// \returns the scalarization cost for this type. Scalarization in this
   /// context means the creation of vectors from a group of scalars.
@@ -109,10 +109,10 @@ private:
 
   /// \returns the instruction that appears last in the BB from \p VL.
   /// Only consider the first \p VF elements.
-  Instruction *GetLastInstr(ValueList &VL, unsigned VF);
+  Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF);
 
   /// \returns a vector from a collection of scalars in \p VL.
-  Value *Scalarize(ValueList &VL, VectorType *Ty);
+  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
 
 private:
   /// Maps instructions to numbers and back.
-- 
cgit v1.2.3-70-g09d2


From 4f38e16b89895b795ece58742195d0d95cbd4187 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Apr 2013 22:29:43 +0000
Subject: Fix PR15800. Do not try to vectorize vectors and structs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179960 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp  | 11 ++++++++++-
 test/Transforms/SLPVectorizer/X86/vector.ll | 14 ++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/vector.ll

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 207d607644..9a4784f420 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -157,7 +157,8 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
       continue;
 
     // Check that the pointer points to scalars.
-    if (SI->getValueOperand()->getType()->isAggregateType())
+    Type *Ty = SI->getValueOperand()->getType();
+    if (Ty->isAggregateType() || Ty->isVectorTy())
       return 0;
 
     // Find the base of the GEP.
@@ -180,6 +181,14 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
 
 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
   DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+
+  // Check that all of the parts are scalar.
+  for (int i = 0, e = VL.size(); i < e; ++i) {
+    Type *Ty = VL[i]->getType();
+    if (Ty->isAggregateType() || Ty->isVectorTy())
+      return 0;
+  }
+
   int Cost = R.getTreeCost(VL);
   int ExtrCost = R.getScalarizationCost(VL);
   DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
diff --git a/test/Transforms/SLPVectorizer/X86/vector.ll b/test/Transforms/SLPVectorizer/X86/vector.ll
new file mode 100644
index 0000000000..02a18979c6
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/vector.ll
@@ -0,0 +1,14 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure that we are not crashing or changing the code.
+;CHECK: test
+;CHECK: icmp
+;CHECK: ret
+define void @test(<4 x i32> %in, <4 x i32> %in2) {
+  %k = icmp eq <4 x i32> %in, %in2
+  ret void
+}
+
-- 
cgit v1.2.3-70-g09d2


From 7fac0ef71cfaeafd91b9520b553d00d91f83a442 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Apr 2013 21:04:51 +0000
Subject: Fix a typo

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180806 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9a4784f420..cc30cc9278 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -221,7 +221,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V,  BoUpSLP &R) {
     }
   }
 
-  // Try to slip A.
+  // Try to skip A.
   if (A && A->hasOneUse()) {
     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
-- 
cgit v1.2.3-70-g09d2