diff options
-rw-r--r-- | include/llvm/Analysis/TargetTransformInfo.h | 5 | ||||
-rw-r--r-- | lib/Analysis/TargetTransformInfo.cpp | 8 | ||||
-rw-r--r-- | lib/CodeGen/BasicTargetTransformInfo.cpp | 5 | ||||
-rw-r--r-- | lib/Target/ARM/ARMTargetTransformInfo.cpp | 25 | ||||
-rw-r--r-- | lib/Target/X86/X86TargetTransformInfo.cpp | 15 | ||||
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 5 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/ARM/lit.local.cfg | 6 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/ARM/sanity.ll | 25 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/gcc-examples.ll | 2 |
9 files changed, 89 insertions, 7 deletions
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index ddf615fe3b..1679c4f2b3 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -148,6 +148,11 @@ public: /// set to false, it returns the number of scalar registers. virtual unsigned getNumberOfRegisters(bool Vector) const; + /// \return The maximum unroll factor that the vectorizer should try to + /// perform for this target. This number depends on the level of parallelism + /// and the number of execution units in the CPU. + virtual unsigned getMaximumUnrollFactor() const; + /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 63f495a430..02af2d34c5 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -92,6 +92,10 @@ unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { return PrevTTI->getNumberOfRegisters(Vector); } +unsigned TargetTransformInfo::getMaximumUnrollFactor() const { + return PrevTTI->getMaximumUnrollFactor(); +} + unsigned TargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { return PrevTTI->getArithmeticInstrCost(Opcode, Ty); @@ -216,6 +220,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo { return 8; } + unsigned getMaximumUnrollFactor() const { + return 1; + } + unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { return 1; } diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index c27e081a5e..2f3ac9a901 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -83,6 +83,7 @@ public: /// @{ virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getMaximumUnrollFactor() const; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const; @@ -182,6 +183,10 @@ unsigned BasicTTI::getNumberOfRegisters(bool Vector) const { return 1; } +unsigned BasicTTI::getMaximumUnrollFactor() const { + return 1; +} + unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { // Check if any of the operands are vector operands. int ISD = TLI->InstructionOpcodeToISD(Opcode); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 03a23be0a6..634004acb4 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -77,6 +77,31 @@ public: virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const; /// @} + + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector) const { + if (Vector) { + if (ST->hasNEON()) + return 16; + return 0; + } + + if (ST->isThumb1Only()) + return 8; + return 16; + } + + unsigned getMaximumUnrollFactor() const { + // These are out of order CPUs: + if (ST->isCortexA15() || ST->isSwift()) + return 2; + return 1; + } + + /// @} }; } // end anonymous namespace diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 9cc1b180e9..6ab08cbd12 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -75,7 +75,6 @@ public: /// \name Scalar TTI Implementations /// @{ - virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; /// @} @@ -84,6 +83,7 @@ public: /// @{ virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getMaximumUnrollFactor() const; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const; @@ -156,7 +156,6 @@ FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len, return -1; } - X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); // TODO: Currently the __builtin_popcount() implementation using SSE3 @@ -171,6 +170,18 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const { return 8; } +unsigned X86TTI::getMaximumUnrollFactor() const { + if (ST->isAtom()) + return 1; + + // Sandybridge and Haswell have multiple execution ports and pipelined + // vector units. + if (ST->hasAVX()) + return 4; + + return 2; +} + unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9c82cb8dca..c29f416be7 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -116,9 +116,6 @@ static const unsigned RuntimeMemoryCheckThreshold = 4; /// This is the highest vector width that we try to generate. static const unsigned MaxVectorSize = 8; -/// This is the highest Unroll Factor. -static const unsigned MaxUnrollSize = 4; - namespace { // Forward declarations. @@ -2715,6 +2712,8 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions)); // Clamp the unroll factor ranges to reasonable factors. + unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor(); + if (UF > MaxUnrollSize) UF = MaxUnrollSize; else if (UF < 1) diff --git a/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/test/Transforms/LoopVectorize/ARM/lit.local.cfg new file mode 100644 index 0000000000..cb77b09ef4 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'ARM' in targets: + config.unsupported = True + diff --git a/test/Transforms/LoopVectorize/ARM/sanity.ll b/test/Transforms/LoopVectorize/ARM/sanity.ll new file mode 100644 index 0000000000..11c28a8c0b --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/sanity.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios3.0.0" + +; Make sure that we are not crashing on ARM. + +define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] + %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i32 %i.02 + %3 = load i32* %2, align 4 + %4 = add nsw i32 %3, %sum.01 + %5 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %5, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] + ret i32 %sum.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll index 0f21ba678c..57d6301581 100644 --- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll @@ -53,8 +53,6 @@ define void @example1() nounwind uwtable ssp { ;UNROLL: @example10b ;UNROLL: load <4 x i16> ;UNROLL: load <4 x i16> -;UNROLL: load <4 x i16> -;UNROLL: store <4 x i32> ;UNROLL: store <4 x i32> ;UNROLL: store <4 x i32> ;UNROLL: ret void |