diff options
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 43 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/reduction.ll | 40 |
2 files changed, 71 insertions, 12 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index d143f919ce..e3c76bb15d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -817,34 +817,53 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { NewPhi->addIncoming(VectorStart, LoopBypassBlock); NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); - // Extract the first scalar. - Value *Scalar0 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); - // Extract and reduce the remaining vector elements. - for (unsigned i=1; i < VF; ++i) { - Value *Scalar1 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = NewPhi; + SmallVector<Constant*, 32> ShuffleMask(VF, 0); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i/2; ++j) + ShuffleMask[j] = Builder.getInt32(i/2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i/2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = + Builder.CreateShuffleVector(TmpVec, + UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), + "rdx.shuf"); + + // Emit the operation on the shuffled value. switch (RdxDesc.Kind) { case LoopVectorizationLegality::IntegerAdd: - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx"); + TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx"); break; case LoopVectorizationLegality::IntegerMult: - Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx"); + TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx"); break; case LoopVectorizationLegality::IntegerOr: - Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx"); + TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx"); break; case LoopVectorizationLegality::IntegerAnd: - Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx"); + TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx"); break; case LoopVectorizationLegality::IntegerXor: - Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx"); + TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx"); break; default: llvm_unreachable("Unknown reduction operation"); } } + // The result is in the first element of the vector. + Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll index c1848b35fc..bc1c0290de 100644 --- a/test/Transforms/LoopVectorize/reduction.ll +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -7,6 +7,11 @@ target triple = "x86_64-apple-macosx10.8.0" ;CHECK: phi <4 x i32> ;CHECK: load <4 x i32> ;CHECK: add <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 ;CHECK: ret i32 define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 @@ -37,6 +42,11 @@ define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt ;CHECK: phi <4 x i32> ;CHECK: load <4 x i32> ;CHECK: mul <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: mul <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: mul <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 ;CHECK: ret i32 define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 @@ -67,6 +77,11 @@ define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocap ;CHECK: phi <4 x i32> ;CHECK: load <4 x i32> ;CHECK: mul nsw <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 ;CHECK: ret i32 define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 @@ -95,6 +110,11 @@ define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt ;CHECK: @reduction_mul ;CHECK: mul <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: mul <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: mul <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 ;CHECK: ret i32 define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 @@ -124,6 +144,11 @@ define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt ;CHECK: @start_at_non_zero ;CHECK: phi <4 x i32> ;CHECK: <i32 120, i32 0, i32 0, i32 0> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: add <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 ;CHECK: ret i32 define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp { entry: @@ -152,6 +177,11 @@ for.end: ; preds = %for.body, %entry ;CHECK: @reduction_and ;CHECK: and <4 x i32> ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: and <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: and <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 ;CHECK: ret i32 define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { entry: @@ -179,6 +209,11 @@ for.end: ; preds = %for.body, %entry ;CHECK: @reduction_or ;CHECK: or <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: or <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: or <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 ;CHECK: ret i32 define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { entry: @@ -206,6 +241,11 @@ for.end: ; preds = %for.body, %entry ;CHECK: @reduction_xor ;CHECK: xor <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +;CHECK: xor <4 x i32> +;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +;CHECK: xor <4 x i32> +;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 ;CHECK: ret i32 define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { entry: |