diff options
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 43 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/same-base-access.ll | 110 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/scalar-store.ll | 48 |
3 files changed, 134 insertions, 67 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 827c13ffc9..4a90d78c24 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1593,8 +1593,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { ValueVector::iterator I, IE; for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { - StoreInst *ST = dyn_cast<StoreInst>(*I); - assert(ST && "Bad StoreInst"); + StoreInst *ST = cast<StoreInst>(*I); Value* Ptr = ST->getPointerOperand(); if (isUniform(Ptr)) { @@ -1609,8 +1608,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { - LoadInst *LD = dyn_cast<LoadInst>(*I); - assert(LD && "Bad LoadInst"); + LoadInst *LD = cast<LoadInst>(*I); Value* Ptr = LD->getPointerOperand(); // If we did *not* see this pointer before, insert it to the // read list. If we *did* see it before, then it is already in @@ -1633,13 +1631,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. - bool RT = true; + bool CanDoRT = true; for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) if (hasComputableBounds(*I)) { PtrRtCheck.insert(SE, TheLoop, *I); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); } else { - RT = false; + CanDoRT = false; break; } for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) @@ -1647,23 +1645,23 @@ bool LoopVectorizationLegality::canVectorizeMemory() { PtrRtCheck.insert(SE, TheLoop, *I); DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); } else { - RT = false; + CanDoRT = false; break; } // Check that we did not collect too many pointers or found a // unsizeable pointer. - if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); - RT = false; + CanDoRT = false; } - PtrRtCheck.Need = RT; - - if (RT) { + if (CanDoRT) { DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); } + bool NeedRTCheck = false; + // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. @@ -1678,12 +1676,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() { it != e; ++it) { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); - return RT; + NeedRTCheck = true; } if (!WriteObjects.insert(*it)) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **it <<"\n"); - return RT; + return false; } } TempObjects.clear(); @@ -1696,20 +1694,27 @@ bool LoopVectorizationLegality::canVectorizeMemory() { it != e; ++it) { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); - return RT; + NeedRTCheck = true; } if (WriteObjects.count(*it)) { DEBUG(dbgs() << "LV: Found a possible read/write reorder:" << **it <<"\n"); - return RT; + return false; } } TempObjects.clear(); } - // It is safe to vectorize and we don't need any runtime checks. - DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); - PtrRtCheck.reset(); + PtrRtCheck.Need = NeedRTCheck; + if (NeedRTCheck && !CanDoRT) { + DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << + "the array bounds.\n"); + PtrRtCheck.reset(); + return false; + } + + DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") << + " need a runtime memory check.\n"); return true; } diff --git a/test/Transforms/LoopVectorize/same-base-access.ll b/test/Transforms/LoopVectorize/same-base-access.ll new file mode 100644 index 0000000000..f9ef32e492 --- /dev/null +++ b/test/Transforms/LoopVectorize/same-base-access.ll @@ -0,0 +1,110 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S -enable-if-conversion | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +; This is kernel11 from "LivermoreLoops". We can't vectorize it because we +; access both x[k] and x[k-1]. +; +; void kernel11(double *x, double *y, int n) { +; for ( int k=1 ; k<n ; k++ ) +; x[k] = x[k-1] + y[k]; +; } + +; CHECK: @kernel11 +; CHECK-NOT: <4 x double> +; CHECK: ret +define i32 @kernel11(double* %x, double* %y, i32 %n) nounwind uwtable ssp { + %1 = alloca double*, align 8 + %2 = alloca double*, align 8 + %3 = alloca i32, align 4 + %k = alloca i32, align 4 + store double* %x, double** %1, align 8 + store double* %y, double** %2, align 8 + store i32 %n, i32* %3, align 4 + store i32 1, i32* %k, align 4 + br label %4 + +; <label>:4 ; preds = %25, %0 + %5 = load i32* %k, align 4 + %6 = load i32* %3, align 4 + %7 = icmp slt i32 %5, %6 + br i1 %7, label %8, label %28 + +; <label>:8 ; preds = %4 + %9 = load i32* %k, align 4 + %10 = sub nsw i32 %9, 1 + %11 = sext i32 %10 to i64 + %12 = load double** %1, align 8 + %13 = getelementptr inbounds double* %12, i64 %11 + %14 = load double* %13, align 8 + %15 = load i32* %k, align 4 + %16 = sext i32 %15 to i64 + %17 = load double** %2, align 8 + %18 = getelementptr inbounds double* %17, i64 %16 + %19 = load double* %18, align 8 + %20 = fadd double %14, %19 + %21 = load i32* %k, align 4 + %22 = sext i32 %21 to i64 + %23 = load double** %1, align 8 + %24 = getelementptr inbounds double* %23, i64 %22 + store double %20, double* %24, align 8 + br label %25 + +; <label>:25 ; preds = %8 + %26 = load i32* %k, align 4 + %27 = add nsw i32 %26, 1 + store i32 %27, i32* %k, align 4 + br label %4 + +; <label>:28 ; preds = %4 + ret i32 0 +} + + + +; We don't vectorize this function because A[i*7] is scalarized, and the +; different scalars can in theory wrap around and overwrite other scalar +; elements. At the moment we only allow read/write access to arrays +; that are consecutive. +; +; void foo(int *a) { +; for (int i=0; i<256; ++i) { +; int x = a[i*7]; +; if (x>3) +; x = x*x+x*4; +; a[i*7] = x+3; +; } +; } + +; CHECK: @func2 +; CHECK-NOT: <4 x i32> +; CHECK: ret +define i32 @func2(i32* nocapture %a) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %7, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %7 ] + %2 = mul nsw i64 %indvars.iv, 7 + %3 = getelementptr inbounds i32* %a, i64 %2 + %4 = load i32* %3, align 4 + %5 = icmp sgt i32 %4, 3 + br i1 %5, label %6, label %7 + +; <label>:6 ; preds = %1 + %tmp = add i32 %4, 4 + %tmp1 = mul i32 %tmp, %4 + br label %7 + +; <label>:7 ; preds = %6, %1 + %x.0 = phi i32 [ %tmp1, %6 ], [ %4, %1 ] + %8 = add nsw i32 %x.0, 3 + store i32 %8, i32* %3, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %9, label %1 + +; <label>:9 ; preds = %7 + ret i32 0 +} diff --git a/test/Transforms/LoopVectorize/scalar-store.ll b/test/Transforms/LoopVectorize/scalar-store.ll deleted file mode 100644 index 5d207f9445..0000000000 --- a/test/Transforms/LoopVectorize/scalar-store.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S -enable-if-conversion | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.9.0" - -; When we scalarize a store, make sure that the addresses are extracted -; from a vector. We had a bug where the addresses were the old scalar -; addresses. - -; CHECK: @foo -; CHECK: select -; CHECK: extractelement -; CHECK-NEXT: store -; CHECK: extractelement -; CHECK-NEXT: store -; CHECK: extractelement -; CHECK-NEXT: store -; CHECK: extractelement -; CHECK-NEXT: store -; CHECK: ret -define i32 @foo(i32* nocapture %a) nounwind uwtable ssp { - br label %1 - -; <label>:1 ; preds = %7, %0 - %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %7 ] - %2 = mul nsw i64 %indvars.iv, 7 - %3 = getelementptr inbounds i32* %a, i64 %2 - %4 = load i32* %3, align 4 - %5 = icmp sgt i32 %4, 3 - br i1 %5, label %6, label %7 - -; <label>:6 ; preds = %1 - %tmp = add i32 %4, 4 - %tmp1 = mul i32 %tmp, %4 - br label %7 - -; <label>:7 ; preds = %6, %1 - %x.0 = phi i32 [ %tmp1, %6 ], [ %4, %1 ] - %8 = add nsw i32 %x.0, 3 - store i32 %8, i32* %3, align 4 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 256 - br i1 %exitcond, label %9, label %1 - -; <label>:9 ; preds = %7 - ret i32 0 -} |