author    Evan Cheng <evan.cheng@apple.com>  2009-02-21 02:06:47 +0000
committer Evan Cheng <evan.cheng@apple.com>  2009-02-21 02:06:47 +0000
commit    d9fb7124035d6f95fb08c5157ab1813fbb2a282f (patch)
tree      4eea0bdefddeeb15360a0c1b03206707d7f3614a
parent    5a45d76c25d859c526fcdcdec3f07c97fec07d6b (diff)
Teach LSR to sink the immediate portion of the common expression back into uses when it fits in the addressing modes of all the uses.
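
As a rough illustration of the effect (the function, array, and constant below are made up for this note, not taken from the patch), consider a loop whose only IV uses are memory accesses at a constant offset from a common base:

    // Hypothetical C++ sketch; A, sum_tail, and the offset 64 are illustrative only.
    int A[1024];
    int sum_tail(int n) {
      int s = 0;
      for (int i = 0; i < n; ++i)
        s += A[i + 64];   // every use of the IV is an address with a constant offset
      return s;
    }

Previously the constant part of the common expression (the 64-element offset here) stayed in the hoisted base and was typically materialized by an extra lea in the preheader. With this change, when every use is an address and the immediate fits the target's addressing mode (e.g. disp(base,index,scale) on x86, checked via AddressingModeMatcher::Match), it is sunk back into each use's address operand instead.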
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@65215 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp    99
-rw-r--r--  test/CodeGen/X86/2007-10-05-3AddrConvert.ll      52
-rw-r--r--  test/CodeGen/X86/loop-strength-reduce-2.ll        2
-rw-r--r--  test/CodeGen/X86/loop-strength-reduce8.ll        78
-rw-r--r--  test/CodeGen/X86/stride-nine-with-base-reg.ll     8
5 files changed, 199 insertions(+), 40 deletions(-)
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index d18a008fef..2099ceace5 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -26,19 +26,19 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Target/TargetData.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
-#include <set>
using namespace llvm;
STATISTIC(NumReduced , "Number of GEPs strength reduced");
@@ -46,6 +46,7 @@ STATISTIC(NumInserted, "Number of PHIs inserted");
STATISTIC(NumVariable, "Number of PHIs with variable strides");
STATISTIC(NumEliminated, "Number of strides eliminated");
STATISTIC(NumShadow, "Number of Shadow IVs optimized");
+STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses");
static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
cl::init(false),
@@ -954,21 +955,17 @@ static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
/// that can fit into the immediate field of instructions in the target.
/// Accumulate these immediate values into the Imm value.
static void MoveImmediateValues(const TargetLowering *TLI,
- Instruction *User,
+ const Type *UseTy,
SCEVHandle &Val, SCEVHandle &Imm,
bool isAddress, Loop *L,
ScalarEvolution *SE) {
- const Type *UseTy = User->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(User))
- UseTy = SI->getOperand(0)->getType();
-
if (SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
std::vector<SCEVHandle> NewOps;
NewOps.reserve(SAE->getNumOperands());
for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
SCEVHandle NewOp = SAE->getOperand(i);
- MoveImmediateValues(TLI, User, NewOp, Imm, isAddress, L, SE);
+ MoveImmediateValues(TLI, UseTy, NewOp, Imm, isAddress, L, SE);
if (!NewOp->isLoopInvariant(L)) {
// If this is a loop-variant expression, it must stay in the immediate
@@ -987,7 +984,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
} else if (SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
// Try to pull immediates out of the start value of nested addrec's.
SCEVHandle Start = SARE->getStart();
- MoveImmediateValues(TLI, User, Start, Imm, isAddress, L, SE);
+ MoveImmediateValues(TLI, UseTy, Start, Imm, isAddress, L, SE);
if (Start != SARE->getStart()) {
std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
@@ -1002,7 +999,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
SCEVHandle NewOp = SME->getOperand(1);
- MoveImmediateValues(TLI, User, NewOp, SubImm, isAddress, L, SE);
+ MoveImmediateValues(TLI, UseTy, NewOp, SubImm, isAddress, L, SE);
// If we extracted something out of the subexpressions, see if we can
// simplify this!
@@ -1034,6 +1031,16 @@ static void MoveImmediateValues(const TargetLowering *TLI,
// Otherwise, no immediates to move.
}
+static void MoveImmediateValues(const TargetLowering *TLI,
+ Instruction *User,
+ SCEVHandle &Val, SCEVHandle &Imm,
+ bool isAddress, Loop *L,
+ ScalarEvolution *SE) {
+ const Type *UseTy = User->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(User))
+ UseTy = SI->getOperand(0)->getType();
+ MoveImmediateValues(TLI, UseTy, Val, Imm, isAddress, L, SE);
+}
/// SeparateSubExprs - Decompose Expr into all of the subexpressions that are
/// added together. This is used to reassociate common addition subexprs
@@ -1450,6 +1457,9 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
UsersToProcess[i].Base =
SE->getIntegerSCEV(0, UsersToProcess[i].Base->getType());
} else {
+ // Not all uses are outside the loop.
+ AllUsesAreOutsideLoop = false;
+
// Addressing modes can be folded into loads and stores. Be careful that
// the store is through the expression, not of the expression though.
bool isPHI = false;
@@ -1460,9 +1470,6 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
++NumPHI;
}
- // Not all uses are outside the loop.
- AllUsesAreOutsideLoop = false;
-
if (isAddress)
HasAddress = true;
@@ -1475,12 +1482,12 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
}
}
- // If one of the use if a PHI node and all other uses are addresses, still
+ // If one of the uses is a PHI node and all other uses are addresses, still
// allow iv reuse. Essentially we are trading one constant multiplication
// for one fewer iv.
if (NumPHI > 1)
AllUsesAreAddresses = false;
-
+
// There are no in-loop address uses.
if (AllUsesAreAddresses && (!HasAddress && !AllUsesAreOutsideLoop))
AllUsesAreAddresses = false;
@@ -1754,6 +1761,28 @@ LoopStrengthReduce::PrepareToStrengthReduceFromSmallerStride(
"commonbase", PreInsertPt);
}
+static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset,
+ const Type *ReplacedTy,
+ std::vector<BasedUser> &UsersToProcess,
+ const TargetLowering *TLI) {
+ SmallVector<Instruction*, 16> AddrModeInsts;
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+ if (UsersToProcess[i].isUseOfPostIncrementedValue)
+ continue;
+ ExtAddrMode AddrMode =
+ AddressingModeMatcher::Match(UsersToProcess[i].OperandValToReplace,
+ ReplacedTy, UsersToProcess[i].Inst,
+ AddrModeInsts, *TLI);
+ if (GV && GV != AddrMode.BaseGV)
+ return false;
+ if (Offset && !AddrMode.BaseOffs)
+ // FIXME: How to accurately check that the immediate offset is folded.
+ return false;
+ AddrModeInsts.clear();
+ }
+ return true;
+}
+
/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
/// stride of IV. All of the users may have different starting values, and this
/// may not be the only stride (we know it is if isOnlyStride is true).
@@ -1797,6 +1826,41 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
const Type *ReplacedTy = CommonExprs->getType();
+ // If all uses are addresses, consider sinking the immediate part of the
+ // common expression back into uses if it can fit in the immediate fields.
+ if (HaveCommonExprs && AllUsesAreAddresses) {
+ SCEVHandle NewCommon = CommonExprs;
+ SCEVHandle Imm = SE->getIntegerSCEV(0, ReplacedTy);
+ MoveImmediateValues(TLI, ReplacedTy, NewCommon, Imm, true, L, SE);
+ if (!Imm->isZero()) {
+ bool DoSink = true;
+
+ // If the immediate part of the common expression is a GV, check if it's
+ // possible to fold it into the target addressing mode.
+ GlobalValue *GV = 0;
+ if (SCEVUnknown *SU = dyn_cast<SCEVUnknown>(Imm)) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(SU->getValue()))
+ if (CE->getOpcode() == Instruction::PtrToInt)
+ GV = dyn_cast<GlobalValue>(CE->getOperand(0));
+ }
+ int64_t Offset = 0;
+ if (SCEVConstant *SC = dyn_cast<SCEVConstant>(Imm))
+ Offset = SC->getValue()->getSExtValue();
+ if (GV || Offset)
+ DoSink = IsImmFoldedIntoAddrMode(GV, Offset, ReplacedTy,
+ UsersToProcess, TLI);
+
+ if (DoSink) {
+ DOUT << " Sinking " << *Imm << " back down into uses\n";
+ for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i)
+ UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, Imm);
+ CommonExprs = NewCommon;
+ HaveCommonExprs = !CommonExprs->isZero();
+ ++NumImmSunk;
+ }
+ }
+ }
+
// Now that we know what we need to do, insert the PHI node itself.
//
DOUT << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE "
@@ -2556,7 +2620,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
bool HasOneStride = IVUsesByStride.size() == 1;
#ifndef NDEBUG
- DOUT << "\nLSR on ";
+ DOUT << "\nLSR on \"" << L->getHeader()->getParent()->getNameStart()
+ << "\" ";
DEBUG(L->dump());
#endif
diff --git a/test/CodeGen/X86/2007-10-05-3AddrConvert.ll b/test/CodeGen/X86/2007-10-05-3AddrConvert.ll
index 2cc9124727..e9fbe797f5 100644
--- a/test/CodeGen/X86/2007-10-05-3AddrConvert.ll
+++ b/test/CodeGen/X86/2007-10-05-3AddrConvert.ll
@@ -4,29 +4,43 @@
%struct.bnode = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode*, %struct.bnode* }
%struct.node = type { i16, double, [3 x double], i32, i32 }
-define fastcc void @old_main() {
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
entry:
- %tmp44 = malloc %struct.anon ; <%struct.anon*> [#uses=2]
- store double 4.000000e+00, double* null, align 4
- br label %bb41
+ %0 = malloc %struct.anon ; <%struct.anon*> [#uses=2]
+ %1 = getelementptr %struct.anon* %0, i32 0, i32 2 ; <%struct.node**> [#uses=1]
+ br label %bb14.i
-bb41: ; preds = %uniform_testdata.exit, %entry
- %i.0110 = phi i32 [ 0, %entry ], [ %tmp48, %uniform_testdata.exit ] ; <i32> [#uses=2]
- %tmp48 = add i32 %i.0110, 1 ; <i32> [#uses=1]
- br i1 false, label %uniform_testdata.exit, label %bb33.preheader.i
+bb14.i: ; preds = %bb14.i, %entry
+ %i8.0.reg2mem.0.i = phi i32 [ 0, %entry ], [ %2, %bb14.i ] ; <i32> [#uses=1]
+ %2 = add i32 %i8.0.reg2mem.0.i, 1 ; <i32> [#uses=2]
+ %exitcond74.i = icmp eq i32 %2, 32 ; <i1> [#uses=1]
+ br i1 %exitcond74.i, label %bb32.i, label %bb14.i
-bb33.preheader.i: ; preds = %bb41
- ret void
+bb32.i: ; preds = %bb32.i, %bb14.i
+ %tmp.0.reg2mem.0.i = phi i32 [ %indvar.next63.i, %bb32.i ], [ 0, %bb14.i ] ; <i32> [#uses=1]
+ %indvar.next63.i = add i32 %tmp.0.reg2mem.0.i, 1 ; <i32> [#uses=2]
+ %exitcond64.i = icmp eq i32 %indvar.next63.i, 64 ; <i1> [#uses=1]
+ br i1 %exitcond64.i, label %bb47.loopexit.i, label %bb32.i
-uniform_testdata.exit: ; preds = %bb41
- %tmp57 = getelementptr %struct.anon* %tmp44, i32 0, i32 3, i32 %i.0110 ; <%struct.bnode**> [#uses=1]
- store %struct.bnode* null, %struct.bnode** %tmp57, align 4
- br i1 false, label %bb154, label %bb41
+bb.i.i: ; preds = %bb47.loopexit.i
+ unreachable
-bb154: ; preds = %bb154, %uniform_testdata.exit
- br i1 false, label %bb166, label %bb154
+stepsystem.exit.i: ; preds = %bb47.loopexit.i
+ store %struct.node* null, %struct.node** %1, align 4
+ br label %bb.i6.i
-bb166: ; preds = %bb154
- %tmp169 = getelementptr %struct.anon* %tmp44, i32 0, i32 3, i32 0 ; <%struct.bnode**> [#uses=0]
- ret void
+bb.i6.i: ; preds = %bb.i6.i, %stepsystem.exit.i
+ %tmp.0.i.i = add i32 0, -1 ; <i32> [#uses=1]
+ %3 = icmp slt i32 %tmp.0.i.i, 0 ; <i1> [#uses=1]
+ br i1 %3, label %bb107.i.i, label %bb.i6.i
+
+bb107.i.i: ; preds = %bb107.i.i, %bb.i6.i
+ %q_addr.0.i.i.in = phi %struct.bnode** [ null, %bb107.i.i ], [ %4, %bb.i6.i ] ; <%struct.bnode**> [#uses=1]
+ %q_addr.0.i.i = load %struct.bnode** %q_addr.0.i.i.in ; <%struct.bnode*> [#uses=0]
+ br label %bb107.i.i
+
+bb47.loopexit.i: ; preds = %bb32.i
+ %4 = getelementptr %struct.anon* %0, i32 0, i32 4, i32 0 ; <%struct.bnode**> [#uses=1]
+ %5 = icmp eq %struct.node* null, null ; <i1> [#uses=1]
+ br i1 %5, label %stepsystem.exit.i, label %bb.i.i
}
diff --git a/test/CodeGen/X86/loop-strength-reduce-2.ll b/test/CodeGen/X86/loop-strength-reduce-2.ll
index b67e618ac8..8ea5bdb208 100644
--- a/test/CodeGen/X86/loop-strength-reduce-2.ll
+++ b/test/CodeGen/X86/loop-strength-reduce-2.ll
@@ -1,8 +1,10 @@
; RUN: llvm-as < %s | llc -march=x86 -relocation-model=pic | \
; RUN: grep {, 4} | count 1
+; RUN: llvm-as < %s | llc -march=x86 | not grep lea
;
; Make sure the common loop invariant A is hoisted up to preheader,
; since too many registers are needed to subsume it into the addressing modes.
+; When not compiling PIC, it is safe to sink A into the addressing modes.
@A = global [16 x [16 x i32]] zeroinitializer, align 32 ; <[16 x [16 x i32]]*> [#uses=2]
diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll
new file mode 100644
index 0000000000..1846c7d446
--- /dev/null
+++ b/test/CodeGen/X86/loop-strength-reduce8.ll
@@ -0,0 +1,78 @@
+; RUN: llvm-as < %s | llc -mtriple=i386-apple-darwin | grep leal | not grep 16
+
+ %struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 }
+ %struct.bitmap_element = type { %struct.bitmap_element*, %struct.bitmap_element*, i32, [2 x i64] }
+ %struct.bitmap_head_def = type { %struct.bitmap_element*, %struct.bitmap_element*, i32 }
+ %struct.branch_path = type { %struct.rtx_def*, i32 }
+ %struct.c_lang_decl = type <{ i8, [3 x i8] }>
+ %struct.constant_descriptor = type { %struct.constant_descriptor*, i8*, %struct.rtx_def*, { x86_fp80 } }
+ %struct.eh_region = type { %struct.eh_region*, %struct.eh_region*, %struct.eh_region*, i32, %struct.bitmap_head_def*, i32, { { %struct.eh_region*, %struct.eh_region*, %struct.eh_region*, %struct.rtx_def* } }, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
+ %struct.eh_status = type { %struct.eh_region*, %struct.eh_region**, %struct.eh_region*, %struct.eh_region*, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, i32, i32, %struct.varray_head_tag*, %struct.varray_head_tag*, %struct.varray_head_tag*, %struct.branch_path*, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
+ %struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.tree_node*, %struct.sequence_stack*, i32, i32, i8*, i32, i8*, %struct.tree_node**, %struct.rtx_def** }
+ %struct.equiv_table = type { %struct.rtx_def*, %struct.rtx_def* }
+ %struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
+ %struct.function = type { %struct.eh_status*, %struct.stmt_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, i8*, %struct.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.CUMULATIVE_ARGS, %struct.rtx_def*, %struct.rtx_def*, i8*, %struct.initial_value_struct*, i32, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i64, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, i32, %struct.rtx_def**, %struct.temp_slot*, i32, i32, i32, %struct.var_refs_queue*, i32, i32, i8*, %struct.tree_node*, %struct.rtx_def*, i32, i32, %struct.machine_function*, i32, i32, %struct.language_function*, %struct.rtx_def*, i8, i8, i8 }
+ %struct.goto_fixup = type { %struct.goto_fixup*, %struct.rtx_def*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, i32, %struct.rtx_def*, %struct.tree_node* }
+ %struct.initial_value_struct = type { i32, i32, %struct.equiv_table* }
+ %struct.label_chain = type { %struct.label_chain*, %struct.tree_node* }
+ %struct.lang_decl = type { %struct.c_lang_decl, %struct.tree_node* }
+ %struct.language_function = type { %struct.stmt_tree_s, %struct.tree_node* }
+ %struct.machine_function = type { [59 x [3 x %struct.rtx_def*]], i32, i32 }
+ %struct.nesting = type { %struct.nesting*, %struct.nesting*, i32, %struct.rtx_def*, { { i32, %struct.rtx_def*, %struct.rtx_def*, %struct.nesting*, %struct.tree_node*, %struct.tree_node*, %struct.label_chain*, i32, i32, i32, i32, %struct.rtx_def*, %struct.tree_node** } } }
+ %struct.pool_constant = type { %struct.constant_descriptor*, %struct.pool_constant*, %struct.pool_constant*, %struct.rtx_def*, i32, i32, i32, i64, i32 }
+ %struct.rtunion = type { i64 }
+ %struct.rtx_def = type { i16, i8, i8, [1 x %struct.rtunion] }
+ %struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %struct.tree_node*, %struct.sequence_stack* }
+ %struct.stmt_status = type { %struct.nesting*, %struct.nesting*, %struct.nesting*, %struct.nesting*, %struct.nesting*, %struct.nesting*, i32, i32, %struct.tree_node*, %struct.rtx_def*, i32, i8*, i32, %struct.goto_fixup* }
+ %struct.stmt_tree_s = type { %struct.tree_node*, %struct.tree_node*, i8*, i32 }
+ %struct.temp_slot = type { %struct.temp_slot*, %struct.rtx_def*, %struct.rtx_def*, i32, i64, %struct.tree_node*, %struct.tree_node*, i8, i8, i32, i32, i64, i64 }
+ %struct.tree_common = type { %struct.tree_node*, %struct.tree_node*, i8, i8, i8, i8 }
+ %struct.tree_decl = type { %struct.tree_common, i8*, i32, i32, %struct.tree_node*, i8, i8, i8, i8, i8, i8, %struct.rtunion, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, { %struct.function* }, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, i64, %struct.lang_decl* }
+ %struct.tree_exp = type { %struct.tree_common, i32, [1 x %struct.tree_node*] }
+ %struct.tree_node = type { %struct.tree_decl }
+ %struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* }
+ %struct.varasm_status = type { %struct.constant_descriptor**, %struct.pool_constant**, %struct.pool_constant*, %struct.pool_constant*, i64, %struct.rtx_def* }
+ %struct.varray_data = type { [1 x i64] }
+ %struct.varray_head_tag = type { i32, i32, i32, i8*, %struct.varray_data }
+@lineno = internal global i32 0 ; <i32*> [#uses=1]
+@tree_code_length = internal global [256 x i32] zeroinitializer
+@llvm.used = appending global [1 x i8*] [ i8* bitcast (%struct.tree_node* (i32, ...)* @build_stmt to i8*) ], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
+
+define %struct.tree_node* @build_stmt(i32 %code, ...) nounwind {
+entry:
+ %p = alloca i8* ; <i8**> [#uses=3]
+ %p1 = bitcast i8** %p to i8* ; <i8*> [#uses=2]
+ call void @llvm.va_start(i8* %p1)
+ %0 = call fastcc %struct.tree_node* @make_node(i32 %code) nounwind ; <%struct.tree_node*> [#uses=2]
+ %1 = getelementptr [256 x i32]* @tree_code_length, i32 0, i32 %code ; <i32*> [#uses=1]
+ %2 = load i32* %1, align 4 ; <i32> [#uses=2]
+ %3 = load i32* @lineno, align 4 ; <i32> [#uses=1]
+ %4 = bitcast %struct.tree_node* %0 to %struct.tree_exp* ; <%struct.tree_exp*> [#uses=2]
+ %5 = getelementptr %struct.tree_exp* %4, i32 0, i32 1 ; <i32*> [#uses=1]
+ store i32 %3, i32* %5, align 4
+ %6 = icmp sgt i32 %2, 0 ; <i1> [#uses=1]
+ br i1 %6, label %bb, label %bb3
+
+bb: ; preds = %bb, %entry
+ %i.01 = phi i32 [ %indvar.next, %bb ], [ 0, %entry ] ; <i32> [#uses=2]
+ %7 = load i8** %p, align 4 ; <i8*> [#uses=2]
+ %8 = getelementptr i8* %7, i32 4 ; <i8*> [#uses=1]
+ store i8* %8, i8** %p, align 4
+ %9 = bitcast i8* %7 to %struct.tree_node** ; <%struct.tree_node**> [#uses=1]
+ %10 = load %struct.tree_node** %9, align 4 ; <%struct.tree_node*> [#uses=1]
+ %11 = getelementptr %struct.tree_exp* %4, i32 0, i32 2, i32 %i.01 ; <%struct.tree_node**> [#uses=1]
+ store %struct.tree_node* %10, %struct.tree_node** %11, align 4
+ %indvar.next = add i32 %i.01, 1 ; <i32> [#uses=2]
+ %exitcond = icmp eq i32 %indvar.next, %2 ; <i1> [#uses=1]
+ br i1 %exitcond, label %bb3, label %bb
+
+bb3: ; preds = %bb, %entry
+ call void @llvm.va_end(i8* %p1)
+ ret %struct.tree_node* %0
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @llvm.va_end(i8*) nounwind
+
+declare fastcc %struct.tree_node* @make_node(i32) nounwind
diff --git a/test/CodeGen/X86/stride-nine-with-base-reg.ll b/test/CodeGen/X86/stride-nine-with-base-reg.ll
index 4bd9924f26..c0cfb852bd 100644
--- a/test/CodeGen/X86/stride-nine-with-base-reg.ll
+++ b/test/CodeGen/X86/stride-nine-with-base-reg.ll
@@ -1,14 +1,14 @@
-; RUN: llvm-as < %s | llc -march=x86 -relocation-model=static | grep lea | count 1
+; RUN: llvm-as < %s | llc -march=x86 -relocation-model=static | not grep lea
; RUN: llvm-as < %s | llc -march=x86-64 | not grep lea
-; For x86 there's an lea above the loop. In both cases, there shouldn't
-; be any lea instructions inside the loop.
+; _P should be sunk into the loop and folded into the address mode. There
+; shouldn't be any lea instructions inside the loop.
@B = external global [1000 x i8], align 32
@A = external global [1000 x i8], align 32
@P = external global [1000 x i8], align 32
-define void @foo(i32 %m, i32 %p) {
+define void @foo(i32 %m, i32 %p) nounwind {
entry:
%tmp1 = icmp sgt i32 %m, 0
br i1 %tmp1, label %bb, label %return