author    Dan Gohman <gohman@apple.com>  2009-01-29 01:59:02 +0000
committer Dan Gohman <gohman@apple.com>  2009-01-29 01:59:02 +0000
commit    e5af2d3a224d4b38760a26d237cde040cb6e14eb
tree      8ef65dfe14fd2541c8d0785d1ce6855abfd876b4
parent    63e3e6350b14d04917268be07ca1c29e158fdfd7
Make x86's BT instruction matching more thorough, and add some
dagcombines that help it match in several more cases. Add several
more cases to test/CodeGen/X86/bt.ll. This doesn't yet include
matching for BT with an immediate operand, it just covers more
register+register cases.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@63266 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  include/llvm/Target/TargetLowering.h        |   2
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp     |  50
-rw-r--r--  lib/CodeGen/SelectionDAG/TargetLowering.cpp  |  43
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp           |  70
-rw-r--r--  test/CodeGen/X86/bt.ll                       | 420
-rw-r--r--  test/CodeGen/X86/commute-cmov.ll             |  17
6 files changed, 562 insertions(+), 40 deletions(-)
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index e6f18ea43f..4ec7d3f627 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -780,6 +780,8 @@ public:
SDValue CombineTo(SDNode *N, const std::vector<SDValue> &To);
SDValue CombineTo(SDNode *N, SDValue Res);
SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1);
+
+ void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO);
};
/// SimplifySetCC - Try to simplify a setcc built with the specified operands
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 48e556b236..848051940f 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -102,6 +102,8 @@ namespace {
SDValue To[] = { Res0, Res1 };
return CombineTo(N, To, 2, AddTo);
}
+
+ void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
private:
@@ -298,6 +300,10 @@ CombineTo(SDNode *N, SDValue Res0, SDValue Res1) {
return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1);
}
+void TargetLowering::DAGCombinerInfo::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+ return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
+}
//===----------------------------------------------------------------------===//
// Helper Functions
@@ -539,29 +545,14 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
return SDValue(N, 0);
}
-/// SimplifyDemandedBits - Check the specified integer node value to see if
-/// it can be simplified or if things it uses can be simplified by bit
-/// propagation. If so, return true.
-bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
- TargetLowering::TargetLoweringOpt TLO(DAG);
- APInt KnownZero, KnownOne;
- if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
- return false;
-
- // Revisit the node.
- AddToWorkList(Op.getNode());
-
- // Replace the old value with the new one.
- ++NodesCombined;
- DOUT << "\nReplacing.2 "; DEBUG(TLO.Old.getNode()->dump(&DAG));
- DOUT << "\nWith: "; DEBUG(TLO.New.getNode()->dump(&DAG));
- DOUT << '\n';
-
+void
+DAGCombiner::CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &
+ TLO) {
// Replace all uses. If any nodes become isomorphic to other nodes and
// are deleted, make sure to remove them from our worklist.
WorkListRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes);
-
+
// Push the new node and any (possibly new) users onto the worklist.
AddToWorkList(TLO.New.getNode());
AddUsersToWorkList(TLO.New.getNode());
@@ -580,6 +571,27 @@ bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
DAG.DeleteNode(TLO.Old.getNode());
}
+}
+
+/// SimplifyDemandedBits - Check the specified integer node value to see if
+/// it can be simplified or if things it uses can be simplified by bit
+/// propagation. If so, return true.
+bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
+ TargetLowering::TargetLoweringOpt TLO(DAG);
+ APInt KnownZero, KnownOne;
+ if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+ return false;
+
+ // Revisit the node.
+ AddToWorkList(Op.getNode());
+
+ // Replace the old value with the new one.
+ ++NodesCombined;
+ DOUT << "\nReplacing.2 "; DEBUG(TLO.Old.getNode()->dump(&DAG));
+ DOUT << "\nWith: "; DEBUG(TLO.New.getNode()->dump(&DAG));
+ DOUT << '\n';
+
+ CommitTargetLoweringOpt(TLO);
return true;
}
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7245244684..e479e05bdd 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -724,7 +724,7 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
const APInt &Demanded) {
// FIXME: ISD::SELECT, ISD::SELECT_CC
- switch(Op.getOpcode()) {
+ switch (Op.getOpcode()) {
default: break;
case ISD::AND:
case ISD::OR:
@@ -1054,6 +1054,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
}
break;
case ISD::SRA:
+  // If this is an arithmetic shift right and only the low bit is demanded,
+  // we can always convert this into a logical shr, even if the shift amount
+  // is variable.  The low bit of the shift cannot be an input sign bit
+  // unless the shift amount is >= the size of the data type, which is
+  // undefined.
+ if (DemandedMask == 1)
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1)));
+
if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
MVT VT = Op.getValueType();
unsigned ShAmt = SA->getZExtValue();
@@ -1332,6 +1340,21 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
return 1;
}
+static bool ValueHasAtMostOneBitSet(SDValue Val, const SelectionDAG &DAG) {
+ // Logical shift right or left won't ever introduce new set bits.
+ // We check for this case because we don't care which bits are
+ // set, but ComputeMaskedBits won't know anything unless it can
+ // determine which specific bits may be set.
+ if (Val.getOpcode() == ISD::SHL || Val.getOpcode() == ISD::SRL)
+ return ValueHasAtMostOneBitSet(Val.getOperand(0), DAG);
+
+ MVT OpVT = Val.getValueType();
+ unsigned BitWidth = OpVT.getSizeInBits();
+ APInt Mask = APInt::getAllOnesValue(BitWidth);
+ APInt KnownZero, KnownOne;
+ DAG.ComputeMaskedBits(Val, Mask, KnownZero, KnownOne);
+ return KnownZero.countPopulation() == BitWidth - 1;
+}
/// SimplifySetCC - Try to simplify a setcc built with the specified operands
/// and cc. If it is unable to simplify it, return a null SDValue.
@@ -1791,6 +1814,24 @@ TargetLowering::SimplifySetCC(MVT VT, SDValue N0, SDValue N1,
}
}
}
+
+  // Simplify x&y == y to x&y != 0 if y has exactly one bit set.
+ if (N0.getOpcode() == ISD::AND)
+ if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
+ if (ValueHasAtMostOneBitSet(N1, DAG)) {
+ Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+ SDValue Zero = DAG.getConstant(0, N1.getValueType());
+ return DAG.getSetCC(VT, N0, Zero, Cond);
+ }
+ }
+ if (N1.getOpcode() == ISD::AND)
+ if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
+ if (ValueHasAtMostOneBitSet(N0, DAG)) {
+ Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+ SDValue Zero = DAG.getConstant(0, N0.getValueType());
+ return DAG.getSetCC(VT, N1, Zero, Cond);
+ }
+ }
}
// Fold away ALL boolean setcc's.
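
Two identities justify the new folds above: when only the low bit is
demanded, an arithmetic shift right agrees with a logical one for in-range
shift amounts; and for a mask y with exactly one bit set, (x & y) == y holds
iff (x & y) != 0. A standalone brute-force check of both (a sketch, not part
of the patch; it assumes signed >> is an arithmetic shift, as on x86):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      // Fold 1: with only bit 0 demanded, x >>s n == x >>u n for n < 32,
      // since sign-extension cannot reach bit 0 unless n >= the bit width.
      for (int32_t x : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX})
        for (unsigned n = 0; n < 32; ++n)
          assert(((x >> n) & 1) == int32_t((uint32_t(x) >> n) & 1));

      // Fold 2: for single-bit y, (x & y) == y  <=>  (x & y) != 0, so the
      // setcc can compare against zero with the condition code inverted.
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned b = 0; b < 8; ++b) {
          unsigned y = 1u << b;
          assert(((x & y) == y) == ((x & y) != 0));
        }
      return 0;
    }
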
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6ec97e2db6..bf7c704b9b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5114,22 +5114,39 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
SDValue Op1 = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
- // Lower (X & (1 << N)) == 0 to BT.
- // Lower ((X >>u N) & 1) != 0 to BT.
- // Lower ((X >>s N) & 1) != 0 to BT.
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
if (Op0.getOpcode() == ISD::AND &&
Op0.hasOneUse() &&
Op1.getOpcode() == ISD::Constant &&
- Op0.getOperand(1).getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
- ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
- ConstantSDNode *CmpRHS = cast<ConstantSDNode>(Op1);
- SDValue AndLHS = Op0.getOperand(0);
- if (CmpRHS->getZExtValue() == 0 && AndRHS->getZExtValue() == 1 &&
- AndLHS.getOpcode() == ISD::SRL) {
- SDValue LHS = AndLHS.getOperand(0);
- SDValue RHS = AndLHS.getOperand(1);
+ SDValue LHS, RHS;
+ if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *Op010C =
+ dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
+ if (Op010C->getZExtValue() == 1) {
+ LHS = Op0.getOperand(0);
+ RHS = Op0.getOperand(1).getOperand(1);
+ }
+ } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *Op000C =
+ dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
+ if (Op000C->getZExtValue() == 1) {
+ LHS = Op0.getOperand(1);
+ RHS = Op0.getOperand(0).getOperand(1);
+ }
+ } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
+ ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
+ SDValue AndLHS = Op0.getOperand(0);
+ if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
+ LHS = AndLHS.getOperand(0);
+ RHS = AndLHS.getOperand(1);
+ }
+ }
+ if (LHS.getNode()) {
// If LHS is i8, promote it to i16 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i16 value is ok. We extend to i32 because
@@ -5141,10 +5158,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
// BT ignores high bits (like shifts) we can use anyextend.
if (LHS.getValueType() != RHS.getValueType())
RHS = DAG.getNode(ISD::ANY_EXTEND, LHS.getValueType(), RHS);
-
+
SDValue BT = DAG.getNode(X86ISD::BT, MVT::i32, LHS, RHS);
unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
- return DAG.getNode(X86ISD::SETCC, MVT::i8,
+ return DAG.getNode(X86ISD::SETCC, MVT::i8,
DAG.getConstant(Cond, MVT::i8), BT);
}
}
@@ -5295,7 +5312,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
!isScalarFPTypeInSSEReg(VT)) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
- if (isX86LogicalCmp(Opc) && !IllegalFPCMov) {
+ if ((isX86LogicalCmp(Opc) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME
Cond = Cmp;
addTest = false;
}
@@ -7547,6 +7564,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
unsigned NumOps = N->getNumOperands();
@@ -7587,7 +7605,9 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, Tys, Ops, 2);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Base, 1), ResNode.getValue(1));
+ TargetLowering::TargetLoweringOpt TLO(DAG);
+ TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
+ DCI.CommitTargetLoweringOpt(TLO);
return ResNode;
}
@@ -7875,6 +7895,23 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue PerformBTCombine(SDNode *N,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // BT ignores high bits in the bit index operand.
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.hasOneUse()) {
+ unsigned BitWidth = Op1.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG);
+ TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ return SDValue();
+}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -7883,7 +7920,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
case ISD::BUILD_VECTOR:
- return PerformBuildVectorCombine(N, DAG, Subtarget, *this);
+ return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
case ISD::SHL:
case ISD::SRA:
@@ -7892,6 +7929,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
+ case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
}
return SDValue();
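
The DemandedMask in PerformBTCombine encodes the fact that reg+reg BT reads
only the low Log2_32(BitWidth) bits of the bit index; x86 reduces a register
bit offset modulo the operand width. A small sketch of the equivalence being
exploited (hypothetical helper, not in the patch):

    #include <cassert>
    #include <cstdint>

    // What reg+reg btl computes: the bit index is reduced mod 32, so only
    // the low 5 bits of the index matter.
    static bool bt32(uint32_t x, uint32_t n) {
      return (x >> (n & 31)) & 1;
    }

    int main() {
      assert(bt32(0x80000000u, 31) == bt32(0x80000000u, 63)); // high index bits ignored
      assert(bt32(1u, 0));   // bit 0 of 1 is set
      assert(!bt32(1u, 33)); // 33 & 31 == 1; bit 1 of 1 is clear
      return 0;
    }
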
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index 86254d3295..f91130dd69 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc | grep btl
+; RUN: llvm-as < %s | llc -march=x86 | grep btl | count 28
; RUN: llvm-as < %s | llc -mcpu=pentium4 | grep btl | not grep esp
; RUN: llvm-as < %s | llc -mcpu=penryn | grep btl | not grep esp
; PR3253
@@ -7,8 +7,17 @@
; pentium4, however it is currently disabled due to the register+memory
; form having different semantics than the register+register form.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin8"
+; Test these patterns:
+; (X & (1 << N)) != 0 --> BT(X, N).
+; ((X >>u N) & 1) != 0 --> BT(X, N).
+; as well as several variations:
+; - The second form can use an arithmetic shift.
+; - Either form can use == instead of !=.
+; - Either form can compare with an operand of the &
+; instead of with 0.
+; - The comparison can be commuted (only cases where neither
+; operand is constant are included).
+; - The and can be commuted.
define void @test2(i32 %x, i32 %n) nounwind {
entry:
@@ -25,4 +34,409 @@ UnifiedReturnBlock: ; preds = %entry
ret void
}
+define void @test2b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @atest2(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
+ %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @atest2b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @test3(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
+ %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @test3b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @testne2(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
+ %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @testne2b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @atestne2(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
+ %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @atestne2b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @testne3(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
+ %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @testne3b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @query2(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
+ %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @query2b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @aquery2(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
+ %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @aquery2b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @query3(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
+ %tmp4 = icmp eq i32 %tmp3, %tmp29 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @query3b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, %tmp29 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @query3x(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
+ %tmp4 = icmp eq i32 %tmp29, %tmp3 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @query3bx(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp29, %tmp3 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @queryne2(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
+ %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @queryne2b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @aqueryne2(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
+ %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @aqueryne2b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @queryne3(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
+ %tmp4 = icmp ne i32 %tmp3, %tmp29 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @queryne3b(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, %tmp29 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @queryne3x(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
+ %tmp4 = icmp ne i32 %tmp29, %tmp3 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
+define void @queryne3bx(i32 %x, i32 %n) nounwind {
+entry:
+ %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp29, %tmp3 ; <i1> [#uses=1]
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb: ; preds = %entry
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock: ; preds = %entry
+ ret void
+}
+
declare void @foo()
diff --git a/test/CodeGen/X86/commute-cmov.ll b/test/CodeGen/X86/commute-cmov.ll
index 24398dc125..ac0e4ef3e5 100644
--- a/test/CodeGen/X86/commute-cmov.ll
+++ b/test/CodeGen/X86/commute-cmov.ll
@@ -1,5 +1,20 @@
-; RUN: llvm-as < %s | llc -march=x86 | grep {cmove 16(%esp)}
+; RUN: llvm-as < %s | llc -march=x86 > %t
+; RUN: grep btl %t | count 2
+; RUN: grep cmov %t | count 2
+; RUN: not grep test %t
+; RUN: not grep set %t
+; RUN: not grep j %t
+; RUN: not grep cmovne %t
+; RUN: not grep cmove %t
+define i32 @foo(i32 %x, i32 %n, i32 %w, i32 %v) nounwind readnone {
+entry:
+ %0 = lshr i32 %x, %n ; <i32> [#uses=1]
+ %1 = and i32 %0, 1 ; <i32> [#uses=1]
+ %toBool = icmp eq i32 %1, 0 ; <i1> [#uses=1]
+ %.0 = select i1 %toBool, i32 %v, i32 12 ; <i32> [#uses=1]
+ ret i32 %.0
+}
define i32 @bar(i32 %x, i32 %n, i32 %w, i32 %v) nounwind readnone {
entry:
%0 = lshr i32 %x, %n ; <i32> [#uses=1]