diff options
-rw-r--r-- | lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 89 | ||||
-rw-r--r-- | test/CodeGen/PowerPC/stdux-constuse.ll | 47 |
2 files changed, 136 insertions, 0 deletions
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d694bc782c..472919c3e3 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6917,6 +6917,16 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { ISD::MemIndexedMode AM = ISD::UNINDEXED; if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG)) return false; + + // Backends without true r+i pre-indexed forms may need to pass a + // constant base with a variable offset so that constant coercion + // will work with the patterns in canonical form. + bool Swapped = false; + if (isa<ConstantSDNode>(BasePtr)) { + std::swap(BasePtr, Offset); + Swapped = true; + } + // Don't create a indexed load / store with zero offset. if (isa<ConstantSDNode>(Offset) && cast<ConstantSDNode>(Offset)->isNullValue()) @@ -6942,6 +6952,48 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { return false; } + // If the offset is a constant, there may be other adds of constants that + // can be folded with this one. We should do this to avoid having to keep + // a copy of the original base pointer. + SmallVector<SDNode *, 16> OtherUses; + if (isa<ConstantSDNode>(Offset)) + for (SDNode::use_iterator I = BasePtr.getNode()->use_begin(), + E = BasePtr.getNode()->use_end(); I != E; ++I) { + SDNode *Use = *I; + if (Use == Ptr.getNode()) + continue; + + if (Use->isPredecessorOf(N)) + continue; + + if (Use->getOpcode() != ISD::ADD && Use->getOpcode() != ISD::SUB) { + OtherUses.clear(); + break; + } + + SDValue Op0 = Use->getOperand(0), Op1 = Use->getOperand(1); + if (Op1.getNode() == BasePtr.getNode()) + std::swap(Op0, Op1); + assert(Op0.getNode() == BasePtr.getNode() && + "Use of ADD/SUB but not an operand"); + + if (!isa<ConstantSDNode>(Op1)) { + OtherUses.clear(); + break; + } + + // FIXME: In some cases, we can be smarter about this. + if (Op1.getValueType() != Offset.getValueType()) { + OtherUses.clear(); + break; + } + + OtherUses.push_back(Use); + } + + if (Swapped) + std::swap(BasePtr, Offset); + // Now check for #3 and #4. bool RealUse = false; @@ -6991,6 +7043,43 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // Finally, since the node is now dead, remove it from the graph. DAG.DeleteNode(N); + if (Swapped) + std::swap(BasePtr, Offset); + + // Replace other uses of BasePtr that can be updated to use Ptr + for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) { + unsigned OffsetIdx = 1; + if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode()) + OffsetIdx = 0; + assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() == + BasePtr.getNode() && "Expected BasePtr operand"); + + APInt OV = + cast<ConstantSDNode>(Offset)->getAPIntValue(); + if (AM == ISD::PRE_DEC) + OV = -OV; + + ConstantSDNode *CN = + cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx)); + APInt CNV = CN->getAPIntValue(); + if (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) + CNV += OV; + else + CNV -= OV; + + SDValue NewOp1 = Result.getValue(isLoad ? 1 : 0); + SDValue NewOp2 = DAG.getConstant(CNV, CN->getValueType(0)); + if (OffsetIdx == 0) + std::swap(NewOp1, NewOp2); + + SDValue NewUse = DAG.getNode(OtherUses[i]->getOpcode(), + OtherUses[i]->getDebugLoc(), + OtherUses[i]->getValueType(0), NewOp1, NewOp2); + DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse); + removeFromWorkList(OtherUses[i]); + DAG.DeleteNode(OtherUses[i]); + } + // Replace the uses of Ptr with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); removeFromWorkList(Ptr.getNode()); diff --git a/test/CodeGen/PowerPC/stdux-constuse.ll b/test/CodeGen/PowerPC/stdux-constuse.ll new file mode 100644 index 0000000000..e62d438014 --- /dev/null +++ b/test/CodeGen/PowerPC/stdux-constuse.ll @@ -0,0 +1,47 @@ +; RUN: llc -mcpu=a2 -disable-lsr < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define i32 @test1(i64 %add, i64* %ptr) nounwind { +entry: + %p1 = getelementptr i64* %ptr, i64 144115188075855 + br label %for.cond2.preheader + +for.cond2.preheader: + %nl.018 = phi i32 [ 0, %entry ], [ %inc9, %for.end ] + br label %for.body4 + +for.body4: + %lsr.iv = phi i32 [ %lsr.iv.next, %for.body4 ], [ 16000, %for.cond2.preheader ] + %i0 = phi i64* [ %p1, %for.cond2.preheader ], [ %i6, %for.body4 ] + %i6 = getelementptr i64* %i0, i64 400000 + %i7 = getelementptr i64* %i6, i64 300000 + %i8 = getelementptr i64* %i6, i64 200000 + %i9 = getelementptr i64* %i6, i64 100000 + store i64 %add, i64* %i6, align 32 + store i64 %add, i64* %i7, align 32 + store i64 %add, i64* %i8, align 32 + store i64 %add, i64* %i9, align 32 + %lsr.iv.next = add i32 %lsr.iv, -16 + %exitcond.15 = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond.15, label %for.end, label %for.body4 + +; Make sure that we generate the most compact form of this loop with no +; unnecessary moves +; CHECK: @test1 +; CHECK: mtctr +; CHECK: stdux +; CHECK-NEXT: stdx +; CHECK-NEXT: stdx +; CHECK-NEXT: stdx +; CHECK-NEXT: bdnz + +for.end: + %inc9 = add nsw i32 %nl.018, 1 + %exitcond = icmp eq i32 %inc9, 400000 + br i1 %exitcond, label %for.end10, label %for.cond2.preheader + +for.end10: + ret i32 0 +} + |