diff options
-rw-r--r-- | lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 103 | ||||
-rw-r--r-- | lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 4 | ||||
-rw-r--r-- | lib/Transforms/Scalar/SimplifyLibCalls.cpp | 13 | ||||
-rw-r--r-- | test/CodeGen/X86/memcmp.ll | 76 | ||||
-rw-r--r-- | test/Transforms/SimplifyLibCalls/memcmp.ll | 3 |
5 files changed, 183 insertions, 16 deletions
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e3c219cd0b..e194003da9 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Constants.h" #include "llvm/CallingConv.h" #include "llvm/DerivedTypes.h" @@ -5075,6 +5076,105 @@ void SelectionDAGBuilder::LowerCallTo(CallSite CS, SDValue Callee, } } +/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the +/// value is equal or not-equal to zero. +static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + if (IC->isEquality()) + if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) + if (C->isNullValue()) + continue; + // Unknown instruction. + return false; + } + return true; +} + +static SDValue getMemCmpLoad(Value *PtrVal, unsigned Size, + SelectionDAGBuilder &Builder) { + MVT LoadVT; + const Type *LoadTy; + if (Size == 2) { + LoadVT = MVT::i16; + LoadTy = Type::getInt16Ty(PtrVal->getContext()); + } else { + LoadVT = MVT::i32; + LoadTy = Type::getInt32Ty(PtrVal->getContext()); + } + + // Check to see if this load can be trivially constant folded, e.g. if the + // input is from a string literal. + if (Constant *LoadInput = dyn_cast<Constant>(PtrVal)) { + // Cast pointer to the type we really want to load. + LoadInput = ConstantExpr::getBitCast(LoadInput, + PointerType::getUnqual(LoadTy)); + + if (Constant *LoadCst = ConstantFoldLoadFromConstPtr(LoadInput, Builder.TD)) + return Builder.getValue(LoadCst); + } + + // Otherwise, we have to emit the load. If the pointer is to unfoldable but + // still constant memory, the input chain can be the entry node. + SDValue Root; + bool ConstantMemory = false; + + // Do not serialize (non-volatile) loads of constant memory with anything. + if (Builder.AA->pointsToConstantMemory(PtrVal)) { + Root = Builder.DAG.getEntryNode(); + ConstantMemory = true; + } else { + // Do not serialize non-volatile loads against each other. + Root = Builder.DAG.getRoot(); + } + + SDValue Ptr = Builder.getValue(PtrVal); + SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurDebugLoc(), Root, + Ptr, PtrVal /*SrcValue*/, 0/*SVOffset*/, + false /*volatile*/, 1 /* align=1 */); + + if (!ConstantMemory) + Builder.PendingLoads.push_back(LoadVal.getValue(1)); + return LoadVal; +} + + +/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form. +/// If so, return true and lower it, otherwise return false and it will be +/// lowered like a normal call. +bool SelectionDAGBuilder::visitMemCmpCall(CallInst &I) { + // Verify that the prototype makes sense. int memcmp(void*,void*,size_t) + if (I.getNumOperands() != 4) + return false; + + Value *LHS = I.getOperand(1), *RHS = I.getOperand(2); + if (!isa<PointerType>(LHS->getType()) || !isa<PointerType>(RHS->getType()) || + !isa<IntegerType>(I.getOperand(3)->getType()) || + !isa<IntegerType>(I.getType())) + return false; + + ConstantInt *Size = dyn_cast<ConstantInt>(I.getOperand(3)); + + // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 + // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 + if (Size && (Size->getValue() == 2 || Size->getValue() == 4) && + IsOnlyUsedInZeroEqualityComparison(&I)) { + SDValue LHSVal = getMemCmpLoad(LHS, Size->getZExtValue(), *this); + SDValue RHSVal = getMemCmpLoad(RHS, Size->getZExtValue(), *this); + + SDValue Res = DAG.getSetCC(getCurDebugLoc(), MVT::i1, LHSVal, RHSVal, + ISD::SETNE); + EVT CallVT = TLI.getValueType(I.getType(), true); + setValue(&I, DAG.getZExtOrTrunc(Res, getCurDebugLoc(), CallVT)); + return true; + } + + + return false; +} + + void SelectionDAGBuilder::visitCall(CallInst &I) { const char *RenameFn = 0; if (Function *F = I.getCalledFunction()) { @@ -5148,6 +5248,9 @@ void SelectionDAGBuilder::visitCall(CallInst &I) { Tmp.getValueType(), Tmp)); return; } + } else if (Name == "memcmp") { + if (visitMemCmpCall(I)) + return; } } } else if (isa<InlineAsm>(I.getOperand(0))) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c9c5f4f87c..88a2017b47 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -91,11 +91,13 @@ class SelectionDAGBuilder { DenseMap<const Value*, SDValue> NodeMap; +public: /// PendingLoads - Loads are not emitted to the program immediately. We bunch /// them up and then emit token factor nodes when possible. This allows us to /// get simple disambiguation between loads without worrying about alias /// analysis. SmallVector<SDValue, 8> PendingLoads; +private: /// PendingExports - CopyToReg nodes that copy values to virtual registers /// for export to other blocks need to be emitted before any terminator @@ -461,6 +463,8 @@ private: void visitStore(StoreInst &I); void visitPHI(PHINode &I) { } // PHI nodes are handled specially. void visitCall(CallInst &I); + bool visitMemCmpCall(CallInst &I); + void visitInlineAsm(CallSite CS); const char *visitIntrinsicCall(CallInst &I, unsigned Intrinsic); void visitTargetIntrinsic(CallInst &I, unsigned Intrinsic); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 30c3f3f398..3c28ad27e5 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -1011,19 +1011,6 @@ struct MemCmpOpt : public LibCallOptimization { return B.CreateSExt(B.CreateSub(LHSV, RHSV, "chardiff"), CI->getType()); } - // memcmp(S1,S2,2) != 0 -> (*(short*)LHS ^ *(short*)RHS) != 0 - // memcmp(S1,S2,4) != 0 -> (*(int*)LHS ^ *(int*)RHS) != 0 - if ((Len == 2 || Len == 4) && IsOnlyUsedInZeroEqualityComparison(CI)) { - const Type *PTy = PointerType::getUnqual(Len == 2 ? - Type::getInt16Ty(*Context) : Type::getInt32Ty(*Context)); - LHS = B.CreateBitCast(LHS, PTy, "tmp"); - RHS = B.CreateBitCast(RHS, PTy, "tmp"); - LoadInst *LHSV = B.CreateLoad(LHS, "lhsv"); - LoadInst *RHSV = B.CreateLoad(RHS, "rhsv"); - LHSV->setAlignment(1); RHSV->setAlignment(1); // Unaligned loads. - return B.CreateZExt(B.CreateXor(LHSV, RHSV, "shortdiff"), CI->getType()); - } - // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) std::string LHSStr, RHSStr; if (GetConstantStringInfo(LHS, LHSStr) && diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll new file mode 100644 index 0000000000..f6086c4fbe --- /dev/null +++ b/test/CodeGen/X86/memcmp.ll @@ -0,0 +1,76 @@ +; RUN: llc %s -o - -march=x86-64 | FileCheck %s + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [6 x i8] c"fooxx\00", align 1 ; <[6 x i8]*> [#uses=1] + +declare i32 @memcmp(...) + +define void @memcmp2(i8* %X, i8* %Y, i32* nocapture %P) nounwind { +entry: + %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 2) nounwind ; <i32> [#uses=1] + %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1] + br i1 %1, label %return, label %bb + +bb: ; preds = %entry + store i32 4, i32* %P, align 4 + ret void + +return: ; preds = %entry + ret void +; CHECK: memcmp2: +; CHECK: movw (%rsi), %ax +; CHECK: cmpw %ax, (%rdi) +} + +define void @memcmp2a(i8* %X, i32* nocapture %P) nounwind { +entry: + %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 1), i32 2) nounwind ; <i32> [#uses=1] + %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1] + br i1 %1, label %return, label %bb + +bb: ; preds = %entry + store i32 4, i32* %P, align 4 + ret void + +return: ; preds = %entry + ret void +; CHECK: memcmp2a: +; CHECK: cmpw $28527, (%rdi) +} + + +define void @memcmp4(i8* %X, i8* %Y, i32* nocapture %P) nounwind { +entry: + %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 4) nounwind ; <i32> [#uses=1] + %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1] + br i1 %1, label %return, label %bb + +bb: ; preds = %entry + store i32 4, i32* %P, align 4 + ret void + +return: ; preds = %entry + ret void +; CHECK: memcmp4: +; CHECK: movl (%rsi), %eax +; CHECK: cmpl %eax, (%rdi) +} + +define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind { +entry: + %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 1), i32 4) nounwind ; <i32> [#uses=1] + %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1] + br i1 %1, label %return, label %bb + +bb: ; preds = %entry + store i32 4, i32* %P, align 4 + ret void + +return: ; preds = %entry + ret void +; CHECK: memcmp4a: +; CHECK: cmpl $2021158767, (%rdi) +} + diff --git a/test/Transforms/SimplifyLibCalls/memcmp.ll b/test/Transforms/SimplifyLibCalls/memcmp.ll index ed7bcac467..640d232a7f 100644 --- a/test/Transforms/SimplifyLibCalls/memcmp.ll +++ b/test/Transforms/SimplifyLibCalls/memcmp.ll @@ -14,9 +14,6 @@ define void @test(i8* %P, i8* %Q, i32 %N, i32* %IP, i1* %BP) { volatile store i32 %B, i32* %IP %C = call i32 @memcmp( i8* %P, i8* %Q, i32 1 ) ; <i32> [#uses=1] volatile store i32 %C, i32* %IP - %D = call i32 @memcmp( i8* %P, i8* %Q, i32 2 ) ; <i32> [#uses=1] - %E = icmp eq i32 %D, 0 ; <i1> [#uses=1] - volatile store i1 %E, i1* %BP %F = call i32 @memcmp(i8* getelementptr ([4 x i8]* @hel, i32 0, i32 0), i8* getelementptr ([8 x i8]* @hello_u, i32 0, i32 0), i32 3) |