aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/Analysis/ConstantFolding.cpp28
-rw-r--r--lib/CodeGen/LiveIntervalAnalysis.cpp18
-rw-r--r--lib/CodeGen/MachineScheduler.cpp1
-rw-r--r--lib/MC/MCExpr.cpp4
-rw-r--r--lib/Support/APFloat.cpp5
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp48
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp10
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td14
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp3
-rw-r--r--lib/Target/NVPTX/NVPTX.td34
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp6
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.cpp20
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.h11
-rw-r--r--lib/Target/TargetLibraryInfo.cpp18
-rw-r--r--lib/Target/TargetTransformImpl.cpp12
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp79
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp141
-rw-r--r--lib/Target/X86/X86ISelLowering.h19
-rw-r--r--lib/Target/X86/X86InstrSSE.td154
-rw-r--r--lib/Target/X86/X86Subtarget.cpp5
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp20
-rw-r--r--lib/Transforms/Instrumentation/ThreadSanitizer.cpp99
-rw-r--r--lib/Transforms/Scalar/SimplifyLibCalls.cpp247
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp295
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp229
25 files changed, 895 insertions, 625 deletions
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 5cac8ca3ba..91a5b84e8a 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -292,7 +292,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
unsigned IntBytes = unsigned(CI->getBitWidth()/8);
for (unsigned i = 0; i != BytesLeft && ByteOffset != IntBytes; ++i) {
- CurPtr[i] = (unsigned char)(Val >> (ByteOffset * 8));
+ int n = ByteOffset;
+ if (!TD.isLittleEndian())
+ n = IntBytes - n - 1;
+ CurPtr[i] = (unsigned char)(Val >> (n * 8));
++ByteOffset;
}
return true;
@@ -442,10 +445,19 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
BytesLoaded, TD))
return 0;
- APInt ResultVal = APInt(IntType->getBitWidth(), RawBytes[BytesLoaded-1]);
- for (unsigned i = 1; i != BytesLoaded; ++i) {
- ResultVal <<= 8;
- ResultVal |= RawBytes[BytesLoaded-1-i];
+ APInt ResultVal = APInt(IntType->getBitWidth(), 0);
+ if (TD.isLittleEndian()) {
+ ResultVal = RawBytes[BytesLoaded - 1];
+ for (unsigned i = 1; i != BytesLoaded; ++i) {
+ ResultVal <<= 8;
+ ResultVal |= RawBytes[BytesLoaded-1-i];
+ }
+ } else {
+ ResultVal = RawBytes[0];
+ for (unsigned i = 1; i != BytesLoaded; ++i) {
+ ResultVal <<= 8;
+ ResultVal |= RawBytes[i];
+ }
}
return ConstantInt::get(IntType->getContext(), ResultVal);
@@ -521,10 +533,8 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
}
}
- // Try hard to fold loads from bitcasted strange and non-type-safe things. We
- // currently don't do any of this for big endian systems. It can be
- // generalized in the future if someone is interested.
- if (TD && TD->isLittleEndian())
+ // Try hard to fold loads from bitcasted strange and non-type-safe things.
+ if (TD)
return FoldReinterpretLoadFromConstPtr(CE, *TD);
return 0;
}
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 65bc4af99e..4e75d892e5 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -146,6 +146,11 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
}
+ OS << "RegMasks:";
+ for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i)
+ OS << ' ' << RegMaskSlots[i];
+ OS << '\n';
+
printInstrs(OS);
}
@@ -1257,10 +1262,15 @@ private:
SmallVectorImpl<SlotIndex>::iterator RI =
std::lower_bound(LIS.RegMaskSlots.begin(), LIS.RegMaskSlots.end(),
OldIdx);
- assert(*RI == OldIdx && "No RegMask at OldIdx.");
- *RI = NewIdx;
- assert(*prior(RI) < *RI && *RI < *next(RI) &&
- "RegSlots out of order. Did you move one call across another?");
+ assert(RI != LIS.RegMaskSlots.end() && *RI == OldIdx.getRegSlot() &&
+ "No RegMask at OldIdx.");
+ *RI = NewIdx.getRegSlot();
+ assert((RI == LIS.RegMaskSlots.begin() ||
+ SlotIndex::isEarlierInstr(*llvm::prior(RI), *RI)) &&
+ "Cannot move regmask instruction above another call");
+ assert((llvm::next(RI) == LIS.RegMaskSlots.end() ||
+ SlotIndex::isEarlierInstr(*RI, *llvm::next(RI))) &&
+ "Cannot move regmask instruction below another call");
}
// Return the last use of reg between NewIdx and OldIdx.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index de16932c06..a4817d09c0 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -1583,6 +1583,7 @@ const char *ConvergingScheduler::getReasonStr(
case NextDefUse: return "DEF-USE ";
case NodeOrder: return "ORDER ";
};
+ llvm_unreachable("Unknown reason!");
}
void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand,
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index ffa79761f2..e0336342d6 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -60,7 +60,8 @@ void MCExpr::print(raw_ostream &OS) const {
SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTOFF ||
SRE.getKind() == MCSymbolRefExpr::VK_ARM_TPOFF ||
SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF ||
- SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1)
+ SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1 ||
+ SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET2)
OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
else if (SRE.getKind() != MCSymbolRefExpr::VK_None &&
SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 &&
@@ -199,6 +200,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_ARM_GOTTPOFF: return "(gottpoff)";
case VK_ARM_TLSGD: return "(tlsgd)";
case VK_ARM_TARGET1: return "(target1)";
+ case VK_ARM_TARGET2: return "(target2)";
case VK_PPC_TOC: return "tocbase";
case VK_PPC_TOC_ENTRY: return "toc";
case VK_PPC_DARWIN_HA16: return "ha16";
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 43c68f4d1d..7e8b4a3d0d 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -3553,11 +3553,6 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
}
bool APFloat::getExactInverse(APFloat *inv) const {
- // We can only guarantee the existence of an exact inverse for IEEE floats.
- if (semantics != &IEEEhalf && semantics != &IEEEsingle &&
- semantics != &IEEEdouble && semantics != &IEEEquad)
- return false;
-
// Special floats and denormals have no exact inverse.
if (category != fcNormal)
return false;
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 8d1a301a67..f67decc550 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -853,13 +853,28 @@ void ARMAsmPrinter::emitAttributes() {
AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
ARMBuildAttrs::Allowed);
} else if (CPUString == "generic") {
- // FIXME: Why these defaults?
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+ // For a generic CPU, we assume a standard v7a architecture in Subtarget.
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::ApplicationProfile);
AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
ARMBuildAttrs::Allowed);
AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
- ARMBuildAttrs::Allowed);
- }
+ ARMBuildAttrs::AllowThumb32);
+ } else if (Subtarget->hasV7Ops()) {
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumb32);
+ } else if (Subtarget->hasV6T2Ops())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2);
+ else if (Subtarget->hasV6Ops())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6);
+ else if (Subtarget->hasV5TEOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE);
+ else if (Subtarget->hasV5TOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
+ else if (Subtarget->hasV4TOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
if (Subtarget->hasNEON() && emitFPU) {
/* NEON is not exactly a VFP architecture, but GAS emit one of
@@ -1515,31 +1530,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
return;
}
- case ARM::t2BMOVPCB_CALL: {
- {
- MCInst TmpInst;
- TmpInst.setOpcode(ARM::tMOVr);
- TmpInst.addOperand(MCOperand::CreateReg(ARM::LR));
- TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
- // Add predicate operands.
- TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
- TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
- }
- {
- MCInst TmpInst;
- TmpInst.setOpcode(ARM::t2B);
- const GlobalValue *GV = MI->getOperand(0).getGlobal();
- MCSymbol *GVSym = Mang->getSymbol(GV);
- const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
- TmpInst.addOperand(MCOperand::CreateExpr(GVSymExpr));
- // Add predicate operands.
- TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
- TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
- }
- return;
- }
case ARM::MOVi16_ga_pcrel:
case ARM::t2MOVi16_ga_pcrel: {
MCInst TmpInst;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index dec498a4f7..0893826427 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1639,18 +1639,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Subtarget->isThumb()) {
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- else if (doesNotRet && isDirect && !isARMFunc &&
- Subtarget->hasRAS() && !Subtarget->isThumb1Only() &&
- // Emit regular call when code size is the priority
- !HasMinSizeAttr)
- // "mov lr, pc; b _foo" to avoid confusing the RSP
- CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
} else {
- if (!isDirect && !Subtarget->hasV5TOps()) {
+ if (!isDirect && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- } else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+ else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
// Emit regular call when code size is the priority
!HasMinSizeAttr)
// "mov lr, pc; b _foo" to avoid confusing the RSP
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 248bab6b12..c2800acccd 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -3331,20 +3331,6 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
Requires<[IsThumb2, IsIOS]>;
}
-let isCall = 1, Defs = [LR], Uses = [SP] in {
- // mov lr, pc; b if callee is marked noreturn to avoid confusing the
- // return stack predictor.
- def t2BMOVPCB_CALL : tPseudoInst<(outs),
- (ins t_bltarget:$func),
- 6, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
- Requires<[IsThumb]>;
-}
-
-// Direct calls
-def : T2Pat<(ARMcall_nolink texternalsym:$func),
- (t2BMOVPCB_CALL texternalsym:$func)>,
- Requires<[IsThumb]>;
-
// IT block
let Defs = [ITSTATE] in
def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 7a7ce27d48..253d1fa2ab 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -247,6 +247,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_TARGET1:
Type = ELF::R_ARM_TARGET1;
break;
+ case MCSymbolRefExpr::VK_ARM_TARGET2:
+ Type = ELF::R_ARM_TARGET2;
+ break;
}
break;
case ARM::fixup_arm_ldst_pcrel_12:
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index ae7710e54f..7aee3595c6 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -24,7 +24,30 @@ include "NVPTXInstrInfo.td"
// - Need at least one feature to avoid generating zero sized array by
// TableGen in NVPTXGenSubtarget.inc.
//===----------------------------------------------------------------------===//
-def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
+
+// SM Versions
+def SM10 : SubtargetFeature<"sm_10", "SmVersion", "10",
+ "Target SM 1.0">;
+def SM11 : SubtargetFeature<"sm_11", "SmVersion", "11",
+ "Target SM 1.1">;
+def SM12 : SubtargetFeature<"sm_12", "SmVersion", "12",
+ "Target SM 1.2">;
+def SM13 : SubtargetFeature<"sm_13", "SmVersion", "13",
+ "Target SM 1.3">;
+def SM20 : SubtargetFeature<"sm_20", "SmVersion", "20",
+ "Target SM 2.0">;
+def SM21 : SubtargetFeature<"sm_21", "SmVersion", "21",
+ "Target SM 2.1">;
+def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
+ "Target SM 3.0">;
+def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
+ "Target SM 3.5">;
+
+// PTX Versions
+def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30",
+ "Use PTX version 3.0">;
+def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31",
+ "Use PTX version 3.1">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -33,7 +56,14 @@ def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
-def : Proc<"sm_10", [FeatureDummy]>;
+def : Proc<"sm_10", [SM10]>;
+def : Proc<"sm_11", [SM11]>;
+def : Proc<"sm_12", [SM12]>;
+def : Proc<"sm_13", [SM13]>;
+def : Proc<"sm_20", [SM20]>;
+def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_30", [SM30]>;
+def : Proc<"sm_35", [SM35]>;
def NVPTXInstrInfo : InstrInfo {
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index d3dfb35e26..3dd9bf5613 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -910,7 +910,8 @@ void NVPTXAsmPrinter::emitHeader (Module &M, raw_ostream &O) {
O << "//\n";
O << "\n";
- O << ".version 3.0\n";
+ unsigned PTXVersion = nvptxSubtarget.getPTXVersion();
+ O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
O << ".target ";
O << nvptxSubtarget.getTargetName();
@@ -1525,6 +1526,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F,
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
unsigned align = PAL.getParamAlignment(paramIndex+1);
+ if (align == 0)
+ align = TD->getABITypeAlignment(ETy);
+
unsigned sz = TD->getTypeAllocSize(ETy);
O << "\t.param .align " << align
<< " .b8 ";
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 6aadd43e94..7b62cce2c6 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -34,16 +34,18 @@ DriverInterface(cl::desc("Choose driver interface:"),
NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool is64Bit)
-:NVPTXGenSubtargetInfo(TT, "", FS), // Don't pass CPU to subtarget,
- // because we don't register all
- // nvptx targets.
- Is64Bit(is64Bit) {
+: NVPTXGenSubtargetInfo(TT, CPU, FS),
+ Is64Bit(is64Bit),
+ PTXVersion(0),
+ SmVersion(10) {
drvInterface = DriverInterface;
// Provide the default CPU if none
std::string defCPU = "sm_10";
+ ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS);
+
// Get the TargetName from the FS if available
if (FS.empty() && CPU.empty())
TargetName = defCPU;
@@ -52,6 +54,12 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
else
llvm_unreachable("we are not using FeatureStr");
- // Set up the SmVersion
- SmVersion = atoi(TargetName.c_str()+3);
+ // We default to PTX 3.1, but we cannot just default to it in the initializer
+ // since the attribute parser checks if the given option is >= the default.
+ // So if we set ptx31 as the default, the ptx30 attribute would never match.
+ // Instead, we use 0 as the default and manually set 31 if the default is
+ // used.
+ if (PTXVersion == 0) {
+ PTXVersion = 31;
+ }
}
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 8f2a629d22..c3a683a2c6 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -25,13 +25,18 @@
namespace llvm {
class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
-
- unsigned int SmVersion;
+
std::string TargetName;
NVPTX::DrvInterface drvInterface;
bool dummy; // For the 'dummy' feature, see NVPTX.td
bool Is64Bit;
+ // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned PTXVersion;
+
+ // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned int SmVersion;
+
public:
/// This constructor initializes the data members to match that
/// of the specified module.
@@ -69,6 +74,8 @@ public:
NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
std::string getTargetName() const { return TargetName; }
+ unsigned getPTXVersion() const { return PTXVersion; }
+
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
std::string getDataLayout() const {
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 62f973e658..6d4eab1204 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -152,9 +152,12 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"sqrt",
"sqrtf",
"sqrtl",
+ "stpcpy",
"strcat",
"strchr",
+ "strcmp",
"strcpy",
+ "strcspn",
"strdup",
"strlen",
"strncat",
@@ -162,6 +165,17 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"strncpy",
"strndup",
"strnlen",
+ "strpbrk",
+ "strrchr",
+ "strspn",
+ "strstr",
+ "strtod",
+ "strtof",
+ "strtol",
+ "strtold",
+ "strtoll",
+ "strtoul",
+ "strtoull",
"tan",
"tanf",
"tanh",
@@ -309,6 +323,10 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::tanf);
TLI.setUnavailable(LibFunc::tanhf);
}
+
+ // Win32 does *not* provide stpcpy. It is provided on POSIX systems:
+ // http://pubs.opengroup.org/onlinepubs/9699919799/functions/stpcpy.html
+ TLI.setUnavailable(LibFunc::stpcpy);
}
}
diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp
index c07332de32..b36e6f858f 100644
--- a/lib/Target/TargetTransformImpl.cpp
+++ b/lib/Target/TargetTransformImpl.cpp
@@ -214,8 +214,16 @@ unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
// Handle scalar conversions.
if (!Src->isVectorTy() && !Dst->isVectorTy()) {
- // Scalar bitcasts and truncs are usually free.
- if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
+ // Scalar bitcasts are usually free.
+ if (Opcode == Instruction::BitCast)
+ return 0;
+
+ if (Opcode == Instruction::Trunc &&
+ TLI->isTruncateFree(SrcLT.second, DstLT.second))
+ return 0;
+
+ if (Opcode == Instruction::ZExt &&
+ TLI->isZExtFree(SrcLT.second, DstLT.second))
return 0;
// Just check the op cost. If the operation is legal then assume it costs 1.
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index e86c1000f1..42134256e3 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2892,85 +2892,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
return Result;
}
-
- // FIXME: Custom handling because TableGen doesn't support multiple implicit
- // defs in an instruction pattern
- case X86ISD::PCMPESTRI: {
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
- SDValue N2 = Node->getOperand(2);
- SDValue N3 = Node->getOperand(3);
- SDValue N4 = Node->getOperand(4);
-
- // Make sure last argument is a constant
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N4);
- if (!Cst)
- break;
-
- uint64_t Imm = Cst->getZExtValue();
-
- SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
- X86::EAX, N1, SDValue()).getValue(1);
- InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
- N3, InFlag).getValue(1);
-
- SDValue Ops[] = { N0, N2, getI8Imm(Imm), InFlag };
- unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr :
- X86::PCMPESTRIrr;
- InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
- array_lengthof(Ops)), 0);
-
- if (!SDValue(Node, 0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::ECX, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 0), Result);
- }
- if (!SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::EFLAGS, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 1), Result);
- }
-
- return NULL;
- }
-
- // FIXME: Custom handling because TableGen doesn't support multiple implicit
- // defs in an instruction pattern
- case X86ISD::PCMPISTRI: {
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
- SDValue N2 = Node->getOperand(2);
-
- // Make sure last argument is a constant
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N2);
- if (!Cst)
- break;
-
- uint64_t Imm = Cst->getZExtValue();
-
- SDValue Ops[] = { N0, N1, getI8Imm(Imm) };
- unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr :
- X86::PCMPISTRIrr;
- SDValue InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
- array_lengthof(Ops)), 0);
-
- if (!SDValue(Node, 0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::ECX, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 0), Result);
- }
- if (!SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::EFLAGS, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 1), Result);
- }
-
- return NULL;
- }
}
SDNode *ResNode = SelectCode(Node);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index fd0a8a27d6..5610bb5ba3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12240,6 +12240,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
+ case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
+ case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
}
}
@@ -12388,13 +12390,10 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
-// private utility function
-
/// Utility function to emit xbegin specifying the start of an RTM region.
-MachineBasicBlock *
-X86TargetLowering::EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB) const {
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = MBB;
@@ -13033,45 +13032,82 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
-MachineBasicBlock *
-X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned numArgs, bool memArg) const {
- assert(Subtarget->hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
+ case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+ case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
+ case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+ case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
+ case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+ case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
+ case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+ }
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ BuildMI(*BB, MI, dl,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::XMM0);
+
+ MI->eraseFromParent();
+ return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
unsigned Opc;
- if (!Subtarget->hasAVX()) {
- if (memArg)
- Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
- else
- Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
- } else {
- if (memArg)
- Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
- else
- Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
+ case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+ case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
+ case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+ case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
+ case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+ case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
+ case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
}
+ DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
- for (unsigned i = 0; i < numArgs; ++i) {
- MachineOperand &Op = MI->getOperand(i+1);
+
+ unsigned NumArgs = MI->getNumOperands(); // remove the results
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
BuildMI(*BB, MI, dl,
TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::XMM0);
+ .addReg(X86::ECX);
MI->eraseFromParent();
return BB;
}
-MachineBasicBlock *
-X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
+static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII,
+ const X86Subtarget* Subtarget) {
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
@@ -14125,36 +14161,33 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::PCMPESTRM128REG:
case X86::VPCMPESTRM128REG:
case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM: {
- unsigned NumArgs;
- bool MemArg;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRM128REG:
- case X86::VPCMPISTRM128REG:
- NumArgs = 3; MemArg = false; break;
- case X86::PCMPISTRM128MEM:
- case X86::VPCMPISTRM128MEM:
- NumArgs = 3; MemArg = true; break;
- case X86::PCMPESTRM128REG:
- case X86::VPCMPESTRM128REG:
- NumArgs = 5; MemArg = false; break;
- case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM:
- NumArgs = 5; MemArg = true; break;
- }
- return EmitPCMP(MI, BB, NumArgs, MemArg);
- }
-
- // Thread synchronization.
+ case X86::VPCMPESTRM128MEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+
+ // String/text processing lowering.
+ case X86::PCMPISTRIREG:
+ case X86::VPCMPISTRIREG:
+ case X86::PCMPISTRIMEM:
+ case X86::VPCMPISTRIMEM:
+ case X86::PCMPESTRIREG:
+ case X86::VPCMPESTRIREG:
+ case X86::PCMPESTRIMEM:
+ case X86::VPCMPESTRIMEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
+
+ // Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB);
+ return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB);
+ return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
- // Atomic Lowering.
+ // Atomic Lowering.
case X86::ATOMAND8:
case X86::ATOMAND16:
case X86::ATOMAND32:
@@ -17993,8 +18026,8 @@ unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 09f175db53..b6e8960f76 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -893,21 +893,6 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const;
- /// Utility function to emit string processing sse4.2 instructions
- /// that return in xmm0.
- /// This takes the instruction to expand, the associated machine basic
- /// block, the number of args, and whether or not the second arg is
- /// in memory or not.
- MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB,
- unsigned argNum, bool inMem) const;
-
- /// Utility functions to emit monitor and mwait instructions. These
- /// need to make sure that the arguments to the intrinsic are in the
- /// correct registers.
- MachineBasicBlock *EmitMonitor(MachineInstr *MI,
- MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const;
-
/// Utility function to emit atomic-load-arith operations (and, or, xor,
/// nand, max, min, umax, umin). It takes the corresponding instruction to
/// expand, the associated machine basic block, and the associated X86
@@ -920,10 +905,6 @@ namespace llvm {
MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
MachineBasicBlock *MBB) const;
- /// Utility function to emit xbegin specifying the start of an RTM region.
- MachineBasicBlock *EmitXBegin(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
-
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *EmitVAARG64WithCustomInserter(
MachineInstr *MI,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 28dfbe7a1f..6f48d7ed7f 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7002,8 +7002,8 @@ multiclass pseudo_pcmpistrm<string asm> {
imm:$src3))]>;
def MEM : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128
- VR128:$src1, (load addr:$src2), imm:$src3))]>;
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}
let Defs = [EFLAGS], usesCustomInserter = 1 in {
@@ -7011,24 +7011,22 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in {
defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}
-let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in {
- def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
+multiclass pcmpistrm_SS42AI<string asm> {
+ def rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
let mayLoad = 1 in
- def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
+ def rm :SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
}
let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
- def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
- let mayLoad = 1 in
- def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+ defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
// Packed Compare Explicit Length Strings, Return Mask
@@ -7039,8 +7037,8 @@ multiclass pseudo_pcmpestrm<string asm> {
VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
def MEM : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
- VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>;
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+ (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
}
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
@@ -7048,64 +7046,94 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}
-let Predicates = [HasAVX],
- Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
+multiclass SS42AI_pcmpestrm<string asm> {
+ def rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
let mayLoad = 1 in
- def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
+ def rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
}
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
- let mayLoad = 1 in
- def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
}
// Packed Compare Implicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
- multiclass SS42AI_pcmpistri<string asm> {
- def rr : SS42AI<0x63, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
- let mayLoad = 1 in
- def rm : SS42AI<0x63, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
- }
+multiclass pseudo_pcmpistri<string asm> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}
-let Predicates = [HasAVX] in
-defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
-defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
+ defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+ defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+}
// Packed Compare Explicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- multiclass SS42AI_pcmpestri<string asm> {
- def rr : SS42AI<0x61, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
- let mayLoad = 1 in
- def rm : SS42AI<0x61, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
- }
+multiclass pseudo_pcmpestri<string asm> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
+ imm:$src5))]>;
}
-let Predicates = [HasAVX] in
-defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
-defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
+ defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+ defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+}
//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 31be6b7a7b..0132f81410 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -423,12 +423,11 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
assert((!In64BitMode || HasX86_64) &&
"64-bit code requested on a subtarget that doesn't support it!");
- // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux, Solaris (both
+ // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both
// 32 and 64 bit), NaCl and for all 64-bit targets.
if (StackAlignOverride)
stackAlignment = StackAlignOverride;
- else if (isTargetDarwin() || isTargetFreeBSD() || isTargetLinux() ||
- isTargetSolaris() ||
+ else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
isTargetNaCl() || // @LOCALMOD
In64BitMode)
stackAlignment = 16;
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index ccf75bca2b..9a46f25e66 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2367,6 +2367,24 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
return MadeIRChange;
}
+namespace {
+class InstCombinerLibCallSimplifier : public LibCallSimplifier {
+ InstCombiner *IC;
+public:
+ InstCombinerLibCallSimplifier(const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
+ InstCombiner *IC)
+ : LibCallSimplifier(TD, TLI) {
+ this->IC = IC;
+ }
+
+ /// replaceAllUsesWith - override so that instruction replacement
+ /// can be defined in terms of the instruction combiner framework.
+ virtual void replaceAllUsesWith(Instruction *I, Value *With) const {
+ IC->ReplaceInstUsesWith(*I, With);
+ }
+};
+}
bool InstCombiner::runOnFunction(Function &F) {
TD = getAnalysisIfAvailable<DataLayout>();
@@ -2379,7 +2397,7 @@ bool InstCombiner::runOnFunction(Function &F) {
InstCombineIRInserter(Worklist));
Builder = &TheBuilder;
- LibCallSimplifier TheSimplifier(TD, TLI);
+ InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this);
Simplifier = &TheSimplifier;
bool EverMadeChange = false;
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index c6244a55c9..9e10fc4416 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -97,6 +97,10 @@ struct ThreadSanitizer : public FunctionPass {
Function *TsanWrite[kNumberOfAccessSizes];
Function *TsanAtomicLoad[kNumberOfAccessSizes];
Function *TsanAtomicStore[kNumberOfAccessSizes];
+ Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes];
+ Function *TsanAtomicCAS[kNumberOfAccessSizes];
+ Function *TsanAtomicThreadFence;
+ Function *TsanAtomicSignalFence;
Function *TsanVptrUpdate;
};
} // namespace
@@ -167,10 +171,42 @@ bool ThreadSanitizer::doInitialization(Module &M) {
TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction(
AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy,
NULL));
+
+ for (int op = AtomicRMWInst::FIRST_BINOP;
+ op <= AtomicRMWInst::LAST_BINOP; ++op) {
+ TsanAtomicRMW[op][i] = NULL;
+ const char *NamePart = NULL;
+ if (op == AtomicRMWInst::Xchg)
+ NamePart = "_exchange";
+ else if (op == AtomicRMWInst::Add)
+ NamePart = "_fetch_add";
+ else if (op == AtomicRMWInst::Sub)
+ NamePart = "_fetch_sub";
+ else if (op == AtomicRMWInst::And)
+ NamePart = "_fetch_and";
+ else if (op == AtomicRMWInst::Or)
+ NamePart = "_fetch_or";
+ else if (op == AtomicRMWInst::Xor)
+ NamePart = "_fetch_xor";
+ else
+ continue;
+ SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
+ TsanAtomicRMW[op][i] = checkInterfaceFunction(M.getOrInsertFunction(
+ RMWName, Ty, PtrTy, Ty, OrdTy, NULL));
+ }
+
+ SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) +
+ "_compare_exchange_val");
+ TsanAtomicCAS[i] = checkInterfaceFunction(M.getOrInsertFunction(
+ AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, NULL));
}
TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction(
"__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), NULL));
+ TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL));
+ TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL));
return true;
}
@@ -253,8 +289,8 @@ static bool isAtomic(Instruction *I) {
return true;
if (isa<AtomicCmpXchgInst>(I))
return true;
- if (FenceInst *FI = dyn_cast<FenceInst>(I))
- return FI->getSynchScope() == CrossThread;
+ if (isa<FenceInst>(I))
+ return true;
return false;
}
@@ -354,15 +390,14 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
switch (ord) {
case NotAtomic: assert(false);
case Unordered: // Fall-through.
- case Monotonic: v = 1 << 0; break;
- // case Consume: v = 1 << 1; break; // Not specified yet.
- case Acquire: v = 1 << 2; break;
- case Release: v = 1 << 3; break;
- case AcquireRelease: v = 1 << 4; break;
- case SequentiallyConsistent: v = 1 << 5; break;
+ case Monotonic: v = 0; break;
+ // case Consume: v = 1; break; // Not specified yet.
+ case Acquire: v = 2; break;
+ case Release: v = 3; break;
+ case AcquireRelease: v = 4; break;
+ case SequentiallyConsistent: v = 5; break;
}
- // +100500 is temporal to migrate to new enum values.
- return IRB->getInt32(v + 100500);
+ return IRB->getInt32(v);
}
bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
@@ -397,12 +432,44 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
CallInst *C = CallInst::Create(TsanAtomicStore[Idx],
ArrayRef<Value*>(Args));
ReplaceInstWithInst(I, C);
- } else if (isa<AtomicRMWInst>(I)) {
- // FIXME: Not yet supported.
- } else if (isa<AtomicCmpXchgInst>(I)) {
- // FIXME: Not yet supported.
- } else if (isa<FenceInst>(I)) {
- // FIXME: Not yet supported.
+ } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ Value *Addr = RMWI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
+ return false;
+ Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx];
+ if (F == NULL)
+ return false;
+ const size_t ByteSize = 1 << Idx;
+ const size_t BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+ createOrdering(&IRB, RMWI->getOrdering())};
+ CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ Value *Addr = CASI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
+ return false;
+ const size_t ByteSize = 1 << Idx;
+ const size_t BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false),
+ IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false),
+ createOrdering(&IRB, CASI->getOrdering())};
+ CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
+ } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
+ Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
+ Function *F = FI->getSynchScope() == SingleThread ?
+ TsanAtomicSignalFence : TsanAtomicThreadFence;
+ CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
}
return true;
}
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 7d652dea48..17d07cdb2d 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -99,243 +99,7 @@ static bool CallHasFloatingPointArgument(const CallInst *CI) {
return false;
}
-/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality
-/// comparisons with With.
-static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) {
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
- UI != E; ++UI) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
- if (IC->isEquality() && IC->getOperand(1) == With)
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
-//===----------------------------------------------------------------------===//
-// String and Memory LibCall Optimizations
-//===----------------------------------------------------------------------===//
-
namespace {
-//===---------------------------------------===//
-// 'strcspn' Optimizations
-
-struct StrCSpnOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strcspn("", s) -> 0
- if (HasS1 && S1.empty())
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_of(S2);
- if (Pos == StringRef::npos) Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- // strcspn(s, "") -> strlen(s)
- if (TD && HasS2 && S2.empty())
- return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'strstr' Optimizations
-
-struct StrStrOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isPointerTy())
- return 0;
-
- // fold strstr(x, x) -> x.
- if (CI->getArgOperand(0) == CI->getArgOperand(1))
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
- if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
- Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
- if (!StrLen)
- return 0;
- Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
- StrLen, B, TD, TLI);
- if (!StrNCmp)
- return 0;
- for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
- UI != UE; ) {
- ICmpInst *Old = cast<ICmpInst>(*UI++);
- Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
- ConstantInt::getNullValue(StrNCmp->getType()),
- "cmp");
- Old->replaceAllUsesWith(Cmp);
- Old->eraseFromParent();
- }
- return CI;
- }
-
- // See if either input string is a constant string.
- StringRef SearchStr, ToFindStr;
- bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
- bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
-
- // fold strstr(x, "") -> x.
- if (HasStr2 && ToFindStr.empty())
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // If both strings are known, constant fold it.
- if (HasStr1 && HasStr2) {
- std::string::size_type Offset = SearchStr.find(ToFindStr);
-
- if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
- return Constant::getNullValue(CI->getType());
-
- // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
- Value *Result = CastToCStr(CI->getArgOperand(0), B);
- Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
- return B.CreateBitCast(Result, CI->getType());
- }
-
- // fold strstr(x, "y") -> strchr(x, 'y').
- if (HasStr2 && ToFindStr.size() == 1) {
- Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
- return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
- }
- return 0;
- }
-};
-
-
-//===---------------------------------------===//
-// 'memcmp' Optimizations
-
-struct MemCmpOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy(32))
- return 0;
-
- Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
-
- if (LHS == RHS) // memcmp(s,s,x) -> 0
- return Constant::getNullValue(CI->getType());
-
- // Make sure we have a constant length.
- ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (!LenC) return 0;
- uint64_t Len = LenC->getZExtValue();
-
- if (Len == 0) // memcmp(s1,s2,0) -> 0
- return Constant::getNullValue(CI->getType());
-
- // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
- if (Len == 1) {
- Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
- CI->getType(), "lhsv");
- Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
- CI->getType(), "rhsv");
- return B.CreateSub(LHSV, RHSV, "chardiff");
- }
-
- // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
- StringRef LHSStr, RHSStr;
- if (getConstantStringInfo(LHS, LHSStr) &&
- getConstantStringInfo(RHS, RHSStr)) {
- // Make sure we're not reading out-of-bounds memory.
- if (Len > LHSStr.size() || Len > RHSStr.size())
- return 0;
- uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
- return ConstantInt::get(CI->getType(), Ret);
- }
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'memcpy' Optimizations
-
-struct MemCpyOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
- return 0;
-
- // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
- }
-};
-
-//===---------------------------------------===//
-// 'memmove' Optimizations
-
-struct MemMoveOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
- return 0;
-
- // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
- B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
- }
-};
-
-//===---------------------------------------===//
-// 'memset' Optimizations
-
-struct MemSetOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isIntegerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
- return 0;
-
- // memset(p, v, n) -> llvm.memset(p, v, n, 1)
- Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
- B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
- }
-};
-
//===----------------------------------------------------------------------===//
// Math Library Optimizations
//===----------------------------------------------------------------------===//
@@ -1004,9 +768,6 @@ namespace {
TargetLibraryInfo *TLI;
StringMap<LibCallOptimization*> Optimizations;
- // String and Memory LibCall Optimizations
- StrCSpnOpt StrCSpn; StrStrOpt StrStr;
- MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
// Math Library Optimizations
CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP;
@@ -1072,14 +833,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
/// Optimizations - Populate the Optimizations map with all the optimizations
/// we know.
void SimplifyLibCalls::InitOptimizations() {
- // String and Memory LibCall Optimizations
- Optimizations["strcspn"] = &StrCSpn;
- Optimizations["strstr"] = &StrStr;
- Optimizations["memcmp"] = &MemCmp;
- AddOpt(LibFunc::memcpy, &MemCpy);
- Optimizations["memmove"] = &MemMove;
- AddOpt(LibFunc::memset, &MemSet);
-
// Math Library Optimizations
Optimizations["cosf"] = &Cos;
Optimizations["cos"] = &Cos;
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 64c7011660..c3ea63852f 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -34,6 +34,7 @@ protected:
Function *Caller;
const DataLayout *TD;
const TargetLibraryInfo *TLI;
+ const LibCallSimplifier *LCS;
LLVMContext* Context;
public:
LibCallOptimization() { }
@@ -48,10 +49,12 @@ public:
=0;
Value *optimizeCall(CallInst *CI, const DataLayout *TD,
- const TargetLibraryInfo *TLI, IRBuilder<> &B) {
+ const TargetLibraryInfo *TLI,
+ const LibCallSimplifier *LCS, IRBuilder<> &B) {
Caller = CI->getParent()->getParent();
this->TD = TD;
this->TLI = TLI;
+ this->LCS = LCS;
if (CI->getCalledFunction())
Context = &CI->getCalledFunction()->getContext();
@@ -83,6 +86,20 @@ static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
return true;
}
+/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality
+/// comparisons with With.
+static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+ UI != E; ++UI) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+ if (IC->isEquality() && IC->getOperand(1) == With)
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// Fortified Library Call Optimizations
//===----------------------------------------------------------------------===//
@@ -801,6 +818,204 @@ struct StrSpnOpt : public LibCallOptimization {
}
};
+struct StrCSpnOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ FT->getParamType(1) != FT->getParamType(0) ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strcspn("", s) -> 0
+ if (HasS1 && S1.empty())
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_of(S2);
+ if (Pos == StringRef::npos) Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ // strcspn(s, "") -> strlen(s)
+ if (TD && HasS2 && S2.empty())
+ return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
+
+ return 0;
+ }
+};
+
+struct StrStrOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !FT->getReturnType()->isPointerTy())
+ return 0;
+
+ // fold strstr(x, x) -> x.
+ if (CI->getArgOperand(0) == CI->getArgOperand(1))
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+ if (TD && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+ Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
+ if (!StrLen)
+ return 0;
+ Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+ StrLen, B, TD, TLI);
+ if (!StrNCmp)
+ return 0;
+ for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
+ UI != UE; ) {
+ ICmpInst *Old = cast<ICmpInst>(*UI++);
+ Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
+ ConstantInt::getNullValue(StrNCmp->getType()),
+ "cmp");
+ LCS->replaceAllUsesWith(Old, Cmp);
+ }
+ return CI;
+ }
+
+ // See if either input string is a constant string.
+ StringRef SearchStr, ToFindStr;
+ bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+ bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+ // fold strstr(x, "") -> x.
+ if (HasStr2 && ToFindStr.empty())
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // If both strings are known, constant fold it.
+ if (HasStr1 && HasStr2) {
+ std::string::size_type Offset = SearchStr.find(ToFindStr);
+
+ if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
+ return Constant::getNullValue(CI->getType());
+
+ // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+ Value *Result = CastToCStr(CI->getArgOperand(0), B);
+ Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
+ return B.CreateBitCast(Result, CI->getType());
+ }
+
+ // fold strstr(x, "y") -> strchr(x, 'y').
+ if (HasStr2 && ToFindStr.size() == 1) {
+ Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
+ return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
+ }
+ return 0;
+ }
+};
+
+struct MemCmpOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !FT->getReturnType()->isIntegerTy(32))
+ return 0;
+
+ Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
+
+ if (LHS == RHS) // memcmp(s,s,x) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // Make sure we have a constant length.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!LenC) return 0;
+ uint64_t Len = LenC->getZExtValue();
+
+ if (Len == 0) // memcmp(s1,s2,0) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+ if (Len == 1) {
+ Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
+ CI->getType(), "lhsv");
+ Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
+ CI->getType(), "rhsv");
+ return B.CreateSub(LHSV, RHSV, "chardiff");
+ }
+
+ // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+ StringRef LHSStr, RHSStr;
+ if (getConstantStringInfo(LHS, LHSStr) &&
+ getConstantStringInfo(RHS, RHSStr)) {
+ // Make sure we're not reading out-of-bounds memory.
+ if (Len > LHSStr.size() || Len > RHSStr.size())
+ return 0;
+ uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ return ConstantInt::get(CI->getType(), Ret);
+ }
+
+ return 0;
+ }
+};
+
+struct MemCpyOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
+struct MemMoveOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+ B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
+struct MemSetOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isIntegerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
} // End anonymous namespace.
namespace llvm {
@@ -808,6 +1023,7 @@ namespace llvm {
class LibCallSimplifierImpl {
const DataLayout *TD;
const TargetLibraryInfo *TLI;
+ const LibCallSimplifier *LCS;
StringMap<LibCallOptimization*> Optimizations;
// Fortified library call optimizations.
@@ -818,7 +1034,7 @@ class LibCallSimplifierImpl {
StpCpyChkOpt StpCpyChk;
StrNCpyChkOpt StrNCpyChk;
- // String and memory library call optimizations.
+ // String library call optimizations.
StrCatOpt StrCat;
StrNCatOpt StrNCat;
StrChrOpt StrChr;
@@ -832,12 +1048,23 @@ class LibCallSimplifierImpl {
StrPBrkOpt StrPBrk;
StrToOpt StrTo;
StrSpnOpt StrSpn;
+ StrCSpnOpt StrCSpn;
+ StrStrOpt StrStr;
+
+ // Memory library call optimizations.
+ MemCmpOpt MemCmp;
+ MemCpyOpt MemCpy;
+ MemMoveOpt MemMove;
+ MemSetOpt MemSet;
void initOptimizations();
+ void addOpt(LibFunc::Func F, LibCallOptimization* Opt);
public:
- LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI) {
+ LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI,
+ const LibCallSimplifier *LCS) {
this->TD = TD;
this->TLI = TLI;
+ this->LCS = LCS;
}
Value *optimizeCall(CallInst *CI);
@@ -853,26 +1080,34 @@ void LibCallSimplifierImpl::initOptimizations() {
Optimizations["__strncpy_chk"] = &StrNCpyChk;
Optimizations["__stpncpy_chk"] = &StrNCpyChk;
- // String and memory library call optimizations.
- Optimizations["strcat"] = &StrCat;
- Optimizations["strncat"] = &StrNCat;
- Optimizations["strchr"] = &StrChr;
- Optimizations["strrchr"] = &StrRChr;
- Optimizations["strcmp"] = &StrCmp;
- Optimizations["strncmp"] = &StrNCmp;
- Optimizations["strcpy"] = &StrCpy;
- Optimizations["stpcpy"] = &StpCpy;
- Optimizations["strncpy"] = &StrNCpy;
- Optimizations["strlen"] = &StrLen;
- Optimizations["strpbrk"] = &StrPBrk;
- Optimizations["strtol"] = &StrTo;
- Optimizations["strtod"] = &StrTo;
- Optimizations["strtof"] = &StrTo;
- Optimizations["strtoul"] = &StrTo;
- Optimizations["strtoll"] = &StrTo;
- Optimizations["strtold"] = &StrTo;
- Optimizations["strtoull"] = &StrTo;
- Optimizations["strspn"] = &StrSpn;
+ // String library call optimizations.
+ addOpt(LibFunc::strcat, &StrCat);
+ addOpt(LibFunc::strncat, &StrNCat);
+ addOpt(LibFunc::strchr, &StrChr);
+ addOpt(LibFunc::strrchr, &StrRChr);
+ addOpt(LibFunc::strcmp, &StrCmp);
+ addOpt(LibFunc::strncmp, &StrNCmp);
+ addOpt(LibFunc::strcpy, &StrCpy);
+ addOpt(LibFunc::stpcpy, &StpCpy);
+ addOpt(LibFunc::strncpy, &StrNCpy);
+ addOpt(LibFunc::strlen, &StrLen);
+ addOpt(LibFunc::strpbrk, &StrPBrk);
+ addOpt(LibFunc::strtol, &StrTo);
+ addOpt(LibFunc::strtod, &StrTo);
+ addOpt(LibFunc::strtof, &StrTo);
+ addOpt(LibFunc::strtoul, &StrTo);
+ addOpt(LibFunc::strtoll, &StrTo);
+ addOpt(LibFunc::strtold, &StrTo);
+ addOpt(LibFunc::strtoull, &StrTo);
+ addOpt(LibFunc::strspn, &StrSpn);
+ addOpt(LibFunc::strcspn, &StrCSpn);
+ addOpt(LibFunc::strstr, &StrStr);
+
+ // Memory library call optimizations.
+ addOpt(LibFunc::memcmp, &MemCmp);
+ addOpt(LibFunc::memcpy, &MemCpy);
+ addOpt(LibFunc::memmove, &MemMove);
+ addOpt(LibFunc::memset, &MemSet);
}
Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
@@ -883,14 +1118,19 @@ Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
LibCallOptimization *LCO = Optimizations.lookup(Callee->getName());
if (LCO) {
IRBuilder<> Builder(CI);
- return LCO->optimizeCall(CI, TD, TLI, Builder);
+ return LCO->optimizeCall(CI, TD, TLI, LCS, Builder);
}
return 0;
}
+void LibCallSimplifierImpl::addOpt(LibFunc::Func F, LibCallOptimization* Opt) {
+ if (TLI->has(F))
+ Optimizations[TLI->getName(F)] = Opt;
+}
+
LibCallSimplifier::LibCallSimplifier(const DataLayout *TD,
const TargetLibraryInfo *TLI) {
- Impl = new LibCallSimplifierImpl(TD, TLI);
+ Impl = new LibCallSimplifierImpl(TD, TLI, this);
}
LibCallSimplifier::~LibCallSimplifier() {
@@ -901,4 +1141,9 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
return Impl->optimizeCall(CI);
}
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
+ I->replaceAllUsesWith(With);
+ I->eraseFromParent();
+}
+
}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 892808760f..a7ef248e6e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -78,6 +78,10 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
/// We don't vectorize loops with a known constant trip count below this number.
const unsigned TinyTripCountThreshold = 16;
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 2;
+
namespace {
// Forward declarations.
@@ -114,7 +118,7 @@ public:
/// Widen each instruction in the old loop to a new one in the new loop.
/// Use the Legality module to find the induction and reduction variables.
vectorizeLoop(Legal);
- // register the new loop.
+ // Register the new loop and update the analysis passes.
updateAnalysis();
}
@@ -123,7 +127,8 @@ private:
void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop.
void vectorizeLoop(LoopVectorizationLegality *Legal);
- /// Insert the new loop to the loop hierarchy and pass manager.
+ /// Insert the new loop to the loop hierarchy and pass manager
+ /// and update the analysis passes.
void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
@@ -242,6 +247,15 @@ public:
ReductionKind Kind;
};
+ // This POD struct holds information about the memory runtime legality
+ // check that a group of pointers do not overlap.
+ struct RuntimePointerCheck {
+ /// This flag indicates if we need to add the runtime check.
+ bool Need;
+ /// Holds the pointers that we need to check.
+ SmallVector<Value*, 2> Pointers;
+ };
+
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
@@ -263,9 +277,14 @@ public:
/// This check allows us to vectorize A[idx] into a wide load/store.
bool isConsecutiveGep(Value *Ptr);
+ /// Returns true if the value V is uniform within the loop.
+ bool isUniform(Value *V);
+
/// Returns true if this instruction will remain scalar after vectorization.
bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
+ /// Returns the information that we collected about runtime memory check.
+ RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -286,6 +305,8 @@ private:
bool isReductionInstr(Instruction *I, ReductionKind Kind);
/// Returns True, if 'Phi' is an induction variable.
bool isInductionVariable(PHINode *Phi);
+ /// Return true if can compute the address bounds of Ptr within the loop.
+ bool hasComputableBounds(Value *Ptr);
/// The loop that we evaluate.
Loop *TheLoop;
@@ -306,6 +327,9 @@ private:
/// This set holds the variables which are known to be uniform after
/// vectorization.
SmallPtrSet<Instruction*, 4> Uniforms;
+ /// We need to check that all of the pointers in this list are disjoint
+ /// at runtime.
+ RuntimePointerCheck PtrRtCheck;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -506,6 +530,10 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
return false;
}
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
// If we saved a vectorized copy of V, use it.
@@ -631,13 +659,29 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
...
*/
+ OldInduction = Legal->getInduction();
+ assert(OldInduction && "We must have a single phi node.");
+ Type *IdxTy = OldInduction->getType();
+
+ // Find the loop boundaries.
+ const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+ assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+ // Get the total trip count from the count by adding 1.
+ ExitCount = SE->getAddExpr(ExitCount,
+ SE->getConstant(ExitCount->getType(), 1));
+ // We may need to extend the index in case there is a type mismatch.
+ // We know that the count starts at zero and does not overflow.
+ // We are using Zext because it should be less expensive.
+ if (ExitCount->getType() != IdxTy)
+ ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
// This is the original scalar-loop preheader.
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(ExitBlock && "Must have an exit block");
// The loop index does not have to start at Zero. It starts with this value.
- OldInduction = Legal->getInduction();
Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
@@ -655,8 +699,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
"scalar.preheader");
// Find the induction variable.
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
- assert(OldInduction && "We must have a single phi node.");
- Type *IdxTy = OldInduction->getType();
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
// inside the loop.
@@ -666,25 +708,11 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
Induction = Builder.CreatePHI(IdxTy, 2, "index");
Constant *Step = ConstantInt::get(IdxTy, VF);
- // Find the loop boundaries.
- const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
- assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
-
- // Get the total trip count from the count by adding 1.
- ExitCount = SE->getAddExpr(ExitCount,
- SE->getConstant(ExitCount->getType(), 1));
-
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
SCEVExpander Exp(*SE, "induction");
Instruction *Loc = BypassBlock->getTerminator();
- // We may need to extend the index in case there is a type mismatch.
- // We know that the count starts at zero and does not overflow.
- // We are using Zext because it should be less expensive.
- if (ExitCount->getType() != Induction->getType())
- ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
-
// Count holds the overall loop count (N).
Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
@@ -704,15 +732,85 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
IdxEndRoundDown,
StartIdx,
"cmp.zero", Loc);
+
+ LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+ Legal->getRuntimePointerCheck();
+ Value *MemoryRuntimeCheck = 0;
+ if (PtrRtCheck->Need) {
+ unsigned NumPointers = PtrRtCheck->Pointers.size();
+ SmallVector<Value* , 2> Starts;
+ SmallVector<Value* , 2> Ends;
+
+ // Use this type for pointer arithmetic.
+ Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+ for (unsigned i=0; i < NumPointers; ++i) {
+ Value *Ptr = PtrRtCheck->Pointers[i];
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ if (SE->isLoopInvariant(Sc, OrigLoop)) {
+ DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+ *Ptr <<"\n");
+ Starts.push_back(Ptr);
+ Ends.push_back(Ptr);
+ } else {
+ DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+ Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
+ const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+ const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+ assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
+ Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
+ Starts.push_back(Start);
+ Ends.push_back(End);
+ }
+ }
+
+ for (unsigned i=0; i < NumPointers; ++i) {
+ for (unsigned j=i+1; j < NumPointers; ++j) {
+ Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Starts[0], Ends[1], "bound0", Loc);
+ Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Starts[1], Ends[0], "bound1", Loc);
+ Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+ "found.conflict", Loc);
+ if (MemoryRuntimeCheck) {
+ MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+ MemoryRuntimeCheck,
+ IsConflict,
+ "conflict.rdx", Loc);
+ } else {
+ MemoryRuntimeCheck = IsConflict;
+ }
+ }
+ }
+ }// end of need-runtime-check code.
+
+ // If we are using memory runtime checks, include them in.
+ if (MemoryRuntimeCheck) {
+ Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+ "CntOrMem", Loc);
+ }
+
BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
// Remove the old terminator.
Loc->eraseFromParent();
+ // We are going to resume the execution of the scalar loop.
+ // This PHI decides on what number to start. If we come from the
+ // vector loop then we need to start with the end index minus the
+ // index modulo VF. If we come from a bypass edge then we need to start
+ // from the real start.
+ PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx",
+ MiddleBlock->getTerminator());
+ ResumeIndex->addIncoming(StartIdx, BypassBlock);
+ ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
- IdxEndRoundDown, "cmp.n",
+ ResumeIndex, "cmp.n",
MiddleBlock->getTerminator());
BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -732,7 +830,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Fix the scalar body iteration count.
unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
- OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown);
+ OldInduction->setIncomingValue(BlockIdx, ResumeIndex);
// Get ready to start creating new instructions into the vectorized body.
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
@@ -905,7 +1003,12 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
Value *Ptr = SI->getPointerOperand();
unsigned Alignment = SI->getAlignment();
+
+ assert(!Legal->isUniform(Ptr) &&
+ "We do not allow storing to uniform addresses");
+
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
// This store does not use GEPs.
if (!Legal->isConsecutiveGep(Gep)) {
scalarizeInstruction(Inst);
@@ -935,8 +1038,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
unsigned Alignment = LI->getAlignment();
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
- // We don't have a gep. Scalarize the load.
- if (!Legal->isConsecutiveGep(Gep)) {
+ // If we don't have a gep, or that the pointer is loop invariant,
+ // scalarize the load.
+ if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
scalarizeInstruction(Inst);
break;
}
@@ -1146,12 +1250,6 @@ bool LoopVectorizationLegality::canVectorize() {
BasicBlock *BB = TheLoop->getHeader();
DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
- // Go over each instruction and look at memory deps.
- if (!canVectorizeBlock(*BB)) {
- DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
- return false;
- }
-
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
if (ExitCount == SE->getCouldNotCompute()) {
@@ -1167,7 +1265,15 @@ bool LoopVectorizationLegality::canVectorize() {
return false;
}
- DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeBlock(*BB)) {
+ DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+ (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+ <<"!\n");
// Okay! We can vectorize. At this point we don't have any other mem analysis
// which may limit our maximum vectorization factor, so just return true with
@@ -1304,6 +1410,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// Holds the Load and Store *instructions*.
ValueVector Loads;
ValueVector Stores;
+ PtrRtCheck.Pointers.clear();
+ PtrRtCheck.Need = false;
// Scan the BB and collect legal loads and stores.
for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -1361,6 +1469,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
StoreInst *ST = dyn_cast<StoreInst>(*I);
assert(ST && "Bad StoreInst");
Value* Ptr = ST->getPointerOperand();
+
+ if (isUniform(Ptr)) {
+ DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
// If we did *not* see this pointer before, insert it to
// the read-write list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr))
@@ -1390,6 +1504,39 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
return true;
}
+ // Find pointers with computable bounds. We are going to use this information
+ // to place a runtime bound check.
+ bool RT = true;
+ for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.Pointers.push_back(*I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+ for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.Pointers.push_back(*I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+
+ // Check that we did not collect too many pointers or found a
+ // unsizeable pointer.
+ if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+ PtrRtCheck.Pointers.clear();
+ RT = false;
+ }
+
+ PtrRtCheck.Need = RT;
+
+ if (RT) {
+ DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+ }
+
// Now that the pointers are in two lists (Reads and ReadWrites), we
// can check that there are no conflicts between each of the writes and
// between the writes to the reads.
@@ -1404,12 +1551,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
it != e; ++it) {
if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
- return false;
+ return RT;
}
if (!WriteObjects.insert(*it)) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
<< **it <<"\n");
- return false;
+ return RT;
}
}
TempObjects.clear();
@@ -1422,18 +1569,21 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
it != e; ++it) {
if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
- return false;
+ return RT;
}
if (WriteObjects.count(*it)) {
DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
<< **it <<"\n");
- return false;
+ return RT;
}
}
TempObjects.clear();
}
- // All is okay.
+ // It is safe to vectorize and we don't need any runtime checks.
+ DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+ PtrRtCheck.Pointers.clear();
+ PtrRtCheck.Need = false;
return true;
}
@@ -1556,6 +1706,15 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
return true;
}
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+ const SCEV *PhiScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+ if (!AR)
+ return false;
+
+ return AR->isAffine();
+}
+
unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
if (!VTTI) {