aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/ARM/ARMISelLowering.cpp
diff options
context:
space:
mode:
authorDerek Schuff <dschuff@chromium.org>2013-01-09 16:55:43 -0800
committerDerek Schuff <dschuff@chromium.org>2013-01-11 13:47:37 -0800
commitb770d0e0636a4b5ad61b1ca661caee67576c05fc (patch)
treec486ce032d41f97313c50629bd5b879f53e6ccbf /lib/Target/ARM/ARMISelLowering.cpp
parentb835840cf112a6178506d834b58aa625f59a8994 (diff)
parent1ad9253c9d34ccbce3e7e4ea5d87c266cbf93410 (diff)
Merge commit '1ad9253c9d34ccbce3e7e4ea5d87c266cbf93410'
deplib features commented out due to removal upstream; will add back as a localmod Conflicts: include/llvm/ADT/Triple.h include/llvm/MC/MCAssembler.h include/llvm/Target/TargetFrameLowering.h lib/CodeGen/AsmPrinter/DwarfDebug.cpp lib/CodeGen/AsmPrinter/DwarfDebug.h lib/CodeGen/BranchFolding.cpp lib/LLVMBuild.txt lib/Linker/LinkArchives.cpp lib/MC/MCAssembler.cpp lib/MC/MCELFStreamer.cpp lib/Makefile lib/Target/ARM/ARMExpandPseudoInsts.cpp lib/Target/ARM/ARMFrameLowering.cpp lib/Target/ARM/ARMISelLowering.cpp lib/Target/ARM/ARMSubtarget.h lib/Target/ARM/ARMTargetObjectFile.cpp lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp lib/Target/Mips/MipsInstrFPU.td lib/Target/Mips/MipsInstrInfo.td lib/Target/X86/X86CodeEmitter.cpp lib/Target/X86/X86Subtarget.h lib/VMCore/Module.cpp test/MC/MachO/ARM/nop-armv4-padding.s tools/Makefile tools/llc/llc.cpp tools/lto/LTOModule.cpp tools/lto/lto.cpp
Diffstat (limited to 'lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp403
1 files changed, 357 insertions, 46 deletions
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e1f5aa3fed..45d52b62ac 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -23,14 +23,9 @@
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalValue.h"
-#include "llvm/Instruction.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Type.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -40,14 +35,19 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Type.h"
// @LOCALMOD-START
namespace llvm {
@@ -556,6 +556,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
+ // NEON does not have single instruction CTPOP for vectors with element
+ // types wider than 8-bits. However, custom lowering can leverage the
+ // v8i8/v16i8 vcnt instruction.
+ setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
@@ -724,7 +732,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
// Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
setInsertFencesForAtomic(true);
@@ -862,9 +874,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setSchedulingPreference(Sched::Hybrid);
//// temporary - rewrite interface to use type
- maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
- maxStoresPerMemset = 16;
+ maxStoresPerMemset = 8;
maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+ maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+ maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+ maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
// On ARM arguments smaller than 4 bytes are extended, so all arguments
// are at least 4 bytes aligned.
@@ -1950,6 +1965,17 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
return true;
}
+bool
+ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
+ isVarArg));
+}
+
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
@@ -3777,6 +3803,114 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
+/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
+/// for each 16-bit element from operand, repeated. The basic idea is to
+/// leverage vcnt to get the 8-bit counts, gather and add the results.
+///
+/// Trace for v4i16:
+/// input = [v0 v1 v2 v3 ] (vi 16-bit element)
+/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
+/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
+/// [b0 b1 b2 b3 b4 b5 b6 b7]
+/// +[b1 b0 b3 b2 b5 b4 b7 b6]
+/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
+/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits)
+static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ DebugLoc DL = N->getDebugLoc();
+
+ EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+ SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
+ SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
+ SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
+ return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
+}
+
+/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
+/// bit-count for each 16-bit element from the operand. We need slightly
+/// different sequencing for v4i16 and v8i16 to stay within NEON's available
+/// 64/128-bit registers.
+///
+/// Trace for v4i16:
+/// input = [v0 v1 v2 v3 ] (vi 16-bit element)
+/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
+/// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ]
+/// v4i16:Extracted = [k0 k1 k2 k3 ]
+static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ DebugLoc DL = N->getDebugLoc();
+
+ SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
+ if (VT.is64BitVector()) {
+ SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
+ DAG.getIntPtrConstant(0));
+ } else {
+ SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
+ BitCounts, DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
+ }
+}
+
+/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
+/// bit-count for each 32-bit element from the operand. The idea here is
+/// to split the vector into 16-bit elements, leverage the 16-bit count
+/// routine, and then combine the results.
+///
+/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
+/// input = [v0 v1 ] (vi: 32-bit elements)
+/// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
+/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
+/// vrev: N0 = [k1 k0 k3 k2 ]
+/// [k0 k1 k2 k3 ]
+/// N1 =+[k1 k0 k3 k2 ]
+/// [k0 k2 k1 k3 ]
+/// N2 =+[k1 k3 k0 k2 ]
+/// [k0 k2 k1 k3 ]
+/// Extended =+[k1 k3 k0 k2 ]
+/// [k0 k2 ]
+/// Extracted=+[k1 k3 ]
+///
+static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ DebugLoc DL = N->getDebugLoc();
+
+ EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+
+ SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
+ SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
+ SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
+ SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
+ SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+
+ if (VT.is64BitVector()) {
+ SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
+ DAG.getIntPtrConstant(0));
+ } else {
+ SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
+ DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+ }
+}
+
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = N->getValueType(0);
+
+ assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+ assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
+ VT == MVT::v4i16 || VT == MVT::v8i16) &&
+ "Unexpected type for custom ctpop lowering");
+
+ if (VT.getVectorElementType() == MVT::i32)
+ return lowerCTPOP32BitElements(N, DAG);
+ else
+ return lowerCTPOP16BitElements(N, DAG);
+}
+
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
@@ -5162,16 +5296,76 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
return false;
}
-/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending
-/// load, or BUILD_VECTOR with extended elements, return the unextended value.
-static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
+/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
+/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
+/// We insert the required extension here to get the vector to fill a D register.
+static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
+ const EVT &OrigTy,
+ const EVT &ExtTy,
+ unsigned ExtOpcode) {
+ // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+ // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+ // 64-bits we need to insert a new extension so that it will be 64-bits.
+ assert(ExtTy.is128BitVector() && "Unexpected extension size");
+ if (OrigTy.getSizeInBits() >= 64)
+ return N;
+
+ // Must extend size to at least 64 bits to be used as an operand for VMULL.
+ MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy;
+ EVT NewVT;
+ switch (OrigSimpleTy) {
+ default: llvm_unreachable("Unexpected Orig Vector Type");
+ case MVT::v2i8:
+ case MVT::v2i16:
+ NewVT = MVT::v2i32;
+ break;
+ case MVT::v4i8:
+ NewVT = MVT::v4i16;
+ break;
+ }
+ return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N);
+}
+
+/// SkipLoadExtensionForVMULL - return a load of the original vector size that
+/// does not do any sign/zero extension. If the original vector is less
+/// than 64 bits, an appropriate extension will be added after the load to
+/// reach a total size of 64 bits. We have to add the extension separately
+/// because ARM does not have a sign/zero extending load for vectors.
+static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+ SDValue NonExtendingLoad =
+ DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(),
+ LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
+ LD->isNonTemporal(), LD->isInvariant(),
+ LD->getAlignment());
+ unsigned ExtOp = 0;
+ switch (LD->getExtensionType()) {
+ default: llvm_unreachable("Unexpected LoadExtType");
+ case ISD::EXTLOAD:
+ case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break;
+ case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break;
+ }
+ MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy;
+ MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy;
+ return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG,
+ MemType, ExtType, ExtOp);
+}
+
+/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+/// extending load, or BUILD_VECTOR with extended elements, return the
+/// unextended value. The unextended vector should be 64 bits so that it can
+/// be used as an operand to a VMULL instruction. If the original vector size
+/// before extension is less than 64 bits we add a an extension to resize
+/// the vector to 64 bits.
+static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
- return N->getOperand(0);
+ return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
+ N->getOperand(0)->getValueType(0),
+ N->getValueType(0),
+ N->getOpcode());
+
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
- return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
- LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- LD->getAlignment());
+ return SkipLoadExtensionForVMULL(LD, DAG);
+
// Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
// have been legalized as a BITCAST from v4i32.
if (N->getOpcode() == ISD::BITCAST) {
@@ -5226,7 +5420,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
EVT VT = Op.getValueType();
- assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
+ assert(VT.is128BitVector() && VT.isInteger() &&
+ "unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
SDNode *N1 = Op.getOperand(1).getNode();
unsigned NewOpc = 0;
@@ -5269,9 +5464,9 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Legalize to a VMULL instruction.
DebugLoc DL = Op.getDebugLoc();
SDValue Op0;
- SDValue Op1 = SkipExtension(N1, DAG);
+ SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
if (!isMLA) {
- Op0 = SkipExtension(N0, DAG);
+ Op0 = SkipExtensionForVMULL(N0, DAG);
assert(Op0.getValueType().is64BitVector() &&
Op1.getValueType().is64BitVector() &&
"unexpected types for extended operands to VMULL");
@@ -5286,8 +5481,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// vaddl q0, d4, d5
// vmovl q1, d6
// vmul q0, q0, q1
- SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
- SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
+ SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
+ SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
EVT Op1VT = Op1.getValueType();
return DAG.getNode(N0->getOpcode(), DL, VT,
DAG.getNode(NewOpc, DL, VT,
@@ -5617,6 +5812,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
@@ -5682,6 +5878,18 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_CMP_SWAP:
ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
return;
+ case ISD::ATOMIC_LOAD_MIN:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_UMIN:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_MAX:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_UMAX:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
+ return;
}
if (Res.getNode())
Results.push_back(Res);
@@ -6021,7 +6229,8 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
MachineBasicBlock *
ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
unsigned Op1, unsigned Op2,
- bool NeedsCarry, bool IsCmpxchg) const {
+ bool NeedsCarry, bool IsCmpxchg,
+ bool IsMinMax, ARMCC::CondCodes CC) const {
// This also handles ATOMIC_SWAP, indicated by Op1==0.
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -6050,16 +6259,15 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *contBB = 0, *cont2BB = 0;
- if (IsCmpxchg) {
+ if (IsCmpxchg || IsMinMax)
contBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ if (IsCmpxchg)
cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
- }
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
MF->insert(It, loopMBB);
- if (IsCmpxchg) {
- MF->insert(It, contBB);
- MF->insert(It, cont2BB);
- }
+ if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
+ if (IsCmpxchg) MF->insert(It, cont2BB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -6097,6 +6305,22 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
// Load
unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
unsigned GPRPair1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ unsigned GPRPair2;
+ if (IsMinMax) {
+ //We need an extra double register for doing min/max.
+ unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ GPRPair2 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
+ .addReg(undef)
+ .addReg(vallo)
+ .addImm(ARM::gsub_0);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair2)
+ .addReg(r1)
+ .addReg(valhi)
+ .addImm(ARM::gsub_1);
+ }
AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
.addReg(GPRPair0, RegState::Define).addReg(ptr));
@@ -6142,7 +6366,8 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
.addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
unsigned tmpRegHi = MRI.createVirtualRegister(TRC);
AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi)
- .addReg(desthi).addReg(valhi)).addReg(0);
+ .addReg(desthi).addReg(valhi))
+ .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax));
unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
@@ -6169,10 +6394,20 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
.addReg(valhi)
.addImm(ARM::gsub_1);
}
+ unsigned GPRPairStore = GPRPair1;
+ if (IsMinMax) {
+ // Compare and branch to exit block.
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+ .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR);
+ BB->addSuccessor(exitMBB);
+ BB->addSuccessor(contBB);
+ BB = contBB;
+ GPRPairStore = GPRPair2;
+ }
// Store
AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
- .addReg(GPRPair1).addReg(ptr));
+ .addReg(GPRPairStore).addReg(ptr));
// Cmp+jump
AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(storesuccess).addImm(0));
@@ -7174,6 +7409,26 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
/*NeedsCarry*/ false, /*IsCmpxchg*/true);
+ case ARM::ATOMMIN6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+ /*IsMinMax*/ true, ARMCC::LE);
+ case ARM::ATOMMAX6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+ /*IsMinMax*/ true, ARMCC::GE);
+ case ARM::ATOMUMIN6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+ /*IsMinMax*/ true, ARMCC::LS);
+ case ARM::ATOMUMAX6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+ /*IsMinMax*/ true, ARMCC::HS);
case ARM::tMOVCCr_pseudo: {
// To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -9445,7 +9700,7 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}
-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
// The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
@@ -9454,15 +9709,27 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
return false;
case MVT::i8:
case MVT::i16:
- case MVT::i32:
+ case MVT::i32: {
// Unaligned access can use (for example) LRDB, LRDH, LDR
- return AllowsUnaligned;
+ if (AllowsUnaligned) {
+ if (Fast)
+ *Fast = Subtarget->hasV7Ops();
+ return true;
+ }
+ return false;
+ }
case MVT::f64:
- case MVT::v2f64:
+ case MVT::v2f64: {
// For any little-endian targets with neon, we can support unaligned ld/st
// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
// A big-endian target may also explictly support unaligned accesses
- return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian());
+ if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+ return false;
+ }
}
}
@@ -9481,12 +9748,17 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
// See if we can use NEON instructions for this...
if (IsZeroVal &&
- !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat) &&
- Subtarget->hasNEON()) {
- if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) {
- return MVT::v4i32;
- } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) {
- return MVT::v2i32;
+ Subtarget->hasNEON() &&
+ !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
+ bool Fast;
+ if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) ||
+ (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) &&
+ Fast))) {
+ return MVT::v2f64;
+ } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) ||
+ (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) &&
+ Fast))) {
+ return MVT::f64;
}
}
@@ -9501,6 +9773,27 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
return MVT::Other;
}
+bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ EVT VT1 = Val.getValueType();
+ if (!VT1.isSimple() || !VT1.isInteger() ||
+ !VT2.isSimple() || !VT2.isInteger())
+ return false;
+
+ switch (VT1.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
+ return true;
+ }
+
+ return false;
+}
+
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
if (V < 0)
return false;
@@ -10277,6 +10570,24 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
return false;
}
+bool ARMTargetLowering::isIntImmLegal(const APInt &Imm, EVT VT) const {
+ if (VT.getSizeInBits() > 32)
+ return false;
+
+ int32_t ImmVal = Imm.getSExtValue();
+ if (!Subtarget->isThumb()) {
+ return (ImmVal >= 0 && ImmVal < 65536) ||
+ (ARM_AM::getSOImmVal(ImmVal) != -1) ||
+ (ARM_AM::getSOImmVal(~ImmVal) != -1);
+ } else if (Subtarget->isThumb2()) {
+ return (ImmVal >= 0 && ImmVal < 65536) ||
+ (ARM_AM::getT2SOImmVal(ImmVal) != -1) ||
+ (ARM_AM::getT2SOImmVal(~ImmVal) != -1);
+ } else /*Thumb1*/ {
+ return (ImmVal >= 0 && ImmVal < 256);
+ }
+}
+
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.