author     Derek Schuff <dschuff@chromium.org>   2012-09-25 17:30:25 -0700
committer  Derek Schuff <dschuff@chromium.org>   2012-09-25 18:01:23 -0700
commit     a27c28b1427dc2082ab2b31efdbb25f9fde31b61 (patch)
tree       6f3ff025f542ca3f66a1a01cbf239aeef7784511 /lib/Target/ARM
parent     0e15ffd8cb1ec642eddb96380660914ff2b007e1 (diff)
parent     bc4021f31eaa97ee52655828da3e3de14a39e4a6 (diff)
Merge commit 'bc4021f31eaa97ee52655828da3e3de14a39e4a6'
Conflicts:
lib/MC/MCAssembler.cpp
lib/Target/ARM/ARMISelDAGToDAG.cpp
lib/Target/Mips/MipsInstrFPU.td
lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
lib/Target/X86/X86ISelLowering.h
Diffstat (limited to 'lib/Target/ARM')
23 files changed, 900 insertions, 141 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 06ed89395f..38509a3400 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -139,6 +139,11 @@ def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", [FeatureVMLxForwarding, FeatureT2XtPk, FeatureFP16, FeatureAvoidPartialCPSR]>; +// FIXME: It has not been determined if A15 has these features. +def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", + "Cortex-A15 ARM processors", + [FeatureT2XtPk, FeatureFP16, + FeatureAvoidPartialCPSR]>; class ProcNoItin<string Name, list<SubtargetFeature> Features> : Processor<Name, NoItineraries, Features>; @@ -218,6 +223,10 @@ def : ProcessorModel<"cortex-a9-mp", CortexA9Model, [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureMP, FeatureHasRAS]>; +// FIXME: A15 has currently the same ProcessorModel as A9. +def : ProcessorModel<"cortex-a15", CortexA9Model, + [ProcA15, HasV7Ops, FeatureNEON, FeatureDB, + FeatureDSPThumb2, FeatureHasRAS]>; // V7M Processors. def : ProcNoItin<"cortex-m3", [HasV7Ops, diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e1a2d3649a..c08294918e 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2344,6 +2344,37 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, return true; } +// Return the number of 32-bit words loaded by LDM or stored by STM. If this +// can't be easily determined return 0 (missing MachineMemOperand). +// +// FIXME: The current MachineInstr design does not support relying on machine +// mem operands to determine the width of a memory access. Instead, we expect +// the target to provide this information based on the instruction opcode and +// operands. However, using MachineMemOperand is a the best solution now for +// two reasons: +// +// 1) getNumMicroOps tries to infer LDM memory width from the total number of MI +// operands. This is much more dangerous than using the MachineMemOperand +// sizes because CodeGen passes can insert/remove optional machine operands. In +// fact, it's totally incorrect for preRA passes and appears to be wrong for +// postRA passes as well. +// +// 2) getNumLDMAddresses is only used by the scheduling machine model and any +// machine model that calls this should handle the unknown (zero size) case. +// +// Long term, we should require a target hook that verifies MachineMemOperand +// sizes during MC lowering. That target hook should be local to MC lowering +// because we can't ensure that it is aware of other MI forms. Doing this will +// ensure that MachineMemOperands are correctly propagated through all passes. +unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const { + unsigned Size = 0; + for (MachineInstr::mmo_iterator I = MI->memoperands_begin(), + E = MI->memoperands_end(); I != E; ++I) { + Size += (*I)->getSize(); + } + return Size / 4; +} + unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, const MachineInstr *MI) const { @@ -2432,7 +2463,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, if (NumRegs % 2) ++A8UOps; return A8UOps; - } else if (Subtarget.isCortexA9()) { + } else if (Subtarget.isLikeA9()) { int A9UOps = (NumRegs / 2); // If there are odd number of registers or if it's not 64-bit aligned, // then it takes an extra AGU (Address Generation Unit) cycle. 
@@ -2465,7 +2496,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, DefCycle = RegNo / 2 + 1; if (RegNo % 2) ++DefCycle; - } else if (Subtarget.isCortexA9()) { + } else if (Subtarget.isLikeA9()) { DefCycle = RegNo; bool isSLoad = false; @@ -2509,7 +2540,7 @@ ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData, DefCycle = 1; // Result latency is issue cycle + 2: E2. DefCycle += 2; - } else if (Subtarget.isCortexA9()) { + } else if (Subtarget.isLikeA9()) { DefCycle = (RegNo / 2); // If there are odd number of registers or if it's not 64-bit aligned, // then it takes an extra AGU (Address Generation Unit) cycle. @@ -2540,7 +2571,7 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData, UseCycle = RegNo / 2 + 1; if (RegNo % 2) ++UseCycle; - } else if (Subtarget.isCortexA9()) { + } else if (Subtarget.isLikeA9()) { UseCycle = RegNo; bool isSStore = false; @@ -2581,7 +2612,7 @@ ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData, UseCycle = 2; // Read in E3. UseCycle += 2; - } else if (Subtarget.isCortexA9()) { + } else if (Subtarget.isLikeA9()) { UseCycle = (RegNo / 2); // If there are odd number of registers or if it's not 64-bit aligned, // then it takes an extra AGU (Address Generation Unit) cycle. @@ -2766,7 +2797,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget, const MachineInstr *DefMI, const MCInstrDesc *DefMCID, unsigned DefAlign) { int Adjust = 0; - if (Subtarget.isCortexA8() || Subtarget.isCortexA9()) { + if (Subtarget.isCortexA8() || Subtarget.isLikeA9()) { // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] // variants are one cycle cheaper. switch (DefMCID->getOpcode()) { @@ -2793,7 +2824,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget, } } - if (DefAlign < 8 && Subtarget.isCortexA9()) { + if (DefAlign < 8 && Subtarget.isLikeA9()) { switch (DefMCID->getOpcode()) { default: break; case ARM::VLD1q8: @@ -2951,7 +2982,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, if (Reg == ARM::CPSR) { if (DefMI->getOpcode() == ARM::FMSTAT) { // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?) - return Subtarget.isCortexA9() ? 1 : 20; + return Subtarget.isLikeA9() ? 1 : 20; } // CPSR set and branch can be paired in the same cycle. @@ -3017,7 +3048,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, if (!UseNode->isMachineOpcode()) { int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx); - if (Subtarget.isCortexA9()) + if (Subtarget.isLikeA9()) return Latency <= 2 ? 1 : Latency - 1; else return Latency <= 3 ? 1 : Latency - 2; @@ -3034,7 +3065,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, UseMCID, UseIdx, UseAlign); if (Latency > 1 && - (Subtarget.isCortexA8() || Subtarget.isCortexA9())) { + (Subtarget.isCortexA8() || Subtarget.isLikeA9())) { // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2] // variants are one cycle cheaper. 
switch (DefMCID.getOpcode()) { @@ -3063,7 +3094,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } } - if (DefAlign < 8 && Subtarget.isCortexA9()) + if (DefAlign < 8 && Subtarget.isLikeA9()) switch (DefMCID.getOpcode()) { default: break; case ARM::VLD1q8: @@ -3356,9 +3387,9 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI)) return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON)); - // Cortex-A9 is particularly picky about mixing the two and wants these + // A9-like cores are particularly picky about mixing the two and want these // converted. - if (Subtarget.isCortexA9() && !isPredicated(MI) && + if (Subtarget.isLikeA9() && !isPredicated(MI) && (MI->getOpcode() == ARM::VMOVRS || MI->getOpcode() == ARM::VMOVSR || MI->getOpcode() == ARM::VMOVS)) @@ -3396,6 +3427,48 @@ static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI, return DReg; } +/// getImplicitSPRUseForDPRUse - Given a use of a DPR register and lane, +/// set ImplicitSReg to a register number that must be marked as implicit-use or +/// zero if no register needs to be defined as implicit-use. +/// +/// If the function cannot determine if an SPR should be marked implicit use or +/// not, it returns false. +/// +/// This function handles cases where an instruction is being modified from taking +/// an SPR to a DPR[Lane]. A use of the DPR is being added, which may conflict +/// with an earlier def of an SPR corresponding to DPR[Lane^1] (i.e. the other +/// lane of the DPR). +/// +/// If the other SPR is defined, an implicit-use of it should be added. Else, +/// (including the case where the DPR itself is defined), it should not. +/// +static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI, + MachineInstr *MI, + unsigned DReg, unsigned Lane, + unsigned &ImplicitSReg) { + // If the DPR is defined or used already, the other SPR lane will be chained + // correctly, so there is nothing to be done. + if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) { + ImplicitSReg = 0; + return true; + } + + // Otherwise we need to go searching to see if the SPR is set explicitly. + ImplicitSReg = TRI->getSubReg(DReg, + (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1); + MachineBasicBlock::LivenessQueryResult LQR = + MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI); + + if (LQR == MachineBasicBlock::LQR_Live) + return true; + else if (LQR == MachineBasicBlock::LQR_Unknown) + return false; + + // If the register is known not to be live, there is no need to add an + // implicit-use. + ImplicitSReg = 0; + return true; +} void ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { @@ -3453,7 +3526,7 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // was dead before here. MIB.addReg(SrcReg, RegState::Implicit); break; - case ARM::VMOVSR: + case ARM::VMOVSR: { if (Domain != ExeNEON) break; assert(!isPredicated(MI) && "Cannot predicate a VSETLN"); @@ -3464,12 +3537,9 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane); - // If we insert both a novel <def> and an <undef> on the DReg, we break - // any existing dependency chain on the unused lane. Either already being - // present means this instruction is in that chain anyway so we can make - // the transformation. 
- if (!MI->definesRegister(DReg, TRI) && !MI->readsRegister(DReg, TRI)) - break; + unsigned ImplicitSReg; + if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg)) + break; for (unsigned i = MI->getDesc().getNumOperands(); i; --i) MI->RemoveOperand(i-1); @@ -3486,7 +3556,10 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // The narrower destination must be marked as set to keep previous chains // in place. MIB.addReg(DstReg, RegState::Define | RegState::Implicit); + if (ImplicitSReg != 0) + MIB.addReg(ImplicitSReg, RegState::Implicit); break; + } case ARM::VMOVS: { if (Domain != ExeNEON) break; @@ -3499,12 +3572,9 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane); DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane); - // If we insert both a novel <def> and an <undef> on the DReg, we break - // any existing dependency chain on the unused lane. Either already being - // present means this instruction is in that chain anyway so we can make - // the transformation. - if (!MI->definesRegister(DDst, TRI) && !MI->readsRegister(DDst, TRI)) - break; + unsigned ImplicitSReg; + if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg)) + break; for (unsigned i = MI->getDesc().getNumOperands(); i; --i) MI->RemoveOperand(i-1); @@ -3522,6 +3592,8 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // more, so add them in manually. MIB.addReg(DstReg, RegState::Implicit | RegState::Define); MIB.addReg(SrcReg, RegState::Implicit); + if (ImplicitSReg != 0) + MIB.addReg(ImplicitSReg, RegState::Implicit); break; } @@ -3580,6 +3652,8 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // As before, the original destination is no longer represented, add it // implicitly. MIB.addReg(DstReg, RegState::Define | RegState::Implicit); + if (ImplicitSReg != 0) + MIB.addReg(ImplicitSReg, RegState::Implicit); break; } } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 92e5ee8dcb..304ccc087c 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -235,6 +235,9 @@ public: getExecutionDomain(const MachineInstr *MI) const; void setExecutionDomain(MachineInstr *MI, unsigned Domain) const; + /// Get the number of addresses by LDM or VLDM or zero for unknown. + unsigned getNumLDMAddresses(const MachineInstr *MI) const; + private: unsigned getInstBundleLength(const MachineInstr *MI) const; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index b2ba9703b6..277dd57ef2 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -480,7 +480,7 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, bool ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { // CortexA9 has a Write-after-write hazard for NEON registers. 
- if (!STI.isCortexA9()) + if (!STI.isLikeA9()) return false; switch (RC->getID()) { diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index c292821e79..56cfcface4 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -115,9 +115,9 @@ namespace { bool IsLoad; bool isUpdating; bool hasWritebackOperand; - NEONRegSpacing RegSpacing; - unsigned char NumRegs; // D registers loaded or stored - unsigned char RegElts; // elements per D register; used for lane ops + uint8_t RegSpacing; // One of type NEONRegSpacing + uint8_t NumRegs; // D registers loaded or stored + uint8_t RegElts; // elements per D register; used for lane ops // FIXME: Temporary flag to denote whether the real instruction takes // a single register (like the encoding) or all of the registers in // the list (like the asm syntax and the isel DAG). When all definitions @@ -389,7 +389,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed"); - NEONRegSpacing RegSpc = TableEntry->RegSpacing; + NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; unsigned NumRegs = TableEntry->NumRegs; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), @@ -454,7 +454,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed"); - NEONRegSpacing RegSpc = TableEntry->RegSpacing; + NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; unsigned NumRegs = TableEntry->NumRegs; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), @@ -538,7 +538,7 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); assert(TableEntry && "NEONLdStTable lookup failed"); - NEONRegSpacing RegSpc = TableEntry->RegSpacing; + NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing; unsigned NumRegs = TableEntry->NumRegs; unsigned RegElts = TableEntry->RegElts; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 873404effd..d6ef3f333b 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -1026,6 +1026,9 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, RC = &ARM::GPRRegClass; break; case MVT::i16: + if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem()) + return false; + if (isThumb2) { if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops()) Opc = isZExt ? 
ARM::t2LDRHi8 : ARM::t2LDRSHi8; @@ -1038,6 +1041,9 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, RC = &ARM::GPRRegClass; break; case MVT::i32: + if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem()) + return false; + if (isThumb2) { if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops()) Opc = ARM::t2LDRi8; @@ -1144,6 +1150,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, } break; case MVT::i16: + if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem()) + return false; + if (isThumb2) { if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops()) StrOpc = ARM::t2STRHi8; @@ -1155,6 +1164,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, } break; case MVT::i32: + if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem()) + return false; + if (isThumb2) { if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops()) StrOpc = ARM::t2STRi8; diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp index a5fd15b6bb..1240169e84 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -47,7 +47,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { // Skip over one non-VFP / NEON instruction. if (!LastMI->isBarrier() && // On A9, AGU and NEON/FPU are muxed. - !(STI.isCortexA9() && (LastMI->mayLoad() || LastMI->mayStore())) && + !(STI.isLikeA9() && (LastMI->mayLoad() || LastMI->mayStore())) && (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { MachineBasicBlock::iterator I = LastMI; if (I != LastMI->getParent()->begin()) { diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 3042b07920..a44e2a220a 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -317,7 +317,7 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { } /// \brief Check whether a particular node is a constant value representable as -/// (N * Scale) where (N in [\arg RangeMin, \arg RangeMax). +/// (N * Scale) where (N in [\p RangeMin, \p RangeMax). /// /// \param ScaledConstant [out] - On success, the pre-scaled constant value. static bool isScaledConstantInRange(SDValue Node, int Scale, @@ -347,8 +347,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { if (!CheckVMLxHazard) return true; - - if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9()) + if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9()) return true; if (!N->hasOneUse()) @@ -386,7 +385,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt) { - if (!Subtarget->isCortexA9()) + if (!Subtarget->isLikeA9()) return true; if (Shift.hasOneUse()) return true; @@ -519,7 +518,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, return false; // @LOCALMOD-END if (N.getOpcode() == ISD::MUL && - (!Subtarget->isCortexA9() || N.hasOneUse())) { + (!Subtarget->isLikeA9() || N.hasOneUse())) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { // X * [3,5,9] -> X + X * [2,4,8] etc. int RHSC = (int)RHS->getZExtValue(); @@ -583,7 +582,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, // Try matching (R shl C) + (R). 
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && - !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) { + !(Subtarget->isLikeA9() || N.getOperand(0).hasOneUse())) { ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode()); if (ShOpcVal != ARM_AM::no_shift) { // Check to see if the RHS of the shift is a constant, if not, we can't @@ -631,7 +630,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op, // @LOCALMOD-END if (N.getOpcode() == ISD::MUL && - (!Subtarget->isCortexA9() || N.hasOneUse())) { + (!Subtarget->isLikeA9() || N.hasOneUse())) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { // X * [3,5,9] -> X + X * [2,4,8] etc. int RHSC = (int)RHS->getZExtValue(); @@ -698,7 +697,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op, } } - if (Subtarget->isCortexA9() && !N.hasOneUse()) { + if (Subtarget->isLikeA9() && !N.hasOneUse()) { // Compute R +/- (R << N) and reuse it. Base = N; Offset = CurDAG->getRegister(0, MVT::i32); @@ -754,7 +753,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op, // Try matching (R shl C) + (R). if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift && - !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) { + !(Subtarget->isLikeA9() || N.getOperand(0).hasOneUse())) { ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode()); if (ShOpcVal != ARM_AM::no_shift) { // Check to see if the RHS of the shift is a constant, if not, we can't diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index bc15dcf4fc..2e7588b29f 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -860,7 +860,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) benefitFromCodePlacementOpt = true; // Prefer likely predicted branches to selects on out-of-order cores. - predictableSelectIsExpensive = Subtarget->isCortexA9(); + predictableSelectIsExpensive = Subtarget->isLikeA9(); setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } @@ -9308,8 +9308,8 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, } bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { - if (!Subtarget->allowsUnalignedMem()) - return false; + // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus + bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); switch (VT.getSimpleVT().SimpleTy) { default: @@ -9317,10 +9317,14 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { case MVT::i8: case MVT::i16: case MVT::i32: - return true; + // Unaligned access can use (for example) LRDB, LRDH, LDR + return AllowsUnaligned; case MVT::f64: - return Subtarget->hasNEON(); - // FIXME: VLD1 etc with standard alignment is legal. + case MVT::v2f64: + // For any little-endian targets with neon, we can support unaligned ld/st + // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. 
+ // A big-endian target may also explictly support unaligned accesses + return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian()); } } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 697b5b0111..2060bb9374 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -3239,9 +3239,11 @@ def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm), (SUBSri GPR:$src, so_imm_neg:$imm)>; def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm), - (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>; + (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>, + Requires<[IsARM, HasV6T2]>; def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm), - (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>; + (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>, + Requires<[IsARM, HasV6T2]>; // The with-carry-in form matches bitwise not instead of the negation. // Effectively, the inverse interpretation of the carry flag already accounts diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 8158a11f83..1bcb48776e 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -398,6 +398,20 @@ def VecListFourQWordIndexed : Operand<i32> { let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx); } +def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 8; +}]>; +def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 8; +}]>; +def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() == 4; +}]>; +def word_alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() == 4; +}]>; def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() == 2; }]>; @@ -2273,6 +2287,25 @@ def : Pat<(f64 (non_word_alignedload addrmode6:$addr)), def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr), (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>; +// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64 +// load / store if it's legal. 
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)), + (VLD1q64 addrmode6:$addr)>; +def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), + (VST1q64 addrmode6:$addr, QPR:$value)>; +def : Pat<(v2f64 (word_alignedload addrmode6:$addr)), + (VLD1q32 addrmode6:$addr)>; +def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr), + (VST1q32 addrmode6:$addr, QPR:$value)>; +def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)), + (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>; +def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), + (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; +def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)), + (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>; +def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr), + (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; + //===----------------------------------------------------------------------===// // NEON pattern fragments //===----------------------------------------------------------------------===// @@ -4455,10 +4488,23 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", [(set DPR:$Vd, (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; +def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1), + (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))), + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1), + (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))), + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1), + (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))), + (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), (and DPR:$Vm, (vnotd DPR:$Vd)))), - (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>, + Requires<[HasNEON]>; def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), @@ -4467,9 +4513,23 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), [(set QPR:$Vd, (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; +def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1), + (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))), + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1), + (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))), + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; +def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1), + (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))), + (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; + def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), (and QPR:$Vm, (vnotq QPR:$Vd)))), - (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>, + Requires<[HasNEON]>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 7bc590f947..404634fee9 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/ |
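The most self-contained piece of the ARMBaseInstrInfo.cpp hunk above is the new getNumLDMAddresses hook: it sums the sizes of the instruction's MachineMemOperands and divides by four to get the number of 32-bit words an LDM/STM touches, returning zero when no memoperand information is attached. The standalone sketch below mirrors only that counting logic; MemOperandStub and MemOperandList are invented stand-ins for the real llvm::MachineInstr/MachineMemOperand types, not part of the patch.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for llvm::MachineMemOperand: the only field the hook needs is the
// access size in bytes. (Hypothetical type, for illustration only.)
struct MemOperandStub {
  uint64_t SizeInBytes;
};

// Stand-in for a MachineInstr's memoperand list.
using MemOperandList = std::vector<MemOperandStub>;

// Mirrors the counting done by the new ARMBaseInstrInfo::getNumLDMAddresses:
// sum the sizes of all attached memory operands and convert bytes to 32-bit
// words. An empty list yields 0, meaning "unknown"; the scheduling machine
// model that consumes this value has to tolerate that case.
unsigned getNumLDMAddresses(const MemOperandList &MemOps) {
  uint64_t Size = 0;
  for (const MemOperandStub &MMO : MemOps)
    Size += MMO.SizeInBytes;
  return static_cast<unsigned>(Size / 4);
}

int main() {
  // An LDM described by two 8-byte memory operands loads four 32-bit words.
  MemOperandList WithInfo = {{8}, {8}};
  MemOperandList NoInfo;  // memoperands dropped by some earlier pass

  std::cout << getNumLDMAddresses(WithInfo) << "\n";  // 4
  std::cout << getNumLDMAddresses(NoInfo) << "\n";    // 0 -> unknown
}
```

As the in-diff FIXME notes, this leans on memoperand sizes rather than inferring the width from the MI operand list, which is why callers must treat a zero result as "no information" rather than "zero words".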
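The ARMISelLowering.cpp hunk reworks allowsUnalignedMemoryAccesses so that i8/i16/i32 follow the subtarget's unaligned-access setting directly, while f64 and v2f64 are additionally accepted on little-endian NEON targets, which can fall back to vld1.8/vst1.8. Below is a reduced sketch of that decision table with the subtarget queries collapsed into plain booleans; SubtargetFlags and SimpleVT are invented helpers for illustration, not LLVM API.

```cpp
#include <iostream>

// Only the value types the hook distinguishes.
enum class SimpleVT { i8, i16, i32, f64, v2f64, Other };

// Subtarget properties reduced to plain booleans. In the real code these come
// from ARMSubtarget::allowsUnalignedMem() (modelling the SCTLR.A setting),
// ARMSubtarget::hasNEON(), and the target's endianness.
struct SubtargetFlags {
  bool AllowsUnaligned;  // CPU permits unaligned LDR/LDRH/LDRB accesses
  bool HasNEON;
  bool IsLittleEndian;
};

// Mirrors the reworked decision:
//  * i8/i16/i32 follow the unaligned-access flag directly;
//  * f64/v2f64 can also be handled with vld1.8/vst1.8 on little-endian NEON
//    targets even when plain unaligned accesses are disallowed.
bool allowsUnalignedMemoryAccesses(SimpleVT VT, const SubtargetFlags &ST) {
  switch (VT) {
  case SimpleVT::i8:
  case SimpleVT::i16:
  case SimpleVT::i32:
    return ST.AllowsUnaligned;
  case SimpleVT::f64:
  case SimpleVT::v2f64:
    return ST.HasNEON && (ST.AllowsUnaligned || ST.IsLittleEndian);
  default:
    return false;
  }
}

int main() {
  SubtargetFlags StrictLE{/*AllowsUnaligned=*/false, /*HasNEON=*/true,
                          /*IsLittleEndian=*/true};
  std::cout << allowsUnalignedMemoryAccesses(SimpleVT::i32, StrictLE) << "\n";   // 0
  std::cout << allowsUnalignedMemoryAccesses(SimpleVT::v2f64, StrictLE) << "\n"; // 1
}
```

This pairs with the dword/word/hword/byte-aligned load/store patterns added in ARMInstrNEON.td, which select the VLD1/VST1 variant matching the known alignment once the hook has declared the access legal.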