aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/ARM
diff options
context:
space:
mode:
authorDerek Schuff <dschuff@chromium.org>2012-09-25 17:30:25 -0700
committerDerek Schuff <dschuff@chromium.org>2012-09-25 18:01:23 -0700
commita27c28b1427dc2082ab2b31efdbb25f9fde31b61 (patch)
tree6f3ff025f542ca3f66a1a01cbf239aeef7784511 /lib/Target/ARM
parent0e15ffd8cb1ec642eddb96380660914ff2b007e1 (diff)
parentbc4021f31eaa97ee52655828da3e3de14a39e4a6 (diff)
Merge commit 'bc4021f31eaa97ee52655828da3e3de14a39e4a6'
Conflicts: lib/MC/MCAssembler.cpp lib/Target/ARM/ARMISelDAGToDAG.cpp lib/Target/Mips/MipsInstrFPU.td lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp lib/Target/X86/X86ISelLowering.h
Diffstat (limited to 'lib/Target/ARM')
-rw-r--r--lib/Target/ARM/ARM.td9
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp126
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h3
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp12
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp12
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.cpp2
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp17
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp16
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td6
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td64
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td597
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp3
-rw-r--r--lib/Target/ARM/ARMSubtarget.h4
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp2
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp59
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp5
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp78
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp8
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCExpr.h4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h2
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp8
23 files changed, 900 insertions, 141 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 06ed89395f..38509a3400 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -139,6 +139,11 @@ def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
[FeatureVMLxForwarding,
FeatureT2XtPk, FeatureFP16,
FeatureAvoidPartialCPSR]>;
+// FIXME: It has not been determined if A15 has these features.
+def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
+ "Cortex-A15 ARM processors",
+ [FeatureT2XtPk, FeatureFP16,
+ FeatureAvoidPartialCPSR]>;
class ProcNoItin<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
@@ -218,6 +223,10 @@ def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
[ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureMP,
FeatureHasRAS]>;
+// FIXME: A15 has currently the same ProcessorModel as A9.
+def : ProcessorModel<"cortex-a15", CortexA9Model,
+ [ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
+ FeatureDSPThumb2, FeatureHasRAS]>;
// V7M Processors.
def : ProcNoItin<"cortex-m3", [HasV7Ops,
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index e1a2d3649a..c08294918e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2344,6 +2344,37 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
return true;
}
+// Return the number of 32-bit words loaded by LDM or stored by STM. If this
+// can't be easily determined return 0 (missing MachineMemOperand).
+//
+// FIXME: The current MachineInstr design does not support relying on machine
+// mem operands to determine the width of a memory access. Instead, we expect
+// the target to provide this information based on the instruction opcode and
+// operands. However, using MachineMemOperand is a the best solution now for
+// two reasons:
+//
+// 1) getNumMicroOps tries to infer LDM memory width from the total number of MI
+// operands. This is much more dangerous than using the MachineMemOperand
+// sizes because CodeGen passes can insert/remove optional machine operands. In
+// fact, it's totally incorrect for preRA passes and appears to be wrong for
+// postRA passes as well.
+//
+// 2) getNumLDMAddresses is only used by the scheduling machine model and any
+// machine model that calls this should handle the unknown (zero size) case.
+//
+// Long term, we should require a target hook that verifies MachineMemOperand
+// sizes during MC lowering. That target hook should be local to MC lowering
+// because we can't ensure that it is aware of other MI forms. Doing this will
+// ensure that MachineMemOperands are correctly propagated through all passes.
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const {
+ unsigned Size = 0;
+ for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+ E = MI->memoperands_end(); I != E; ++I) {
+ Size += (*I)->getSize();
+ }
+ return Size / 4;
+}
+
unsigned
ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
const MachineInstr *MI) const {
@@ -2432,7 +2463,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
if (NumRegs % 2)
++A8UOps;
return A8UOps;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
int A9UOps = (NumRegs / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2465,7 +2496,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
DefCycle = RegNo / 2 + 1;
if (RegNo % 2)
++DefCycle;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
DefCycle = RegNo;
bool isSLoad = false;
@@ -2509,7 +2540,7 @@ ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
DefCycle = 1;
// Result latency is issue cycle + 2: E2.
DefCycle += 2;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
DefCycle = (RegNo / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2540,7 +2571,7 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
UseCycle = RegNo / 2 + 1;
if (RegNo % 2)
++UseCycle;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
UseCycle = RegNo;
bool isSStore = false;
@@ -2581,7 +2612,7 @@ ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
UseCycle = 2;
// Read in E3.
UseCycle += 2;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
UseCycle = (RegNo / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2766,7 +2797,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
const MachineInstr *DefMI,
const MCInstrDesc *DefMCID, unsigned DefAlign) {
int Adjust = 0;
- if (Subtarget.isCortexA8() || Subtarget.isCortexA9()) {
+ if (Subtarget.isCortexA8() || Subtarget.isLikeA9()) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID->getOpcode()) {
@@ -2793,7 +2824,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
}
}
- if (DefAlign < 8 && Subtarget.isCortexA9()) {
+ if (DefAlign < 8 && Subtarget.isLikeA9()) {
switch (DefMCID->getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -2951,7 +2982,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (Reg == ARM::CPSR) {
if (DefMI->getOpcode() == ARM::FMSTAT) {
// fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
- return Subtarget.isCortexA9() ? 1 : 20;
+ return Subtarget.isLikeA9() ? 1 : 20;
}
// CPSR set and branch can be paired in the same cycle.
@@ -3017,7 +3048,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (!UseNode->isMachineOpcode()) {
int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
- if (Subtarget.isCortexA9())
+ if (Subtarget.isLikeA9())
return Latency <= 2 ? 1 : Latency - 1;
else
return Latency <= 3 ? 1 : Latency - 2;
@@ -3034,7 +3065,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
UseMCID, UseIdx, UseAlign);
if (Latency > 1 &&
- (Subtarget.isCortexA8() || Subtarget.isCortexA9())) {
+ (Subtarget.isCortexA8() || Subtarget.isLikeA9())) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID.getOpcode()) {
@@ -3063,7 +3094,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
}
}
- if (DefAlign < 8 && Subtarget.isCortexA9())
+ if (DefAlign < 8 && Subtarget.isLikeA9())
switch (DefMCID.getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -3356,9 +3387,9 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
- // Cortex-A9 is particularly picky about mixing the two and wants these
+ // A9-like cores are particularly picky about mixing the two and want these
// converted.
- if (Subtarget.isCortexA9() && !isPredicated(MI) &&
+ if (Subtarget.isLikeA9() && !isPredicated(MI) &&
(MI->getOpcode() == ARM::VMOVRS ||
MI->getOpcode() == ARM::VMOVSR ||
MI->getOpcode() == ARM::VMOVS))
@@ -3396,6 +3427,48 @@ static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
return DReg;
}
+/// getImplicitSPRUseForDPRUse - Given a use of a DPR register and lane,
+/// set ImplicitSReg to a register number that must be marked as implicit-use or
+/// zero if no register needs to be defined as implicit-use.
+///
+/// If the function cannot determine if an SPR should be marked implicit use or
+/// not, it returns false.
+///
+/// This function handles cases where an instruction is being modified from taking
+/// an SPR to a DPR[Lane]. A use of the DPR is being added, which may conflict
+/// with an earlier def of an SPR corresponding to DPR[Lane^1] (i.e. the other
+/// lane of the DPR).
+///
+/// If the other SPR is defined, an implicit-use of it should be added. Else,
+/// (including the case where the DPR itself is defined), it should not.
+///
+static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
+ MachineInstr *MI,
+ unsigned DReg, unsigned Lane,
+ unsigned &ImplicitSReg) {
+ // If the DPR is defined or used already, the other SPR lane will be chained
+ // correctly, so there is nothing to be done.
+ if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) {
+ ImplicitSReg = 0;
+ return true;
+ }
+
+ // Otherwise we need to go searching to see if the SPR is set explicitly.
+ ImplicitSReg = TRI->getSubReg(DReg,
+ (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1);
+ MachineBasicBlock::LivenessQueryResult LQR =
+ MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
+
+ if (LQR == MachineBasicBlock::LQR_Live)
+ return true;
+ else if (LQR == MachineBasicBlock::LQR_Unknown)
+ return false;
+
+ // If the register is known not to be live, there is no need to add an
+ // implicit-use.
+ ImplicitSReg = 0;
+ return true;
+}
void
ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
@@ -3453,7 +3526,7 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// was dead before here.
MIB.addReg(SrcReg, RegState::Implicit);
break;
- case ARM::VMOVSR:
+ case ARM::VMOVSR: {
if (Domain != ExeNEON)
break;
assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
@@ -3464,12 +3537,9 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
- // If we insert both a novel <def> and an <undef> on the DReg, we break
- // any existing dependency chain on the unused lane. Either already being
- // present means this instruction is in that chain anyway so we can make
- // the transformation.
- if (!MI->definesRegister(DReg, TRI) && !MI->readsRegister(DReg, TRI))
- break;
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+ break;
for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
MI->RemoveOperand(i-1);
@@ -3486,7 +3556,10 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// The narrower destination must be marked as set to keep previous chains
// in place.
MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
break;
+ }
case ARM::VMOVS: {
if (Domain != ExeNEON)
break;
@@ -3499,12 +3572,9 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
- // If we insert both a novel <def> and an <undef> on the DReg, we break
- // any existing dependency chain on the unused lane. Either already being
- // present means this instruction is in that chain anyway so we can make
- // the transformation.
- if (!MI->definesRegister(DDst, TRI) && !MI->readsRegister(DDst, TRI))
- break;
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
+ break;
for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
MI->RemoveOperand(i-1);
@@ -3522,6 +3592,8 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// more, so add them in manually.
MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
MIB.addReg(SrcReg, RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
break;
}
@@ -3580,6 +3652,8 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// As before, the original destination is no longer represented, add it
// implicitly.
MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
break;
}
}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 92e5ee8dcb..304ccc087c 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -235,6 +235,9 @@ public:
getExecutionDomain(const MachineInstr *MI) const;
void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+ /// Get the number of addresses by LDM or VLDM or zero for unknown.
+ unsigned getNumLDMAddresses(const MachineInstr *MI) const;
+
private:
unsigned getInstBundleLength(const MachineInstr *MI) const;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index b2ba9703b6..277dd57ef2 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -480,7 +480,7 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
bool
ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
// CortexA9 has a Write-after-write hazard for NEON registers.
- if (!STI.isCortexA9())
+ if (!STI.isLikeA9())
return false;
switch (RC->getID()) {
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index c292821e79..56cfcface4 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -115,9 +115,9 @@ namespace {
bool IsLoad;
bool isUpdating;
bool hasWritebackOperand;
- NEONRegSpacing RegSpacing;
- unsigned char NumRegs; // D registers loaded or stored
- unsigned char RegElts; // elements per D register; used for lane ops
+ uint8_t RegSpacing; // One of type NEONRegSpacing
+ uint8_t NumRegs; // D registers loaded or stored
+ uint8_t RegElts; // elements per D register; used for lane ops
// FIXME: Temporary flag to denote whether the real instruction takes
// a single register (like the encoding) or all of the registers in
// the list (like the asm syntax and the isel DAG). When all definitions
@@ -389,7 +389,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
@@ -454,7 +454,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
@@ -538,7 +538,7 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
unsigned RegElts = TableEntry->RegElts;
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 873404effd..d6ef3f333b 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1026,6 +1026,9 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
RC = &ARM::GPRRegClass;
break;
case MVT::i16:
+ if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
@@ -1038,6 +1041,9 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
RC = &ARM::GPRRegClass;
break;
case MVT::i32:
+ if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
Opc = ARM::t2LDRi8;
@@ -1144,6 +1150,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
}
break;
case MVT::i16:
+ if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
StrOpc = ARM::t2STRHi8;
@@ -1155,6 +1164,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
}
break;
case MVT::i32:
+ if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
StrOpc = ARM::t2STRi8;
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index a5fd15b6bb..1240169e84 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -47,7 +47,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
// Skip over one non-VFP / NEON instruction.
if (!LastMI->isBarrier() &&
// On A9, AGU and NEON/FPU are muxed.
- !(STI.isCortexA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
+ !(STI.isLikeA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
(LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
MachineBasicBlock::iterator I = LastMI;
if (I != LastMI->getParent()->begin()) {
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 3042b07920..a44e2a220a 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -317,7 +317,7 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
}
/// \brief Check whether a particular node is a constant value representable as
-/// (N * Scale) where (N in [\arg RangeMin, \arg RangeMax).
+/// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
///
/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
static bool isScaledConstantInRange(SDValue Node, int Scale,
@@ -347,8 +347,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (!CheckVMLxHazard)
return true;
-
- if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9())
+ if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9())
return true;
if (!N->hasOneUse())
@@ -386,7 +385,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
ARM_AM::ShiftOpc ShOpcVal,
unsigned ShAmt) {
- if (!Subtarget->isCortexA9())
+ if (!Subtarget->isLikeA9())
return true;
if (Shift.hasOneUse())
return true;
@@ -519,7 +518,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
return false;
// @LOCALMOD-END
if (N.getOpcode() == ISD::MUL &&
- (!Subtarget->isCortexA9() || N.hasOneUse())) {
+ (!Subtarget->isLikeA9() || N.hasOneUse())) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
// X * [3,5,9] -> X + X * [2,4,8] etc.
int RHSC = (int)RHS->getZExtValue();
@@ -583,7 +582,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
- !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
+ !(Subtarget->isLikeA9() || N.getOperand(0).hasOneUse())) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
@@ -631,7 +630,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op,
// @LOCALMOD-END
if (N.getOpcode() == ISD::MUL &&
- (!Subtarget->isCortexA9() || N.hasOneUse())) {
+ (!Subtarget->isLikeA9() || N.hasOneUse())) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
// X * [3,5,9] -> X + X * [2,4,8] etc.
int RHSC = (int)RHS->getZExtValue();
@@ -698,7 +697,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op,
}
}
- if (Subtarget->isCortexA9() && !N.hasOneUse()) {
+ if (Subtarget->isLikeA9() && !N.hasOneUse()) {
// Compute R +/- (R << N) and reuse it.
Base = N;
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -754,7 +753,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
- !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
+ !(Subtarget->isLikeA9() || N.getOperand(0).hasOneUse())) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index bc15dcf4fc..2e7588b29f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -860,7 +860,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
benefitFromCodePlacementOpt = true;
// Prefer likely predicted branches to selects on out-of-order cores.
- predictableSelectIsExpensive = Subtarget->isCortexA9();
+ predictableSelectIsExpensive = Subtarget->isLikeA9();
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
@@ -9308,8 +9308,8 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
}
bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
- if (!Subtarget->allowsUnalignedMem())
- return false;
+ // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
+ bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
switch (VT.getSimpleVT().SimpleTy) {
default:
@@ -9317,10 +9317,14 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
case MVT::i8:
case MVT::i16:
case MVT::i32:
- return true;
+ // Unaligned access can use (for example) LRDB, LRDH, LDR
+ return AllowsUnaligned;
case MVT::f64:
- return Subtarget->hasNEON();
- // FIXME: VLD1 etc with standard alignment is legal.
+ case MVT::v2f64:
+ // For any little-endian targets with neon, we can support unaligned ld/st
+ // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+ // A big-endian target may also explictly support unaligned accesses
+ return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian());
}
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 697b5b0111..2060bb9374 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -3239,9 +3239,11 @@ def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
(SUBSri GPR:$src, so_imm_neg:$imm)>;
def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm),
- (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>;
+ (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+ Requires<[IsARM, HasV6T2]>;
def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
- (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>;
+ (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+ Requires<[IsARM, HasV6T2]>;
// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 8158a11f83..1bcb48776e 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -398,6 +398,20 @@ def VecListFourQWordIndexed : Operand<i32> {
let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
}
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() == 2;
}]>;
@@ -2273,6 +2287,25 @@ def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
(VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
+// load / store if it's legal.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
+ (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q64 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
+ (VLD1q32 addrmode6:$addr)>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q32 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+ (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+ (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+
//===----------------------------------------------------------------------===//
// NEON pattern fragments
//===----------------------------------------------------------------------===//
@@ -4455,10 +4488,23 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
"vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
[(set DPR:$Vd,
(v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
+ (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
+ (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
+ (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
- (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
(ins QPR:$src1, QPR:$Vn, QPR:$Vm),
@@ -4467,9 +4513,23 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
[(set QPR:$Vd,
(v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
+ (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
+ (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
+ (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+
def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
- (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 7bc590f947..404634fee9 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/