aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/llvm/Object/MachOFormat.h5
-rw-r--r--lib/Target/ARM/ARM.td22
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp467
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h7
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp21
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp6
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td21
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td68
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td32
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td26
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td4
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td9
-rw-r--r--lib/Target/ARM/ARMSchedule.td2
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td1085
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp6
-rw-r--r--lib/Target/ARM/ARMSubtarget.h12
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp9
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp8
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp68
-rw-r--r--test/CodeGen/ARM/2010-12-07-PEIBug.ll2
-rw-r--r--test/CodeGen/ARM/2012-05-04-vmov.ll11
-rw-r--r--test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll14
-rw-r--r--test/CodeGen/ARM/avoid-cpsr-rmw.ll1
-rw-r--r--test/CodeGen/ARM/call-noret.ll39
-rw-r--r--test/CodeGen/ARM/div.ll17
-rw-r--r--test/CodeGen/ARM/fabss.ll4
-rw-r--r--test/CodeGen/ARM/fadds.ll8
-rw-r--r--test/CodeGen/ARM/fdivs.ll8
-rw-r--r--test/CodeGen/ARM/fmuls.ll8
-rw-r--r--test/CodeGen/ARM/fp_convert.ll4
-rw-r--r--test/CodeGen/ARM/fsubs.ll6
-rw-r--r--test/CodeGen/ARM/ifcvt1.ll12
-rw-r--r--test/CodeGen/ARM/ifcvt12.ll15
-rw-r--r--test/CodeGen/ARM/ifcvt5.ll12
-rw-r--r--test/CodeGen/ARM/ldr_post.ll1
-rw-r--r--test/CodeGen/ARM/ldr_pre.ll1
-rw-r--r--test/CodeGen/ARM/mls.ll12
-rw-r--r--test/CodeGen/ARM/neon-fma.ll22
-rw-r--r--test/CodeGen/ARM/neon_ld2.ll27
-rw-r--r--test/CodeGen/ARM/opt-shuff-tstore.ll4
-rw-r--r--test/CodeGen/ARM/subreg-remat.ll4
-rw-r--r--test/CodeGen/Thumb2/cortex-fp.ll6
-rw-r--r--test/CodeGen/Thumb2/div.ll10
-rw-r--r--test/CodeGen/Thumb2/thumb2-mla.ll7
-rw-r--r--test/CodeGen/Thumb2/thumb2-smla.ll4
45 files changed, 2038 insertions, 99 deletions
diff --git a/include/llvm/Object/MachOFormat.h b/include/llvm/Object/MachOFormat.h
index e4bfcc67fe..c0f700d3c8 100644
--- a/include/llvm/Object/MachOFormat.h
+++ b/include/llvm/Object/MachOFormat.h
@@ -61,7 +61,10 @@ namespace mach {
CSARM_V6 = 6,
CSARM_V5TEJ = 7,
CSARM_XSCALE = 8,
- CSARM_V7 = 9
+ CSARM_V7 = 9,
+ CSARM_V7F = 10,
+ CSARM_V7S = 11,
+ CSARM_V7K = 12
};
/// \brief PowerPC Machine Subtypes.
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 1fb190ca11..23974ad905 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -32,9 +32,6 @@ def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
"Enable VFP3 instructions",
[FeatureVFP2]>;
-def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
- "Enable VFP4 instructions",
- [FeatureVFP3]>;
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable NEON instructions",
[FeatureVFP3]>;
@@ -44,10 +41,16 @@ def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
"Does not support ARM mode execution">;
def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
"Enable half-precision floating point">;
+def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
+ "Enable VFP4 instructions",
+ [FeatureVFP3, FeatureFP16]>;
def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
"Restrict VFP3 to 16 double registers">;
def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
"Enable divide instructions">;
+def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
+ "HasHardwareDivideInARM", "true",
+ "Enable divide instructions in ARM mode">;
def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
"Enable Thumb2 extract and pack instructions">;
def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
@@ -139,6 +142,13 @@ def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
[FeatureVMLxForwarding,
FeatureT2XtPk, FeatureFP16,
FeatureAvoidPartialCPSR]>;
+def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
+ "Swift ARM processors",
+ [FeatureNEONForFP, FeatureT2XtPk,
+ FeatureVFP4, FeatureMP, FeatureHWDiv,
+ FeatureHWDivARM, FeatureAvoidPartialCPSR,
+ FeatureHasSlowFPVMLx]>;
+
// FIXME: It has not been determined if A15 has these features.
def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
"Cortex-A15 ARM processors",
@@ -236,6 +246,12 @@ def : ProcNoItin<"cortex-m4", [HasV7Ops,
FeatureT2XtPk, FeatureVFP4,
FeatureVFPOnlySP, FeatureMClass]>;
+// Swift uArch Processors.
+def : ProcessorModel<"swift", SwiftModel,
+ [ProcSwift, HasV7Ops, FeatureNEON,
+ FeatureDB, FeatureDSPThumb2,
+ FeatureHasRAS]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 8984c555c1..8c744d17f0 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -49,6 +49,11 @@ static cl::opt<bool>
WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true),
cl::desc("Widen ARM vmovs to vmovd when possible"));
+static cl::opt<unsigned>
+SwiftPartialUpdateClearance("swift-partial-update-clearance",
+ cl::Hidden, cl::init(12),
+ cl::desc("Clearance before partial register updates"));
+
/// ARM_MLxEntry - Record information about MLA / MLS instructions.
struct ARM_MLxEntry {
uint16_t MLxOpc; // MLA / MLS opcode
@@ -1389,7 +1394,6 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case ARM::VLDRD:
case ARM::VLDRS:
case ARM::t2LDRi8:
- case ARM::t2LDRDi8:
case ARM::t2LDRSHi8:
case ARM::t2LDRi12:
case ARM::t2LDRSHi12:
@@ -1528,6 +1532,14 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost;
}
+bool
+ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const {
+ // Reduce false anti-dependencies to let Swift's out-of-order execution
+ // engine do its thing.
+ return Subtarget.isSwift();
+}
+
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
@@ -2342,6 +2354,229 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
return true;
}
+static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
+ const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: {
+ const MCInstrDesc &Desc = MI->getDesc();
+ int UOps = ItinData->getNumMicroOps(Desc.getSchedClass());
+ assert(UOps >= 0 && "bad # UOps");
+ return UOps;
+ }
+
+ case ARM::LDRrs:
+ case ARM::LDRBrs:
+ case ARM::STRrs:
+ case ARM::STRBrs: {
+ unsigned ShOpVal = MI->getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 1;
+ return 2;
+ }
+
+ case ARM::LDRH:
+ case ARM::STRH: {
+ if (!MI->getOperand(2).getReg())
+ return 1;
+
+ unsigned ShOpVal = MI->getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 1;
+ return 2;
+ }
+
+ case ARM::LDRSB:
+ case ARM::LDRSH:
+ return (ARM_AM::getAM3Op(MI->getOperand(3).getImm()) == ARM_AM::sub) ? 3:2;
+
+ case ARM::LDRSB_POST:
+ case ARM::LDRSH_POST: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ return (Rt == Rm) ? 4 : 3;
+ }
+
+ case ARM::LDR_PRE_REG:
+ case ARM::LDRB_PRE_REG: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (Rt == Rm)
+ return 3;
+ unsigned ShOpVal = MI->getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 2;
+ return 3;
+ }
+
+ case ARM::STR_PRE_REG:
+ case ARM::STRB_PRE_REG: {
+ unsigned ShOpVal = MI->getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 2;
+ return 3;
+ }
+
+ case ARM::LDRH_PRE:
+ case ARM::STRH_PRE: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (!Rm)
+ return 2;
+ if (Rt == Rm)
+ return 3;
+ return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub)
+ ? 3 : 2;
+ }
+
+ case ARM::LDR_POST_REG:
+ case ARM::LDRB_POST_REG:
+ case ARM::LDRH_POST: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ return (Rt == Rm) ? 3 : 2;
+ }
+
+ case ARM::LDR_PRE_IMM:
+ case ARM::LDRB_PRE_IMM:
+ case ARM::LDR_POST_IMM:
+ case ARM::LDRB_POST_IMM:
+ case ARM::STRB_POST_IMM:
+ case ARM::STRB_POST_REG:
+ case ARM::STRB_PRE_IMM:
+ case ARM::STRH_POST:
+ case ARM::STR_POST_IMM:
+ case ARM::STR_POST_REG:
+ case ARM::STR_PRE_IMM:
+ return 2;
+
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSH_PRE: {
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (Rm == 0)
+ return 3;
+ unsigned Rt = MI->getOperand(0).getReg();
+ if (Rt == Rm)
+ return 4;
+ unsigned ShOpVal = MI->getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 3;
+ return 4;
+ }
+
+ case ARM::LDRD: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rn = MI->getOperand(2).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+ return (Rt == Rn) ? 3 : 2;
+ }
+
+ case ARM::STRD: {
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+ return 2;
+ }
+
+ case ARM::LDRD_POST:
+ case ARM::t2LDRD_POST:
+ return 3;
+
+ case ARM::STRD_POST:
+ case ARM::t2STRD_POST:
+ return 4;
+
+ case ARM::LDRD_PRE: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rn = MI->getOperand(3).getReg();
+ unsigned Rm = MI->getOperand(4).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+ return (Rt == Rn) ? 4 : 3;
+ }
+
+ case ARM::t2LDRD_PRE: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rn = MI->getOperand(3).getReg();
+ return (Rt == Rn) ? 4 : 3;
+ }
+
+ case ARM::STRD_PRE: {
+ unsigned Rm = MI->getOperand(4).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+ return 3;
+ }
+
+ case ARM::t2STRD_PRE:
+ return 3;
+
+ case ARM::t2LDR_POST:
+ case ARM::t2LDRB_POST:
+ case ARM::t2LDRB_PRE:
+ case ARM::t2LDRSBi12:
+ case ARM::t2LDRSBi8:
+ case ARM::t2LDRSBpci:
+ case ARM::t2LDRSBs:
+ case ARM::t2LDRH_POST:
+ case ARM::t2LDRH_PRE:
+ case ARM::t2LDRSBT:
+ case ARM::t2LDRSB_POST:
+ case ARM::t2LDRSB_PRE:
+ case ARM::t2LDRSH_POST:
+ case ARM::t2LDRSH_PRE:
+ case ARM::t2LDRSHi12:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSHpci:
+ case ARM::t2LDRSHs:
+ return 2;
+
+ case ARM::t2LDRDi8: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rn = MI->getOperand(2).getReg();
+ return (Rt == Rn) ? 3 : 2;
+ }
+
+ case ARM::t2STRB_POST:
+ case ARM::t2STRB_PRE:
+ case ARM::t2STRBs:
+ case ARM::t2STRDi8:
+ case ARM::t2STRH_POST:
+ case ARM::t2STRH_PRE:
+ case ARM::t2STRHs:
+ case ARM::t2STR_POST:
+ case ARM::t2STR_PRE:
+ case ARM::t2STRs:
+ return 2;
+ }
+}
+
// Return the number of 32-bit words loaded by LDM or stored by STM. If this
// can't be easily determined return 0 (missing MachineMemOperand).
//
@@ -2382,8 +2617,12 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
const MCInstrDesc &Desc = MI->getDesc();
unsigned Class = Desc.getSchedClass();
int ItinUOps = ItinData->getNumMicroOps(Class);
- if (ItinUOps >= 0)
+ if (ItinUOps >= 0) {
+ if (Subtarget.isSwift() && (Desc.mayLoad() || Desc.mayStore()))
+ return getNumMicroOpsSwiftLdSt(ItinData, MI);
+
return ItinUOps;
+ }
unsigned Opc = MI->getOpcode();
switch (Opc) {
@@ -2452,7 +2691,43 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
case ARM::t2STMIA_UPD:
case ARM::t2STMDB_UPD: {
unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
- if (Subtarget.isCortexA8()) {
+ if (Subtarget.isSwift()) {
+ // rdar://8402126
+ int UOps = 1 + NumRegs; // One for address computation, one for each ld / st.
+ switch (Opc) {
+ default: break;
+ case ARM::VLDMDIA_UPD:
+ case ARM::VLDMDDB_UPD:
+ case ARM::VLDMSIA_UPD:
+ case ARM::VLDMSDB_UPD:
+ case ARM::VSTMDIA_UPD:
+ case ARM::VSTMDDB_UPD:
+ case ARM::VSTMSIA_UPD:
+ case ARM::VSTMSDB_UPD:
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::STMIA_UPD:
+ case ARM::STMDA_UPD:
+ case ARM::STMDB_UPD:
+ case ARM::STMIB_UPD:
+ case ARM::tLDMIA_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD:
+ ++UOps; // One for base register writeback.
+ break;
+ case ARM::LDMIA_RET:
+ case ARM::tPOP_RET:
+ case ARM::t2LDMIA_RET:
+ UOps += 2; // One for base reg wb, one for write to pc.
+ break;
+ }
+ return UOps;
+ } else if (Subtarget.isCortexA8()) {
if (NumRegs < 4)
return 2;
// 4 registers would be issued: 2, 2.
@@ -2461,7 +2736,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
if (NumRegs % 2)
++A8UOps;
return A8UOps;
- } else if (Subtarget.isLikeA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
int A9UOps = (NumRegs / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2494,7 +2769,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
DefCycle = RegNo / 2 + 1;
if (RegNo % 2)
++DefCycle;
- } else if (Subtarget.isLikeA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
DefCycle = RegNo;
bool isSLoad = false;
@@ -2538,7 +2813,7 @@ ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
DefCycle = 1;
// Result latency is issue cycle + 2: E2.
DefCycle += 2;
- } else if (Subtarget.isLikeA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
DefCycle = (RegNo / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2569,7 +2844,7 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
UseCycle = RegNo / 2 + 1;
if (RegNo % 2)
++UseCycle;
- } else if (Subtarget.isLikeA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
UseCycle = RegNo;
bool isSStore = false;
@@ -2610,7 +2885,7 @@ ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
UseCycle = 2;
// Read in E3.
UseCycle += 2;
- } else if (Subtarget.isLikeA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
UseCycle = (RegNo / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2820,6 +3095,37 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
break;
}
}
+ } else if (Subtarget.isSwift()) {
+ // FIXME: Properly handle all of the latency adjustments for address
+ // writeback.
+ switch (DefMCID->getOpcode()) {
+ default: break;
+ case ARM::LDRrs:
+ case ARM::LDRBrs: {
+ unsigned ShOpVal = DefMI->getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ Adjust -= 2;
+ else if (!isSub &&
+ ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+ --Adjust;
+ break;
+ }
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSHs: {
+ // Thumb2 mode: lsl only.
+ unsigned ShAmt = DefMI->getOperand(3).getImm();
+ if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
+ Adjust -= 2;
+ break;
+ }
+ }
}
if (DefAlign < 8 && Subtarget.isLikeA9()) {
@@ -3046,7 +3352,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (!UseNode->isMachineOpcode()) {
int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
- if (Subtarget.isLikeA9())
+ if (Subtarget.isLikeA9() || Subtarget.isSwift())
return Latency <= 2 ? 1 : Latency - 1;
else
return Latency <= 3 ? 1 : Latency - 2;
@@ -3090,6 +3396,33 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
break;
}
}
+ } else if (DefIdx == 0 && Latency > 2 && Subtarget.isSwift()) {
+ // FIXME: Properly handle all of the latency adjustments for address
+ // writeback.
+ switch (DefMCID.getOpcode()) {
+ default: break;
+ case ARM::LDRrs:
+ case ARM::LDRBrs: {
+ unsigned ShOpVal =
+ cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+ Latency -= 2;
+ else if (ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+ --Latency;
+ break;
+ }
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSHs: {
+ // Thumb2 mode: lsl 0-3 only.
+ Latency -= 2;
+ break;
+ }
+ }
}
if (DefAlign < 8 && Subtarget.isLikeA9())
@@ -3658,6 +3991,122 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
}
+//===----------------------------------------------------------------------===//
+// Partial register updates
+//===----------------------------------------------------------------------===//
+//
+// Swift renames NEON registers with 64-bit granularity. That means any
+// instruction writing an S-reg implicitly reads the containing D-reg. The
+// problem is mostly avoided by translating f32 operations to v2f32 operations
+// on D-registers, but f32 loads are still a problem.
+//
+// These instructions can load an f32 into a NEON register:
+//
+// VLDRS - Only writes S, partial D update.
+// VLD1LNd32 - Writes all D-regs, explicit partial D update, 2 uops.
+// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
+//
+// FCONSTD can be used as a dependency-breaking instruction.
+
+
+unsigned ARMBaseInstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI,
+ unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ // Only Swift has partial register update problems.
+ if (!SwiftPartialUpdateClearance || !Subtarget.isSwift())
+ return 0;
+
+ assert(TRI && "Need TRI instance");
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.readsReg())
+ return 0;
+ unsigned Reg = MO.getReg();
+ int UseOp = -1;
+
+ switch(MI->getOpcode()) {
+ // Normal instructions writing only an S-register.
+ case ARM::VLDRS:
+ case ARM::FCONSTS:
+ case ARM::VMOVSR:
+ // rdar://problem/8791586
+ case ARM::VMOVv8i8:
+ case ARM::VMOVv4i16:
+ case ARM::VMOVv2i32:
+ case ARM::VMOVv2f32:
+ case ARM::VMOVv1i64:
+ UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+ break;
+
+ // Explicitly reads the dependency.
+ case ARM::VLD1LNd32:
+ UseOp = 1;
+ break;
+ default:
+ return 0;
+ }
+
+ // If this instruction actually reads a value from Reg, there is no unwanted
+ // dependency.
+ if (UseOp != -1 && MI->getOperand(UseOp).readsReg())
+ return 0;
+
+ // We must be able to clobber the whole D-reg.
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ // Virtual register must be a foo:ssub_0<def,undef> operand.
+ if (!MO.getSubReg() || MI->readsVirtualRegister(Reg))
+ return 0;
+ } else if (ARM::SPRRegClass.contains(Reg)) {
+ // Physical register: MI must define the full D-reg.
+ unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
+ &ARM::DPRRegClass);
+ if (!DReg || !MI->definesRegister(DReg, TRI))
+ return 0;
+ }
+
+ // MI has an unwanted D-register dependency.
+ // Avoid defs in the previous N instructions.
+ return SwiftPartialUpdateClearance;
+}
+
+// Break a partial register dependency after getPartialRegUpdateClearance
+// returned non-zero.
+void ARMBaseInstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI,
+ unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ assert(MI && OpNum < MI->getDesc().getNumDefs() && "OpNum is not a def");
+ assert(TRI && "Need TRI instance");
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ unsigned Reg = MO.getReg();
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ "Can't break virtual register dependencies.");
+ unsigned DReg = Reg;
+
+ // If MI defines an S-reg, find the corresponding D super-register.
+ if (ARM::SPRRegClass.contains(Reg)) {
+ DReg = ARM::D0 + (Reg - ARM::S0) / 2;
+ assert(TRI->isSuperRegister(Reg, DReg) && "Register enums broken");
+ }
+
+ assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
+ assert(MI->definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+
+ // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
+ // the full D-register by loading the same value to both lanes. The
+ // instruction is micro-coded with 2 uops, so don't do this until we can
+ // properly schedule micro-coded instructions. The dispatcher stalls cause
+ // too big regressions.
+
+ // Insert the dependency-breaking FCONSTD before MI.
+ // 96 is the encoding of 0.5, but the actual value doesn't matter here.
+ AddDefaultPred(BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ get(ARM::FCONSTD), DReg).addImm(96));
+ MI->addRegisterKilled(DReg, TRI, true);
+}
+
bool ARMBaseInstrInfo::hasNOP() const {
return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0;
}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 18746b421d..8f4f47b34f 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -186,6 +186,9 @@ public:
return NumCycles == 1;
}
+ virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const;
+
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
@@ -235,6 +238,10 @@ public:
getExecutionDomain(const MachineInstr *MI) const;
void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+ unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned,
+ const TargetRegisterInfo*) const;
+ void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
+ const TargetRegisterInfo *TRI) const;
/// Get the number of addresses by LDM or VLDM or zero for unknown.
unsigned getNumLDMAddresses(const MachineInstr *MI) const;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 1eea0cc61d..efd6d2b839 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -335,7 +335,9 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (!CheckVMLxHazard)
return true;
- if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9())
+
+ if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9() &&
+ !Subtarget->isSwift())
return true;
if (!N->hasOneUse())
@@ -373,12 +375,13 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
ARM_AM::ShiftOpc ShOpcVal,
unsigned ShAmt) {
- if (!Subtarget->isLikeA9())
+ if (!Subtarget->isLikeA9() && !Subtarget->isSwift())
return true;
if (Shift.hasOneUse())
return true;
// R << 2 is free.
- return ShOpcVal == ARM_AM::lsl && ShAmt == 2;
+ return ShOpcVal == ARM_AM::lsl &&
+ (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
}
bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
@@ -485,7 +488,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
SDValue &Opc) {
if (N.getOpcode() == ISD::MUL &&
- (!Subtarget->isLikeA9() || N.hasOneUse())) {
+ ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
// X * [3,5,9] -> X + X * [2,4,8] etc.
int RHSC = (int)RHS->getZExtValue();
@@ -549,7 +552,8 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
- !(Subtarget->isLikeA9() || N.getOperand(0).hasOneUse())) {
+ !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+ N.getOperand(0).hasOneUse())) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
@@ -583,7 +587,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
SDValue &Offset,
SDValue &Opc) {
if (N.getOpcode() == ISD::MUL &&
- (!Subtarget->isLikeA9() || N.hasOneUse())) {
+ (!(Subtarget->isLikeA9() || Subtarget->isSwift()) || N.hasOneUse())) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
// X * [3,5,9] -> X + X * [2,4,8] etc.
int RHSC = (int)RHS->getZExtValue();
@@ -649,7 +653,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
}
}
- if (Subtarget->isLikeA9() && !N.hasOneUse()) {
+ if ((Subtarget->isLikeA9() || Subtarget->isSwift()) && !N.hasOneUse()) {
// Compute R +/- (R << N) and reuse it.
Base = N;
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -687,7 +691,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
- !(Subtarget->isLikeA9() || N.getOperand(0).hasOneUse())) {
+ !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+ N.getOperand(0).hasOneUse())) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index f8455a4b0e..8ff48216d9 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -635,9 +635,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
if (!Subtarget->hasV6Ops())
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- // These are expanded into libcalls.
- if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) {
- // v7M has a hardware divider
+ if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
+ !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
+ // These are expanded into libcalls if the cpu doesn't have HW divider.
setOperationAction(ISD::SDIV, MVT::i32, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
}
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index c8966fb97a..67a6820932 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -846,6 +846,23 @@ class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
let Inst{3-0} = Rm;
}
+// Division instructions.
+class ADivA1I<bits<3> opcod, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+ opc, asm, "", pattern> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{27-23} = 0b01110;
+ let Inst{22-20} = opcod;
+ let Inst{19-16} = Rd;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-8} = Rm;
+ let Inst{7-4} = 0b0001;
+ let Inst{3-0} = Rn;
+}
+
// PKH instructions
def PKHLSLAsmOperand : ImmAsmOperand {
let Name = "PKHLSLImm";
@@ -893,6 +910,10 @@ class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> {
class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsARM, HasV5TE];
}
+// ARMV5MOPat - Same as ARMV5TEPat with UseMulOps.
+class ARMV5MOPat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsARM, HasV5TE, UseMulOps];
+}
class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsARM, HasV6];
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 20d7c1b1d2..a78ada0a80 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -207,6 +207,8 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16","half-float">;
def HasDivide : Predicate<"Subtarget->hasDivide()">,
AssemblerPredicate<"FeatureHWDiv", "divide">;
+def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
+ AssemblerPredicate<"FeatureHWDivARM">;
def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
AssemblerPredicate<"FeatureT2XtPk",
"pack/extract">;
@@ -242,6 +244,7 @@ def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def UseMovt : Predicate<"Subtarget->useMovt()">;
def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
+def UseMulOps : Predicate<"Subtarget->useMulOps()">;
// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
// But only select them if more precision in FP computation is allowed.
@@ -252,6 +255,20 @@ def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || "
"Subtarget->isTargetDarwin()">;
+// VGETLNi32 is microcoded on Swift - prefer VMOV.
+def HasFastVGETLNi32 : Predicate<"!Subtarget->isSwift()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->isSwift()">;
+
+// VDUP.32 is microcoded on Swift - prefer VMOV.
+def HasFastVDUP32 : Predicate<"!Subtarget->isSwift()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
+
+// Cortex-A9 prefers VMOVSR to VMOVDRR even when using NEON for scalar FP, as
+// this allows more effective execution domain optimization. See
+// setExecutionDomain().
+def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
+
def IsLE : Predicate<"TLI.isLittleEndian()">;
def IsBE : Predicate<"TLI.isBigEndian()">;
@@ -3446,13 +3463,13 @@ def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
4, IIC_iMUL32,
[(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
(MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6, UseMulOps]>;
}
def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6, UseMulOps]> {
bits<4> Ra;
let Inst{15-12} = Ra;
}
@@ -3468,7 +3485,7 @@ def MLAv5: ARMPseudoExpand<(outs GPR:$Rd),
def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
- Requires<[IsARM, HasV6T2]> {
+ Requires<[IsARM, HasV6T2, UseMulOps]> {
bits<4> Rd;
bits<4> Rm;
bits<4> Rn;
@@ -3574,7 +3591,7 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
- Requires<[IsARM, HasV6]>;
+ Requires<[IsARM, HasV6, UseMulOps]>;
def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
@@ -3584,7 +3601,7 @@ def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6]>;
+ Requires<[IsARM, HasV6, UseMulOps]>;
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
@@ -3638,7 +3655,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd, (add GPR:$Ra,
(opnode (sext_inreg GPRnopc:$Rn, i16),
(sext_inreg GPRnopc:$Rm, i16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3646,7 +3663,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16),
(sra GPRnopc:$Rm, (i32 16)))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3654,7 +3671,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
(sext_inreg GPRnopc:$Rm, i16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3662,7 +3679,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
(sra GPRnopc:$Rm, (i32 16)))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3670,7 +3687,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
(sext_inreg GPRnopc:$Rm, i16)), (i32 16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3678,7 +3695,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
(sra GPRnopc:$Rm, (i32 16))), (i32 16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
}
}
@@ -3781,6 +3798,19 @@ defm SMUA : AI_sdml<0, "smua">;
defm SMUS : AI_sdml<1, "smus">;
//===----------------------------------------------------------------------===//
+// Division Instructions (ARMv7-A with virtualization extension)
+//
+def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+ "sdiv", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasDivideInARM]>;
+
+def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+ "udiv", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasDivideInARM]>;
+
+//===----------------------------------------------------------------------===//
// Misc. Arithmetic Instructions.
//
@@ -4831,32 +4861,32 @@ def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)),
(SMULWB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
(sra (shl GPR:$b, (i32 16)), (i32 16)))),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, sext_16_node:$b)),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
(sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul (sra GPR:$a, (i32 16)),
(sra (shl GPR:$b, (i32 16)), (i32 16)))),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
(i32 16))),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(sra (mul GPR:$a, sext_16_node:$b), (i32 16))),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 1bcb48776e..de655f1a0e 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -5043,7 +5043,8 @@ def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
(outs GPR:$R), (ins DPR:$V, VectorIndex32:$lane),
IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
[(set GPR:$R, (extractelt (v2i32 DPR:$V),
- imm:$lane))]> {
+ imm:$lane))]>,
+ Requires<[HasNEON, HasFastVGETLNi32]> {
let Inst{21} = lane{0};
}
// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
@@ -5066,7 +5067,16 @@ def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
(VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i32_reg imm:$lane))),
- (SubReg_i32_lane imm:$lane))>;
+ (SubReg_i32_lane imm:$lane))>,
+ Requires<[HasNEON, HasFastVGETLNi32]>;
+def : Pat<(extractelt (v2i32 DPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+ Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+ Requires<[HasNEON, HasSlowVGETLNi32]>;
def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
(EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
(SSubReg_f32_reg imm:$src2))>;
@@ -5175,14 +5185,23 @@ class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>;
def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>;
-def VDUP32d : VDUPD<0b11101000, 0b00, "32", v2i32>;
+def VDUP32d : VDUPD<0b11101000, 0b00, "32", v2i32>,
+ Requires<[HasNEON, HasFastVDUP32]>;
def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>;
def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>;
def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>;
-def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>;
+// NEONvdup patterns for uarchs with fast VDUP.32.
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+ Requires<[HasNEON,HasFastVDUP32]>;
def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
+// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+ Requires<[HasNEON,HasSlowVDUP32]>;
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+ Requires<[HasNEON,HasSlowVDUP32]>;
+
// VDUP : Vector Duplicate Lane (from scalar to all elements)
class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
@@ -5619,6 +5638,11 @@ def : N2VSPat<arm_ftoui, VCVTf2ud>;
def : N2VSPat<arm_sitof, VCVTs2fd>;
def : N2VSPat<arm_uitof, VCVTu2fd>;
+// Prefer VMOVDRR for i32 -> f32 bitcasts; it can write all DPR registers.
+def : Pat<(f32 (bitconvert GPR:$a)),
+ (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+ Requires<[HasNEON, DontUseVMOVSR]>;
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index f1a6cced26..2c62fdb386 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -2396,7 +2396,8 @@ def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
def t2MLA: T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"mla", "\t$Rd, $Rn, $Rm, $Ra",
- [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]> {
+ [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]>,
+ Requires<[IsThumb2, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b000;
@@ -2406,7 +2407,8 @@ def t2MLA: T2FourReg<
def t2MLS: T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"mls", "\t$Rd, $Rn, $Rm, $Ra",
- [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]> {
+ [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]>,
+ Requires<[IsThumb2, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b000;
@@ -2475,7 +2477,7 @@ def t2SMMLA : T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"smmla", "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b101;
@@ -2496,7 +2498,7 @@ def t2SMMLS: T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"smmls", "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b110;
@@ -2601,7 +2603,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
[(set rGPR:$Rd, (add rGPR:$Ra,
(opnode (sext_inreg rGPR:$Rn, i16),
(sext_inreg rGPR:$Rm, i16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2614,7 +2616,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16),
(sra rGPR:$Rm, (i32 16)))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2627,7 +2629,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
(sext_inreg rGPR:$Rm, i16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2640,7 +2642,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16)))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2653,7 +2655,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
(sext_inreg rGPR:$Rm, i16)), (i32 16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b011;
@@ -2666,7 +2668,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
(sra rGPR:$Rm, (i32 16))), (i32 16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b011;
@@ -2760,7 +2762,7 @@ def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
// Division Instructions.
// Signed and unsigned division on v7-M
//
-def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi,
+def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
Requires<[HasDivide, IsThumb2]> {
@@ -2771,7 +2773,7 @@ def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi,
let Inst{7-4} = 0b1111;
}
-def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi,
+def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
Requires<[HasDivide, IsThumb2]> {
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 7d6692f307..b5a896c699 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -523,10 +523,12 @@ def VMOVRS : AVConv2I<0b11100001, 0b1010,
let D = VFPNeonDomain;
}
+// Bitcast i32 -> f32. NEON prefers to use VMOVDRR.
def VMOVSR : AVConv4I<0b11100000, 0b1010,
(outs SPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
- [(set SPR:$Sn, (bitconvert GPR:$Rt))]> {
+ [(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
+ Requires<[HasVFP2, UseVMOVSR]> {
// Instruction operands.
bits<5> Sn;
bits<4> Rt;
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 6f974fd17d..ed8ac1aff7 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -247,11 +247,16 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
}
// Scalar single precision floating point register class..
-def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)>;
+// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to
+// avoid partial-write dependencies on D registers (S registers are
+// renamed as portions of D registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate
+ (sequence "S%u", 0, 31), 2),
+ (sequence "S%u", 0, 31))>;
// Subset of SPR which can be used as a source of NEON scalars for 16-bit
// operations
-def SPR_8 : RegisterClass<"ARM", [f32], 32, (trunc SPR, 16)>;
+def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
// Scalar double precision floating point / generic 64-bit vector register
// class.
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 81d2fa37c2..02196d06bf 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -55,6 +55,7 @@ def IIC_iMUL32 : InstrItinClass;
def IIC_iMAC32 : InstrItinClass;
def IIC_iMUL64 : InstrItinClass;
def IIC_iMAC64 : InstrItinClass;
+def IIC_iDIV : InstrItinClass;
def IIC_iLoad_i : InstrItinClass;
def IIC_iLoad_r : InstrItinClass;
def IIC_iLoad_si : InstrItinClass;
@@ -261,3 +262,4 @@ def IIC_VTBX4 : InstrItinClass;
include "ARMScheduleV6.td"
include "ARMScheduleA8.td"
include "ARMScheduleA9.td"
+include "ARMScheduleSwift.td"
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
new file mode 100644
index 0000000000..e9bc3e0f39
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -0,0 +1,1085 @@
+//=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Swift processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+def SW_DIS0 : FuncUnit;
+def SW_DIS1 : FuncUnit;
+def SW_DIS2 : FuncUnit;
+
+def SW_ALU0 : FuncUnit;
+def SW_ALU1 : FuncUnit;
+def SW_LS : FuncUnit;
+def SW_IDIV : FuncUnit;
+def SW_FDIV : FuncUnit;
+
+// FIXME: Need bypasses.
+// FIXME: Model the multiple stages of IIC_iMOVix2, IIC_iMOVix2addpc, and
+// IIC_iMOVix2ld better.
+// FIXME: Model the special immediate shifts that are not microcoded.
+// FIXME: Do we need to model the fact that uses of r15 in a micro-op force it
+// to issue on pipe 1?
+// FIXME: Model the pipelined behavior of CMP / TST instructions.
+// FIXME: Better model the microcode stages of multiply instructions, especially
+// conditional variants.
+// FIXME: Add preload instruction when it is documented.
+// FIXME: Model non-pipelined nature of FP div / sqrt unit.
+
+def SwiftItineraries : ProcessorItineraries<
+ [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
+ //
+ // Move instructions, unconditional
+ InstrItinData<IIC_iMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2]>,
+ InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [3]>,
+ InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_LS]>],
+ [5]>,
+ //
+ // MVN instructions
+ InstrItinData<IIC_iMVNi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMVNr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMVNsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMVNsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ //
+ // No operand cycles
+ InstrItinData<IIC_iALUx , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>]>,
+ //
+ // Binary Instructions that produce a result
+ InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1]>,
+ //
+ // Bitwise Instructions that produce a result
+ InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1]>,
+ //
+ // Unary Instructions that produce a result
+
+ // CLZ, RBIT, etc.
+ InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+
+ // BFC, BFI, UBFX, SBFX
+ InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1]>,
+
+ //
+ // Zero and sign extension instructions
+ InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1, 1]>,
+ //
+ // Compare instructions
+ InstrItinData<IIC_iCMPi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iCMPr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iCMPsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<2, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iCMPsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<2, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ //
+ // Test instructions
+ InstrItinData<IIC_iTSTi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iTSTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iTSTsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<2, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iTSTsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<2, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ //
+ // Move instructions, conditional
+ // FIXME: Correctly model the extra input dep on the destination.
+ InstrItinData<IIC_iCMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iCMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2]>,
+
+ // Integer multiply pipeline
+ //
+ InstrItinData<IIC_iMUL16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_iMAC16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_iMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1, 1]>,
+ InstrItinData<IIC_iMUL64 , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_ALU0], 3>,
+ InstrStage<1, [SW_ALU0]>],
+ [5, 5, 1, 1]>,
+ InstrItinData<IIC_iMAC64 , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [5, 6, 1, 1]>,
+ //
+ // Integer divide
+ InstrItinData<IIC_iDIV , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0], 0>,
+ InstrStage<14, [SW_IDIV]>],
+ [14, 1, 1]>,
+
+ // Integer load pipeline
+ // FIXME: The timings are some rough approximations
+ //
+ // Immediate offset
+ InstrItinData<IIC_iLoad_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1]>,
+ InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1]>,
+ InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 4, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iLoad_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [3, 4, 1, 1]>,
+ //
+ // Scaled register offset
+ InstrItinData<IIC_iLoad_si , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS]>],
+ [5, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iLoad_iu , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iLoad_ru , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [3, 4, 1, 1]>,
+ //
+ // Scaled register offset with update
+ InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [5, 3, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [5, 3, 1, 1]>,
+ //
+ // Load multiple, def is the 5th operand.
+ // FIXME: This assumes 3 to 4 registers.
+ InstrItinData<IIC_iLoad_m , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+
+ //
+ // Load multiple + update, defs are the 1st and 5th operands.
+ InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Load multiple plus branch
+ InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Pop, def is the 3rd operand.
+ InstrItinData<IIC_iPop , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Pop + branch, def is the 3rd operand.
+ InstrItinData<IIC_iPop_Br, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 3], [], -1>, // dynamic uops
+
+ //
+ // iLoadi + iALUr for t2LDRpci_pic.
+ InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [4, 1]>,
+
+ // Integer store pipeline
+ //
+ // Immediate offset
+ InstrItinData<IIC_iStore_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iStore_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ //
+ // Scaled register offset
+ InstrItinData<IIC_iStore_si , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iStore_iu , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iStore_ru , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1]>,
+ //
+ // Scaled register offset with update
+ InstrItinData<IIC_iStore_siu, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+ [3, 1, 1, 1]>,
+ //
+ // Store multiple
+ InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [], [], -1>, // dynamic uops
+ //
+ // Store multiple + update
+ InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [2], [], -1>, // dynamic uops
+
+ //
+ // Preload
+ InstrItinData<IIC_Preload, [InstrStage<1, [SW_DIS0], 0>], [1, 1]>,
+
+ // Branch
+ //
+ // no delay slots, so the latency of a branch is unimportant
+ InstrItinData<IIC_Br , [InstrStage<1, [SW_DIS0], 0>]>,
+
+ // FP Special Register to Integer Register File Move
+ InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ //
+ // Single-precision FP Unary
+ //
+ // Most floating-point moves get issued on ALU0.
+ InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+ //
+ // Double-precision FP Unary
+ InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+
+ //
+ // Single-precision FP Compare
+ InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Compare
+ InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [1, 1]>,
+ //
+ // Single to Double FP Convert
+ InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Double to Single FP Convert
+ InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+
+ //
+ // Single to Half FP Convert
+ InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU1], 4>,
+ InstrStage<1, [SW_ALU1]>],
+ [6, 1]>,
+ //
+ // Half to Single FP Convert
+ InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+
+ //
+ // Single-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Double-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Integer to Single-Precision FP Convert
+ InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Integer to Double-Precision FP Convert
+ InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Single-precision FP ALU
+ InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-precision FP ALU
+ InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Single-precision FP Multiply
+ InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Double-precision FP Multiply
+ InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [6, 1, 1]>,
+ //
+ // Single-precision FP MAC
+ InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-precision FP MAC
+ InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [12, 1, 1]>,
+ //
+ // Single-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [12, 1, 1]>,
+ //
+ // Single-precision FP DIV
+ InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 0>,
+ InstrStage<15, [SW_FDIV]>],
+ [17, 1, 1]>,
+ //
+ // Double-precision FP DIV
+ InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 0>,
+ InstrStage<30, [SW_FDIV]>],
+ [32, 1, 1]>,
+ //
+ // Single-precision FP SQRT
+ InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 0>,
+ InstrStage<15, [SW_FDIV]>],
+ [17, 1]>,
+ //
+ // Double-precision FP SQRT
+ InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 0>,
+ InstrStage<30, [SW_FDIV]>],
+ [32, 1, 1]>,
+
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_fpMOVIS, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0]>],
+ [6, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_fpMOVID, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [4, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_fpMOVSI, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_fpMOVDI, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_LS]>],
+ [3, 4, 1]>,
+ //
+ // Single-precision FP Load
+ InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [4, 1]>,
+ //
+ // Double-precision FP Load
+ InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [4, 1]>,
+ //
+ // FP Load Multiple
+ // FIXME: Assumes a single Q register.
+ InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 4], [], -1>, // dynamic uops
+ //
+ // FP Load Multiple + update
+ // FIXME: Assumes a single Q register.
+ InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1, 4], [], -1>, // dynamic uops
+ //
+ // Single-precision FP Store
+ InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Store
+ InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ //
+ // FP Store Multiple
+ // FIXME: Assumes a single Q register.
+ InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1], [], -1>, // dynamic uops
+ //
+ // FP Store Multiple + update
+ // FIXME: Assumes a single Q register.
+ InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1], [], -1>, // dynamic uops
+ // NEON
+ //
+ // Double-register Integer Unary
+ InstrItinData<IIC_VUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1]>,
+ //
+ // Quad-register Integer Unary
+ InstrItinData<IIC_VUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1]>,
+ //
+ // Double-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1]>,
+ //
+ // Quad-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1]>,
+ //
+ // Double-register Integer Binary
+ InstrItinData<IIC_VBINiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register Integer Binary
+ InstrItinData<IIC_VBINiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-register Integer Subtract
+ InstrItinData<IIC_VSUBiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register Integer Subtract
+ InstrItinData<IIC_VSUBiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-register Integer Shift
+ InstrItinData<IIC_VSHLiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register Integer Shift
+ InstrItinData<IIC_VSHLiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Double-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Double-register Integer Subtract (4 cycle)
+ InstrItinData<IIC_VSUBi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Subtract (4 cycle)
+ InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+
+ //
+ // Double-register Integer Count
+ InstrItinData<IIC_VCNTiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register Integer Count
+ InstrItinData<IIC_VCNTiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-register Absolute Difference and Accumulate
+ InstrItinData<IIC_VABAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1, 1]>,
+ //
+ // Quad-register Absolute Difference and Accumulate
+ InstrItinData<IIC_VABAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1, 1]>,
+ //
+ // Double-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+
+ //
+ // Double-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+
+ //
+ // Double-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1, 1]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1, 1]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1, 1]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1, 1]>,
+
+ //
+ // Move
+ InstrItinData<IIC_VMOV, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+ //
+ // Move Immediate
+ InstrItinData<IIC_VMOVImm, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2]>,
+ //
+ // Double-register Permute Move
+ InstrItinData<IIC_VMOVD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1]>,
+ //
+ // Quad-register Permute Move
+ InstrItinData<IIC_VMOVQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1]>,
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_VMOVIS , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0]>],
+ [6, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_VMOVID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [4, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_VMOVSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_VMOVDI , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_LS]>],
+ [3, 4, 1]>,
+ //
+ // Integer to Lane Move
+ // FIXME: I think this is correct, but it is not clear from the tuning guide.
+ InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0]>],
+ [6, 1]>,
+
+ //
+ // Vector narrow move
+ InstrItinData<IIC_VMOVN, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1]>,
+ //
+ // Double-register FP Unary
+ // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
+ // and they issue on a different pipeline.
+ InstrItinData<IIC_VUNAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+ //
+ // Quad-register FP Unary
+ // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
+ // and they issue on a different pipeline.
+ InstrItinData<IIC_VUNAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+ //
+ // Double-register FP Binary
+ // FIXME: We're using this itin for many instructions.
+ InstrItinData<IIC_VBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+
+ //
+ // VPADD, etc.
+ InstrItinData<IIC_VPBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Double-register FP VMUL
+ InstrItinData<IIC_VFMULD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register FP Binary
+ InstrItinData<IIC_VBINQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register FP VMUL
+ InstrItinData<IIC_VFMULQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Double-register FP Multiply-Accumulate
+ InstrItinData<IIC_VMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Quad-register FP Multiply-Accumulate
+ InstrItinData<IIC_VMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-register Fused FP Multiply-Accumulate
+ InstrItinData<IIC_VFMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Quad-register Fused FP Multiply-Accumulate
+ InstrItinData<IIC_VFMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-register Reciprocal Step
+ InstrItinData<IIC_VRECSD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Quad-register Reciprocal Step
+ InstrItinData<IIC_VRECSQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-register Permute
+ // FIXME: The latencies are unclear from the documentation.
+ InstrItinData<IIC_VPERMD, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [3, 4, 3, 4]>,
+ //
+ // Quad-register Permute
+ // FIXME: The latencies are unclear from the documentation.
+ InstrItinData<IIC_VPERMQ, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [3, 4, 3, 4]>,
+ //
+ // Quad-register Permute (3 cycle issue on A9)
+ InstrItinData<IIC_VPERMQ3, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [3, 4, 3, 4]>,
+
+ //
+ // Double-register VEXT
+ InstrItinData<IIC_VEXTD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register VEXT
+ InstrItinData<IIC_VEXTQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1, 1]>,
+ //
+ // VTB
+ InstrItinData<IIC_VTB1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VTB2, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 3, 3]>,
+ InstrItinData<IIC_VTB3, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [6, 1, 3, 5, 5]>,
+ InstrItinData<IIC_VTB4, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 3, 5, 7, 7]>,
+ //
+ // VTBX
+ InstrItinData<IIC_VTBX1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VTBX2, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 3, 3]>,
+ InstrItinData<IIC_VTBX3, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [6, 1, 3, 5, 5]>,
+ InstrItinData<IIC_VTBX4, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 3, 5, 7, 7]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Swift machine model for scheduling and other instruction cost heuristics.
+def SwiftModel : SchedMachineModel {
+ let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
+ let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+ let LoadLatency = 3; // NOTE(review): presumably load-to-use latency in cycles; confirm.
+
+ let Itineraries = SwiftItineraries; // Presumably the itinerary table that ends just above; confirm.
+}
+
+// TODO: Add Swift processor and scheduler resources.
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 13420c2ed7..bcc9db4ae3 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -32,6 +32,10 @@ static cl::opt<bool>
DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden);
static cl::opt<bool>
+UseFusedMulOps("arm-use-mulops",
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
StrictAlign("arm-strict-align", cl::Hidden,
cl::desc("Disallow all unaligned memory accesses"));
@@ -50,6 +54,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
, HasVFPv4(false)
, HasNEON(false)
, UseNEONForSinglePrecisionFP(false)
+ , UseMulOps(UseFusedMulOps)
, SlowFPVMLx(false)
, HasVMLxForwarding(false)
, SlowFPBrcc(false)
@@ -64,6 +69,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
, HasFP16(false)
, HasD16(false)
, HasHardwareDivide(false)
+ , HasHardwareDivideInARM(false)
, HasT2ExtractPack(false)
, HasDataBarrier(false)
, Pref32BitThumb(false)
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 495afa92b5..8e6b650602 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -30,7 +30,7 @@ class StringRef;
class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
- Others, CortexA8, CortexA9, CortexA15
+ Others, CortexA8, CortexA9, CortexA15, Swift
};
/// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
@@ -57,6 +57,10 @@ protected:
/// determine if NEON should actually be used.
bool UseNEONForSinglePrecisionFP;
+ /// UseMulOps - True if non-microcoded fused integer multiply-add and
+ /// multiply-subtract instructions should be used.
+ bool UseMulOps;
+
/// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
bool SlowFPVMLx;
@@ -107,6 +111,9 @@ protected:
/// HasHardwareDivide - True if subtarget supports [su]div
bool HasHardwareDivide;
+ /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
+ bool HasHardwareDivideInARM;
+
/// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
/// instructions.
bool HasT2ExtractPack;
@@ -200,6 +207,7 @@ protected:
bool isCortexA8() const { return ARMProcFamily == CortexA8; }
bool isCortexA9() const { return ARMProcFamily == CortexA9; }
bool isCortexA15() const { return ARMProcFamily == CortexA15; }
+ bool isSwift() const { return ARMProcFamily == Swift; }
bool isCortexM3() const { return CPUString == "cortex-m3"; }
bool isLikeA9() const { return isCortexA9() || isCortexA15(); }
@@ -213,8 +221,10 @@ protected:
return hasNEON() && UseNEONForSinglePrecisionFP; }
bool hasDivide() const { return HasHardwareDivide; }
+ bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
bool hasDataBarrier() const { return HasDataBarrier; }
+ bool useMulOps() const { return UseMulOps; }
bool useFPVMLx() const { return !SlowFPVMLx; }
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
bool isFPBrccSlow() const { return SlowFPBrcc; }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 68c47ac6d9..b032978da9 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -687,6 +687,15 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef
else if (TheTriple.getArchName() == "armv6" ||
TheTriple.getArchName() == "thumbv6")
return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V6);
+ else if (TheTriple.getArchName() == "armv7f" ||
+ TheTriple.getArchName() == "thumbv7f")
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7F);
+ else if (TheTriple.getArchName() == "armv7k" ||
+ TheTriple.getArchName() == "thumbv7k")
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7K);
+ else if (TheTriple.getArchName() == "armv7s" ||
+ TheTriple.getArchName() == "thumbv7s")
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7S);
return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 5df84c8b10..00ffc94ac7 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -71,6 +71,14 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
else
// Use CPU to figure out the exact features.
ARMArchFeature = "+v7";
+ } else if (Len >= Idx+2 && TT[Idx+1] == 's') {
+ if (NoCPU)
+ // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+ // Swift
+ ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+t2xtpk";
+ else
+ // Use CPU to figure out the exact features.
+ ARMArchFeature = "+v7";
} else {
// v7 CPUs have lots of different feature sets. If no CPU is specified,
// then assume v7a (e.g. cortex-a8) feature set. Otherwise, return
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 4ebba0e4d3..70643bcda3 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -52,6 +52,7 @@ namespace {
MachineRegisterInfo *MRI;
bool isLikeA9;
+ bool isSwift;
unsigned MIIdx;
MachineInstr* LastMIs[4];
SmallPtrSet<MachineInstr*, 4> IgnoreStall;
@@ -60,6 +61,7 @@ namespace {
void pushStack(MachineInstr *MI);
MachineInstr *getAccDefMI(MachineInstr *MI) const;
unsigned getDefReg(MachineInstr *MI) const;
+ bool hasLoopHazard(MachineInstr *MI) const;
bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
bool FindMLxHazard(MachineInstr *MI);
void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
@@ -135,6 +137,50 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
return Reg;
}
+/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
+/// a single-MBB loop, i.e. tracing operand 1's defs within MBB leads back to MI.
+bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
+ unsigned Reg = MI->getOperand(1).getReg(); // NOTE(review): presumably the accumulator input; confirm.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false; // getVRegDef() below is only used on virtual registers.
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ while (true) { // Walk the chain of defining instructions backwards.
+outer_continue: // Restart point for the PHI case, which jumps here from its inner loop.
+ if (DefMI->getParent() != MBB)
+ break; // The def is outside MI's block, so stop tracing.
+
+ if (DefMI->isPHI()) {
+ for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { // Operands are read as (reg, MBB) pairs.
+ if (DefMI->getOperand(i + 1).getMBB() == MBB) { // Incoming value from MBB itself.
+ unsigned SrcReg = DefMI->getOperand(i).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ DefMI = MRI->getVRegDef(SrcReg);
+ goto outer_continue; // Follow the first such virtual source.
+ }
+ }
+ }
+ } else if (DefMI->isCopyLike()) {
+ Reg = DefMI->getOperand(1).getReg(); // Look through to the register in operand 1.
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DefMI = MRI->getVRegDef(Reg);
+ continue;
+ }
+ } else if (DefMI->isInsertSubreg()) {
+ Reg = DefMI->getOperand(2).getReg(); // Look through to the register in operand 2.
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DefMI = MRI->getVRegDef(Reg);
+ continue;
+ }
+ }
+
+ break; // No further def to follow.
+ }
+
+ return DefMI == MI; // Hazard only if the trace arrived back at MI itself.
+}
+
bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
// FIXME: Detect integer instructions properly.
const MCInstrDesc &MCID = MI->getDesc();
@@ -149,6 +195,19 @@ bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
return false;
}
+static bool isFpMulInstruction(unsigned Opcode) { // True only for the VMUL opcodes listed below.
+ switch (Opcode) {
+ case ARM::VMULS:
+ case ARM::VMULfd:
+ case ARM::VMULfq:
+ case ARM::VMULD:
+ case ARM::VMULslfd:
+ case ARM::VMULslfq:
+ return true;
+ default:
+ return false; // Every other opcode is treated as not an FP multiply.
+ }
+}
bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
if (NumExpand >= ExpandLimit)
@@ -171,6 +230,12 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
return true;
}
+ // On Swift, we mostly care about hazards from multiplication instructions
+ // writing the accumulator and the pipelining of loop iterations by out-of-
+ // order execution.
+ if (isSwift)
+ return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI);
+
if (IgnoreStall.count(MI))
return false;
@@ -316,7 +381,8 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
TRI = Fn.getTarget().getRegisterInfo();
MRI = &Fn.getRegInfo();
const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
- isLikeA9 = STI->isLikeA9();
+ isLikeA9 = STI->isLikeA9() || STI->isSwift();
+ isSwift = STI->isSwift();
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
diff --git a/test/CodeGen/ARM/2010-12-07-PEIBug.ll b/test/CodeGen/ARM/2010-12-07-PEIBug.ll
index 770ad4466a..4879f4e10b 100644
--- a/test/CodeGen/ARM/2010-12-07-PEIBug.ll
+++ b/test/CodeGen/ARM/2010-12-07-PEIBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a9 | FileCheck %s
; rdar://8728956
define hidden void @foo() nounwind ssp {
diff --git a/test/CodeGen/ARM/2012-05-04-vmov.ll b/test/CodeGen/ARM/2012-05-04-vmov.ll
new file mode 100644
index 0000000000..d52ef2cc5a
--- /dev/null
+++ b/test/CodeGen/ARM/2012-05-04-vmov.ll
@@ -0,0 +1,11 @@
+; RUN: llc -O1 -march=arm -mcpu=cortex-a9 < %s | FileCheck -check-prefix=A9-CHECK %s
+; RUN: llc -O1 -march=arm -mcpu=swift < %s | FileCheck -check-prefix=SWIFT-CHECK %s
+; Check that swift doesn't use vmov.32. <rdar://problem/10453003>.
+
+define <2 x i32> @testuvec(<2 x i32> %A, <2 x i32> %B) nounwind {
+entry:
+ %div = udiv <2 x i32> %A, %B
+ ret <2 x i32> %div
+; A9-CHECK: vmov.32
+; SWIFT-CHECK-NOT: vmov.32
+}
diff --git a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
new file mode 100644
index 0000000000..dd678436c0
--- /dev/null
+++ b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=arm -mcpu=swift < %s | FileCheck %s
+; <rdar://problem/10451892>
+
+define void @f(i32 %x, i32* %p) nounwind ssp {
+entry:
+; CHECK-NOT: vdup.32
+ %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1
+ %0 = bitcast i32* %p to i8*
+ tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 1b385ab79c..96e83dd88e 100644
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
; dependency) when it isn't dependent on last CPSR defining instruction.
; rdar://8928208
diff --git a/test/CodeGen/ARM/call-noret.ll b/test/CodeGen/ARM/call-noret.ll
new file mode 100644
index 0000000000..d294f2cf1a
--- /dev/null
+++ b/test/CodeGen/ARM/call-noret.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=T2
+; rdar://8979299
+
+define void @t1() noreturn nounwind ssp {
+entry:
+; ARM: t1:
+; ARM: mov lr, pc
+; ARM: b _bar
+
+; SWIFT: t1:
+; SWIFT: mov lr, pc
+; SWIFT: b _bar
+
+; T2: t1:
+; T2: blx _bar
+ tail call void @bar() noreturn nounwind
+ unreachable
+}
+
+define void @t2() noreturn nounwind ssp {
+entry:
+; ARM: t2:
+; ARM: mov lr, pc
+; ARM: b _t1
+
+; SWIFT: t2:
+; SWIFT: mov lr, pc
+; SWIFT: b _t1
+
+; T2: t2:
+; T2: mov lr, pc
+; T2: b.w _t1
+ tail call void @t1() noreturn nounwind
+ unreachable
+}
+
+declare void @bar() noreturn
diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll
index 3d29e05a0c..82cfca182b 100644
--- a/test/CodeGen/ARM/div.ll
+++ b/test/CodeGen/ARM/div.ll
@@ -1,9 +1,13 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=swift | FileCheck %s -check-prefix=CHECK-SWIFT
define i32 @f1(i32 %a, i32 %b) {
entry:
; CHECK-ARM: f1
; CHECK-ARM: __divsi3
+
+; CHECK-SWIFT: f1
+; CHECK-SWIFT: sdiv
%tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -12,6 +16,9 @@ define i32 @f2(i32 %a, i32 %b) {
entry:
; CHECK-ARM: f2
; CHECK-ARM: __udivsi3
+
+; CHECK-SWIFT: f2
+; CHECK-SWIFT: udiv
%tmp1 = udiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -20,6 +27,10 @@ define i32 @f3(i32 %a, i32 %b) {
entry:
; CHECK-ARM: f3
; CHECK-ARM: __modsi3
+
+; CHECK-SWIFT: f3
+; CHECK-SWIFT: sdiv
+; CHECK-SWIFT: mls
%tmp1 = srem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -28,6 +39,10 @@ define i32 @f4(i32 %a, i32 %b) {
entry:
; CHECK-ARM: f4
; CHECK-ARM: __umodsi3
+
+; CHECK-SWIFT: f4
+; CHECK-SWIFT: udiv
+; CHECK-SWIFT: mls
%tmp1 = urem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
index bcb4ee7452..46c2f1c65f 100644
--- a/test/CodeGen/ARM/fabss.ll
+++ b/test/CodeGen/ARM/fabss.ll
@@ -14,12 +14,12 @@ entry:
declare float @fabsf(float)
; VFP2: test:
-; VFP2: vabs.f32 s1, s1
+; VFP2: vabs.f32 s2, s2
; NFP1: test:
; NFP1: vabs.f32 d1, d1
; NFP0: test:
-; NFP0: vabs.f32 s1, s1
+; NFP0: vabs.f32 s2, s2
; CORTEXA8: test:
; CORTEXA8: vadd.f32 [[D1:d[0-9]+]]
diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll
index e35103c045..48ef5ed88f 100644
--- a/test/CodeGen/ARM/fadds.ll
+++ b/test/CodeGen/ARM/fadds.ll
@@ -10,14 +10,14 @@ entry:
}
; VFP2: test:
-; VFP2: vadd.f32 s0, s1, s0
+; VFP2: vadd.f32 s
; NFP1: test:
-; NFP1: vadd.f32 d0, d1, d0
+; NFP1: vadd.f32 d
; NFP0: test:
-; NFP0: vadd.f32 s0, s1, s0
+; NFP0: vadd.f32 s
; CORTEXA8: test:
-; CORTEXA8: vadd.f32 d0, d1, d0
+; CORTEXA8: vadd.f32 d
; CORTEXA9: test:
; CORTEXA9: vadd.f32 s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll
index 31c1ca9405..8fab002135 100644
--- a/test/CodeGen/ARM/fdivs.ll
+++ b/test/CodeGen/ARM/fdivs.ll
@@ -10,14 +10,14 @@ entry:
}
; VFP2: test:
-; VFP2: vdiv.f32 s0, s1, s0
+; VFP2: vdiv.f32 s0, s2, s0
; NFP1: test:
-; NFP1: vdiv.f32 s0, s1, s0
+; NFP1: vdiv.f32 s0, s2, s0
; NFP0: test:
-; NFP0: vdiv.f32 s0, s1, s0
+; NFP0: vdiv.f32 s0, s2, s0
; CORTEXA8: test:
-; CORTEXA8: vdiv.f32 s0, s1, s0
+; CORTEXA8: vdiv.f32 s0, s2, s0
; CORTEXA9: test:
; CORTEXA9: vdiv.f32 s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index 3c3182bc63..1566a9272d 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -10,15 +10,15 @@ entry:
}
; VFP2: test:
-; VFP2: vmul.f32 s0, s1, s0
+; VFP2: vmul.f32 s
; NFP1: test:
-; NFP1: vmul.f32 d0, d1, d0
+; NFP1: vmul.f32 d
; NFP0: test:
-; NFP0: vmul.f32 s0, s1, s0
+; NFP0: vmul.f32 s
; CORTEXA8: test:
-; CORTEXA8: vmul.f32 d0, d1, d0
+; CORTEXA8: vmul.f32 d
; CORTEXA9: test:
; CORTEXA9: vmul.f32 s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fp_convert.ll b/test/CodeGen/ARM/fp_convert.ll
index 7002cecf36..44298b9c5d 100644
--- a/test/CodeGen/ARM/fp_convert.ll
+++ b/test/CodeGen/ARM/fp_convert.ll
@@ -31,7 +31,7 @@ define float @test3(i32 %a, i32 %b) {
; VFP2: test3:
; VFP2: vcvt.f32.u32 s{{.}}, s{{.}}
; NEON: test3:
-; NEON: vcvt.f32.u32 d0, d0
+; NEON: vcvt.f32.u32 d
entry:
%0 = add i32 %a, %b
%1 = uitofp i32 %0 to float
@@ -42,7 +42,7 @@ define float @test4(i32 %a, i32 %b) {
; VFP2: test4:
; VFP2: vcvt.f32.s32 s{{.}}, s{{.}}
; NEON: test4:
-; NEON: vcvt.f32.s32 d0, d0
+; NEON: vcvt.f32.s32 d
entry:
%0 = add i32 %a, %b
%1 = sitofp i32 %0 to float
diff --git a/test/CodeGen/ARM/fsubs.ll b/test/CodeGen/ARM/fsubs.ll
index bea8d5f4f3..f039e74c8e 100644
--- a/test/CodeGen/ARM/fsubs.ll
+++ b/test/CodeGen/ARM/fsubs.ll
@@ -8,6 +8,6 @@ entry:
ret float %0
}
-; VFP2: vsub.f32 s0, s1, s0
-; NFP1: vsub.f32 d0, d1, d0
-; NFP0: vsub.f32 s0, s1, s0
+; VFP2: vsub.f32 s
+; NFP1: vsub.f32 d
+; NFP0: vsub.f32 s
diff --git a/test/CodeGen/ARM/ifcvt1.ll b/test/CodeGen/ARM/ifcvt1.ll
index cd870bb5d4..fd831442c1 100644
--- a/test/CodeGen/ARM/ifcvt1.ll
+++ b/test/CodeGen/ARM/ifcvt1.ll
@@ -1,17 +1,21 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s -check-prefix=SWIFT
define i32 @t1(i32 %a, i32 %b) {
-; CHECK: t1:
+; A8: t1:
+; SWIFT: t1:
%tmp2 = icmp eq i32 %a, 0
br i1 %tmp2, label %cond_false, label %cond_true
cond_true:
-; CHECK: subeq r0, r1, #1
+; A8: subeq r0, r1, #1
+; SWIFT: sub r0, r1, #1
%tmp5 = add i32 %b, 1
ret i32 %tmp5
cond_false:
-; CHECK: addne r0, r1, #1
+; A8: addne r0, r1, #1
+; SWIFT: addne r0, r1, #1
%tmp7 = add i32 %b, -1
ret i32 %tmp7
}
diff --git a/test/CodeGen/ARM/ifcvt12.ll b/test/CodeGen/ARM/ifcvt12.ll
new file mode 100644
index 0000000000..77bdca57e5
--- /dev/null
+++ b/test/CodeGen/ARM/ifcvt12.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin -mcpu=cortex-a8 | FileCheck %s
+define i32 @f1(i32 %a, i32 %b, i32 %c) {
+; CHECK: f1:
+; CHECK: mlsne r0, r0, r1, r2
+ %tmp1 = icmp eq i32 %a, 0
+ br i1 %tmp1, label %cond_false, label %cond_true
+
+cond_true:
+ %tmp2 = mul i32 %a, %b
+ %tmp3 = sub i32 %c, %tmp2
+ ret i32 %tmp3
+
+cond_false:
+ ret i32 %a
+}
diff --git a/test/CodeGen/ARM/ifcvt5.ll b/test/CodeGen/ARM/ifcvt5.ll
index 95f5c97f2a..5081791bc2 100644
--- a/test/CodeGen/ARM/ifcvt5.ll
+++ b/test/CodeGen/ARM/ifcvt5.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT
+; rdar://8402126
@x = external global i32* ; <i32**> [#uses=1]
@@ -10,8 +12,12 @@ entry:
}
define i32 @t1(i32 %a, i32 %b) {
-; CHECK: t1:
-; CHECK: poplt {r7, pc}
+; A8: t1:
+; A8: poplt {r7, pc}
+
+; SWIFT: t1:
+; SWIFT: pop {r7, pc}
+; SWIFT: pop {r7, pc}
entry:
%tmp1 = icmp sgt i32 %a, 10 ; <i1> [#uses=1]
br i1 %tmp1, label %cond_true, label %UnifiedReturnBlock
diff --git a/test/CodeGen/ARM/ldr_post.ll b/test/CodeGen/ARM/ldr_post.ll
index 8ddf025dbf..a6ca434483 100644
--- a/test/CodeGen/ARM/ldr_post.ll
+++ b/test/CodeGen/ARM/ldr_post.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
; CHECK: test1:
; CHECK: ldr {{.*, \[.*]}}, -r2
diff --git a/test/CodeGen/ARM/ldr_pre.ll b/test/CodeGen/ARM/ldr_pre.ll
index e904e5fd2c..6c40ad7326 100644
--- a/test/CodeGen/ARM/ldr_pre.ll
+++ b/test/CodeGen/ARM/ldr_pre.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
; CHECK: test1:
; CHECK: ldr {{.*!}}
diff --git a/test/CodeGen/ARM/mls.ll b/test/CodeGen/ARM/mls.ll
index a6cdba4454..066bf98de6 100644
--- a/test/CodeGen/ARM/mls.ll
+++ b/test/CodeGen/ARM/mls.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+v6t2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
@@ -13,4 +14,15 @@ define i32 @f2(i32 %a, i32 %b, i32 %c) {
ret i32 %tmp2
}
+; CHECK: f1:
; CHECK: mls r0, r0, r1, r2
+; NO_MULOPS: f1:
+; NO_MULOPS: mul r0, r0, r1
+; NO_MULOPS-NEXT: sub r0, r2, r0
+
+; CHECK: f2:
+; CHECK: mul r0, r0, r1
+; CHECK-NEXT: sub r0, r0, r2
+; NO_MULOPS: f2:
+; NO_MULOPS: mul r0, r0, r1
+; NO_MULOPS-NEXT: sub r0, r0, r2
diff --git a/test/CodeGen/ARM/neon-fma.ll b/test/CodeGen/ARM/neon-fma.ll
new file mode 100644
index 0000000000..d2cca5009d
--- /dev/null
+++ b/test/CodeGen/ARM/neon-fma.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -mcpu=swift | FileCheck %s
+
+; CHECK: test_v2f32
+; CHECK: vfma.f32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+
+define <2 x float> @test_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+entry:
+ %call = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone
+ ret <2 x float> %call
+}
+
+; CHECK: test_v4f32
+; CHECK: vfma.f32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
+entry:
+ %call = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone
+ ret <4 x float> %call
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll
index 630db93035..497619ed74 100644
--- a/test/CodeGen/ARM/neon_ld2.ll
+++ b/test/CodeGen/ARM/neon_ld2.ll
@@ -1,10 +1,16 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s --check-prefix=SWIFT
; CHECK: t1
; CHECK: vld1.64
; CHECK: vld1.64
; CHECK: vadd.i64 q
; CHECK: vst1.64
+; SWIFT: t1
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vadd.i64 q
+; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
@@ -21,6 +27,12 @@ entry:
; CHECK: vsub.i64 q
; CHECK: vmov r0, r1, d
; CHECK: vmov r2, r3, d
+; SWIFT: t2
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vsub.i64 q
+; SWIFT: vmov r0, r1, d
+; SWIFT: vmov r2, r3, d
define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
@@ -30,3 +42,18 @@ entry:
ret <4 x i32> %3
}
+; Limited alignment.
+; SWIFT: t3
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+; SWIFT: vadd.i64 q
+; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+define void @t3(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+entry:
+ %0 = load <2 x i64>* %a, align 8
+ %1 = load <2 x i64>* %b, align 8
+ %2 = add <2 x i64> %0, %1
+ %3 = bitcast <2 x i64> %2 to <4 x i32>
+ store <4 x i32> %3, <4 x i32>* %r, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll
index df98e231cc..74c9a21355 100644
--- a/test/CodeGen/ARM/opt-shuff-tstore.ll
+++ b/test/CodeGen/ARM/opt-shuff-tstore.ll
@@ -2,7 +2,7 @@
; CHECK: func_4_8
; CHECK: vst1.32
-; CHECK-NEXT: bx lr
+; CHECK: bx lr
define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
%r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
store <4 x i8> %r, <4 x i8>* %p
@@ -11,7 +11,7 @@ define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
; CHECK: func_2_16
; CHECK: vst1.32
-; CHECK-NEXT: bx lr
+; CHECK: bx lr
define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) {
%r = add <2 x i16> %param, <i16 1, i16 2>
store <2 x i16> %r, <2 x i16>* %p
diff --git a/test/CodeGen/ARM/subreg-remat.ll b/test/CodeGen/ARM/subreg-remat.ll
index 03ae12c6de..455bfce0f2 100644
--- a/test/CodeGen/ARM/subreg-remat.ll
+++ b/test/CodeGen/ARM/subreg-remat.ll
@@ -4,14 +4,14 @@ target triple = "thumbv7-apple-ios"
;
; The vector %v2 is built like this:
;
-; %vreg6:ssub_1<def> = VMOVSR %vreg0<kill>, pred:14, pred:%noreg, %vreg6<imp-def>; DPR_VFP2:%vreg6 GPR:%vreg0
+; %vreg6:ssub_1<def> = ...
; %vreg6:ssub_0<def> = VLDRS <cp#0>, 0, pred:14, pred:%noreg; mem:LD4[ConstantPool] DPR_VFP2:%vreg6
;
; When %vreg6 spills, the VLDRS constant pool load cannot be rematerialized
; since it implicitly reads the ssub_1 sub-register.
;
; CHECK: f1
-; CHECK: vmov s1, r0
+; CHECK: vmov d0, r0, r0
; CHECK: vldr s0, LCPI
; The vector must be spilled:
; CHECK: vstr d0,
diff --git a/test/CodeGen/Thumb2/cortex-fp.ll b/test/CodeGen/Thumb2/cortex-fp.ll
index d06f8a7bee..b7df2fbf54 100644
--- a/test/CodeGen/Thumb2/cortex-fp.ll
+++ b/test/CodeGen/Thumb2/cortex-fp.ll
@@ -7,8 +7,8 @@ define float @foo(float %a, float %b) {
entry:
; CHECK: foo
; CORTEXM3: blx ___mulsf3
-; CORTEXM4: vmul.f32 s0, s1, s0
-; CORTEXA8: vmul.f32 d0, d1, d0
+; CORTEXM4: vmul.f32 s0, s2, s0
+; CORTEXA8: vmul.f32 d
%0 = fmul float %a, %b
ret float %0
}
@@ -19,6 +19,6 @@ entry:
%0 = fmul double %a, %b
; CORTEXM3: blx ___muldf3
; CORTEXM4: blx ___muldf3
-; CORTEXA8: vmul.f64 d16, d17, d16
+; CORTEXA8: vmul.f64 d
ret double %0
}
diff --git a/test/CodeGen/Thumb2/div.ll b/test/CodeGen/Thumb2/div.ll
index 2c00c70c0d..f89746a303 100644
--- a/test/CodeGen/Thumb2/div.ll
+++ b/test/CodeGen/Thumb2/div.ll
@@ -2,6 +2,8 @@
; RUN: | FileCheck %s -check-prefix=CHECK-THUMB
; RUN: llc < %s -march=thumb -mcpu=cortex-m3 -mattr=+thumb2 \
; RUN: | FileCheck %s -check-prefix=CHECK-THUMBV7M
+; RUN: llc < %s -march=thumb -mcpu=swift \
+; RUN: | FileCheck %s -check-prefix=CHECK-SWIFT-T2
define i32 @f1(i32 %a, i32 %b) {
entry:
@@ -9,6 +11,8 @@ entry:
; CHECK-THUMB: __divsi3
; CHECK-THUMBV7M: f1
; CHECK-THUMBV7M: sdiv
+; CHECK-SWIFT-T2: f1
+; CHECK-SWIFT-T2: sdiv
%tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -19,6 +23,8 @@ entry:
; CHECK-THUMB: __udivsi3
; CHECK-THUMBV7M: f2
; CHECK-THUMBV7M: udiv
+; CHECK-SWIFT-T2: f2
+; CHECK-SWIFT-T2: udiv
%tmp1 = udiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -29,6 +35,8 @@ entry:
; CHECK-THUMB: __modsi3
; CHECK-THUMBV7M: f3
; CHECK-THUMBV7M: sdiv
+; CHECK-SWIFT-T2: f3
+; CHECK-SWIFT-T2: sdiv
%tmp1 = srem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -39,6 +47,8 @@ entry:
; CHECK-THUMB: __umodsi3
; CHECK-THUMBV7M: f4
; CHECK-THUMBV7M: udiv
+; CHECK-SWIFT-T2: f4
+; CHECK-SWIFT-T2: udiv
%tmp1 = urem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
diff --git a/test/CodeGen/Thumb2/thumb2-mla.ll b/test/CodeGen/Thumb2/thumb2-mla.ll
index c4cc749ea5..594d9742b0 100644
--- a/test/CodeGen/Thumb2/thumb2-mla.ll
+++ b/test/CodeGen/Thumb2/thumb2-mla.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
@@ -7,6 +8,9 @@ define i32 @f1(i32 %a, i32 %b, i32 %c) {
}
; CHECK: f1:
; CHECK: mla r0, r0, r1, r2
+; NO_MULOPS: f1:
+; NO_MULOPS: muls r0, r1, r0
+; NO_MULOPS-NEXT: add r0, r2
define i32 @f2(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
@@ -15,3 +19,6 @@ define i32 @f2(i32 %a, i32 %b, i32 %c) {
}
; CHECK: f2:
; CHECK: mla r0, r0, r1, r2
+; NO_MULOPS: f2:
+; NO_MULOPS: muls r0, r1, r0
+; NO_MULOPS-NEXT: add r0, r2
diff --git a/test/CodeGen/Thumb2/thumb2-smla.ll b/test/CodeGen/Thumb2/thumb2-smla.ll
index c128eccd66..aaaedfa42e 100644
--- a/test/CodeGen/Thumb2/thumb2-smla.ll
+++ b/test/CodeGen/Thumb2/thumb2-smla.ll
@@ -1,8 +1,12 @@
; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f3(i32 %a, i16 %x, i32 %y) {
; CHECK: f3
; CHECK: smlabt r0, r1, r2, r0
+; NO_MULOPS: f3
+; NO_MULOPS: smultb r1, r2, r1
+; NO_MULOPS-NEXT: add r0, r1
%tmp = sext i16 %x to i32 ; <i32> [#uses=1]
%tmp2 = ashr i32 %y, 16 ; <i32> [#uses=1]
%tmp3 = mul i32 %tmp2, %tmp ; <i32> [#uses=1]