diff options
-rw-r--r-- | lib/Target/ARM/ARMBaseInstrInfo.cpp | 12 | ||||
-rw-r--r-- | lib/Target/ARM/ARMTargetMachine.cpp | 3 | ||||
-rw-r--r-- | test/CodeGen/ARM/a15-SD-dep.ll | 5 | ||||
-rw-r--r-- | test/CodeGen/ARM/a15-partial-update.ll | 38 |
4 files changed, 47 insertions, 11 deletions
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index ed8b9cd9a1..126f160f6d 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -3734,9 +3734,9 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const { if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI)) return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON)); - // A9-like cores are particularly picky about mixing the two and want these + // CortexA9 is particularly picky about mixing the two and wants these // converted. - if (Subtarget.isLikeA9() && !isPredicated(MI) && + if (Subtarget.isCortexA9() && !isPredicated(MI) && (MI->getOpcode() == ARM::VMOVRS || MI->getOpcode() == ARM::VMOVSR || MI->getOpcode() == ARM::VMOVS)) @@ -4023,14 +4023,12 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { // VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops. // // FCONSTD can be used as a dependency-breaking instruction. - - unsigned ARMBaseInstrInfo:: getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { - // Only Swift has partial register update problems. - if (!SwiftPartialUpdateClearance || !Subtarget.isSwift()) + if (!SwiftPartialUpdateClearance || + !(Subtarget.isSwift() || Subtarget.isCortexA15())) return 0; assert(TRI && "Need TRI instance"); @@ -4056,7 +4054,7 @@ getPartialRegUpdateClearance(const MachineInstr *MI, // Explicitly reads the dependency. case ARM::VLD1LNd32: - UseOp = 1; + UseOp = 3; break; default: return 0; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index b0f9e56db7..42c7d2c437 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -185,8 +185,7 @@ bool ARMPassConfig::addPreSched2() { addPass(createARMLoadStoreOptimizationPass()); printAndVerify("After ARM load / store optimizer"); } - if ((DisableA15SDOptimization || !getARMSubtarget().isCortexA15()) && - getARMSubtarget().hasNEON()) + if (getARMSubtarget().hasNEON()) addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass)); } diff --git a/test/CodeGen/ARM/a15-SD-dep.ll b/test/CodeGen/ARM/a15-SD-dep.ll index 17e3eba272..a52468e5be 100644 --- a/test/CodeGen/ARM/a15-SD-dep.ll +++ b/test/CodeGen/ARM/a15-SD-dep.ll @@ -5,7 +5,7 @@ ; CHECK-DISABLED: t1: define <2 x float> @t1(float %f) { ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d0[0] - ; CHECK-DISABLED: vmov.32 d0[1], r{{.}} + ; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0] %i1 = insertelement <2 x float> undef, float %f, i32 1 %i2 = fadd <2 x float> %i1, %i1 ret <2 x float> %i2 @@ -15,7 +15,7 @@ define <2 x float> @t1(float %f) { ; CHECK-DISABLED: t2: define <4 x float> @t2(float %g, float %f) { ; CHECK-ENABLED: vdup.32 q{{[0-9]*}}, d0[0] - ; CHECK-DISABLED: vmov.32 d0[1], r{{.}} + ; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0] %i1 = insertelement <4 x float> undef, float %f, i32 1 %i2 = fadd <4 x float> %i1, %i1 ret <4 x float> %i2 @@ -25,6 +25,7 @@ define <4 x float> @t2(float %g, float %f) { ; CHECK-DISABLED: t3: define arm_aapcs_vfpcc <2 x float> @t3(float %f) { ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d0[0] + ; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0] %i1 = insertelement <2 x float> undef, float %f, i32 1 %i2 = fadd <2 x float> %i1, %i1 ret <2 x float> %i2 diff --git a/test/CodeGen/ARM/a15-partial-update.ll b/test/CodeGen/ARM/a15-partial-update.ll new file mode 100644 index 0000000000..6306790d15 --- /dev/null +++ b/test/CodeGen/ARM/a15-partial-update.ll @@ -0,0 +1,38 @@ +; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s + +; CHECK: t1: +define <2 x float> @t1(float* %A, <2 x float> %B) { +; The generated code for this test uses a vld1.32 instruction +; to write the lane 1 of a D register containing the value of +; <2 x float> %B. Since the D register is defined, it would +; be incorrect to fully write it (with a vmov.f64) before the +; vld1.32 instruction. The test checks that a vmov.f64 was not +; generated. + +; CHECK-NOT: vmov.{{.*}} d{{[0-9]+}}, + %tmp2 = load float* %A, align 4 + %tmp3 = insertelement <2 x float> %B, float %tmp2, i32 1 + ret <2 x float> %tmp3 +} + +; CHECK: t2: +define void @t2(<4 x i8> *%in, <4 x i8> *%out, i32 %n) { +entry: + br label %loop +loop: +; The code generated by this test uses a vld1.32 instruction. +; We check that a dependency breaking vmov* instruction was +; generated. + +; CHECK: vmov.{{.*}} d{{[0-9]+}}, + %oldcount = phi i32 [0, %entry], [%newcount, %loop] + %newcount = add i32 %oldcount, 1 + %p1 = getelementptr <4 x i8> *%in, i32 %newcount + %p2 = getelementptr <4 x i8> *%out, i32 %newcount + %tmp1 = load <4 x i8> *%p1, align 4 + store <4 x i8> %tmp1, <4 x i8> *%p2 + %cmp = icmp eq i32 %newcount, %n + br i1 %cmp, label %loop, label %ret +ret: + ret void +} |