author    James Molloy <james.molloy@arm.com>  2012-09-06 09:16:01 +0000
committer James Molloy <james.molloy@arm.com>  2012-09-06 09:16:01 +0000
commit    6c822eea47dbef96940819b1ea085fabc49a1e71
tree      e9d2318a4f16d3ae149ba9e4f2794ba5f28c5f07
parent    7859f438e198fe441abef3d2c95c1cb9517f575b
Optimize codegen for VSETLNi{8,16,32} operating on Q registers. Degenerate to a VSETLN on D registers, instead of an (INSERT_SUBREG (VSETLN (EXTRACT_SUBREG ))) sequence to help the register coalescer.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@163298 91177308-0d34-0410-b5e6-96231b3b80d8
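
For reference, the expansion below picks one of the two D sub-registers of the Q register (dsub_0 or dsub_1) and a lane within it from the original Q-register lane. The following is a minimal standalone C++ sketch of that mapping, mirroring the switch added to ARMExpandPseudoInsts.cpp; the function and struct names are illustrative only and are not part of the patch.

// Standalone sketch (not part of the patch): map a Q-register lane index to
// the D sub-register and lane that the expanded VSETLN operates on.
#include <cstdio>

struct DLaneMapping {
  unsigned SubReg; // 0 = dsub_0, 1 = dsub_1
  unsigned Lane;   // lane within that D register
};

static DLaneMapping mapQLane(unsigned QLane, unsigned ElemBits) {
  // A D register is 64 bits wide, so it holds 64/ElemBits lanes.
  unsigned LanesPerD = 64 / ElemBits;
  return {QLane / LanesPerD, QLane % LanesPerD};
}

int main() {
  // Inserting an i8 into lane 15 of a v16i8 touches dsub_1, lane 7.
  DLaneMapping M8 = mapQLane(15, 8);
  std::printf("i8  lane 15 -> dsub_%u lane %u\n", M8.SubReg, M8.Lane);
  // Inserting an i16 into lane 7 of a v8i16 touches dsub_1, lane 3.
  DLaneMapping M16 = mapQLane(7, 16);
  std::printf("i16 lane 7  -> dsub_%u lane %u\n", M16.SubReg, M16.Lane);
  return 0;
}
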
-rw-r--r--  lib/Target/ARM/ARMExpandPseudoInsts.cpp    | 51
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td             | 32
-rw-r--r--  test/CodeGen/ARM/integer_insertelement.ll  | 35
-rw-r--r--  test/CodeGen/ARM/vget_lane.ll              |  2
4 files changed, 102 insertions, 18 deletions
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 15bb32eb14..8ed6b751f3 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1208,6 +1208,57 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
ExpandLaneOp(MBBI);
return true;
+ case ARM::VSETLNi8Q:
+ case ARM::VSETLNi16Q: {
+ // Expand VSETLNs acting on a Q register to equivalent VSETLNs acting
+ // on the respective D register.
+
+ unsigned QReg = MI.getOperand(1).getReg();
+ unsigned QLane = MI.getOperand(3).getImm();
+
+ unsigned NewOpcode, DLane, DSubReg;
+ switch (Opcode) {
+ default: llvm_unreachable("Invalid opcode!");
+ case ARM::VSETLNi8Q:
+      // 8 possible 8-bit lanes per DPR:
+ NewOpcode = ARM::VSETLNi8;
+ DLane = QLane % 8;
+ DSubReg = (QLane / 8) ? ARM::dsub_1 : ARM::dsub_0;
+ break;
+ case ARM::VSETLNi16Q:
+ // 4 possible 16-bit lanes per DPR.
+ NewOpcode = ARM::VSETLNi16;
+ DLane = QLane % 4;
+ DSubReg = (QLane / 4) ? ARM::dsub_1 : ARM::dsub_0;
+ break;
+ }
+
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpcode));
+
+ unsigned DReg = TRI->getSubReg(QReg, DSubReg);
+
+ MIB.addReg(DReg, RegState::Define); // Output DPR
+ MIB.addReg(DReg); // Input DPR
+ MIB.addOperand(MI.getOperand(2)); // Input GPR
+ MIB.addImm(DLane); // Lane
+
+ // Add the predicate operands.
+ MIB.addOperand(MI.getOperand(4));
+ MIB.addOperand(MI.getOperand(5));
+
+ if (MI.getOperand(1).isKill()) // Add an implicit kill for the Q register.
+ MIB->addRegisterKilled(QReg, TRI, true);
+ // And an implicit def of the output register (which should always be the
+ // same as the input register).
+ MIB->addRegisterDefined(QReg, TRI);
+
+ TransferImpOps(MI, MIB, MIB);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true;
case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 048d340df0..c5f3a83ab2 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -5045,25 +5045,23 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
GPR:$R, imm:$lane))]> {
let Inst{21} = lane{0};
}
+
+def VSETLNi8Q : PseudoNeonI<(outs QPR:$V),
+ (ins QPR:$src1, GPR:$R, VectorIndex8:$lane),
+ IIC_VMOVISL, "",
+ [(set QPR:$V, (vector_insert (v16i8 QPR:$src1),
+ GPR:$R, imm:$lane))]>;
+def VSETLNi16Q : PseudoNeonI<(outs QPR:$V),
+ (ins QPR:$src1, GPR:$R, VectorIndex16:$lane),
+ IIC_VMOVISL, "",
+ [(set QPR:$V, (vector_insert (v8i16 QPR:$src1),
+ GPR:$R, imm:$lane))]>;
}
-def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
- (v16i8 (INSERT_SUBREG QPR:$src1,
- (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
- (DSubReg_i8_reg imm:$lane))),
- GPR:$src2, (SubReg_i8_lane imm:$lane))),
- (DSubReg_i8_reg imm:$lane)))>;
-def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
- (v8i16 (INSERT_SUBREG QPR:$src1,
- (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
- (DSubReg_i16_reg imm:$lane))),
- GPR:$src2, (SubReg_i16_lane imm:$lane))),
- (DSubReg_i16_reg imm:$lane)))>;
+
def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
- (v4i32 (INSERT_SUBREG QPR:$src1,
- (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
- (DSubReg_i32_reg imm:$lane))),
- GPR:$src2, (SubReg_i32_lane imm:$lane))),
- (DSubReg_i32_reg imm:$lane)))>;
+ (v4i32 (INSERT_SUBREG QPR:$src1,
+ GPR:$src2,
+ (SSubReg_f32_reg imm:$lane)))>;
def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
(INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll
new file mode 100644
index 0000000000..4f2d7e3f73
--- /dev/null
+++ b/test/CodeGen/ARM/integer_insertelement.ll
@@ -0,0 +1,35 @@
+; RUN: llc %s -o - -march=arm -mattr=+neon | FileCheck %s
+
+; This test checks that when inserting one (integer) element into a vector,
+; the vector is not spuriously copied. "vorr dX, dY, dY" is how one DPR is
+; copied to another, so that is the instruction we check does not appear.
+
+; CHECK: @f
+; CHECK-NOT: vorr d
+; CHECK: vmov s
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <4 x i32> @f(<4 x i32> %in) {
+ %1 = insertelement <4 x i32> %in, i32 255, i32 3
+ ret <4 x i32> %1
+}
+
+; CHECK: @g
+; CHECK-NOT: vorr d
+; CHECK: vmov.16 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <8 x i16> @g(<8 x i16> %in) {
+ %1 = insertelement <8 x i16> %in, i16 255, i32 7
+ ret <8 x i16> %1
+}
+
+; CHECK: @h
+; CHECK-NOT: vorr d
+; CHECK: vmov.8 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <16 x i8> @h(<16 x i8> %in) {
+ %1 = insertelement <16 x i8> %in, i8 255, i32 15
+ ret <16 x i8> %1
+}
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
index 1fc885d613..2ed65c9aee 100644
--- a/test/CodeGen/ARM/vget_lane.ll
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -200,7 +200,7 @@ define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vsetQ_lane32:
-;CHECK: vmov.32
+;CHECK: vmov s
%tmp1 = load <4 x i32>* %A
%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
ret <4 x i32> %tmp2