aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNadav Rotem <nrotem@apple.com>2013-01-18 23:10:30 +0000
committerNadav Rotem <nrotem@apple.com>2013-01-18 23:10:30 +0000
commit48177ac90fb940833b9deea1a6716092348cfe82 (patch)
tree5252f0617e256f0dd1f8b26082f05a088d7232b9
parent7336f7febb5170b374a4cbffee273ad82ff8a1a3 (diff)
On Sandybridge loading unaligned 256bits using two XMM loads (vmovups and vinsertf128) is faster than using a single vmovups instruction.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172868 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp32
-rw-r--r--test/CodeGen/X86/sandybridge-loads.ll21
-rw-r--r--test/CodeGen/X86/v8i1-masks.ll2
3 files changed, 53 insertions, 2 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2b6ff3602a..73a1d2e007 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16340,8 +16340,39 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
EVT MemVT = Ld->getMemoryVT();
DebugLoc dl = Ld->getDebugLoc();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned RegSz = RegVT.getSizeInBits();
ISD::LoadExtType Ext = Ld->getExtensionType();
+ unsigned Alignment = Ld->getAlignment();
+
+ // On Sandybridge unaligned 256bit loads are inefficient.
+ if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+ !DCI.isBeforeLegalizeOps() && Alignment < 32 &&
+ Ext == ISD::NON_EXTLOAD) {
+ unsigned NumElems = RegVT.getVectorNumElements();
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ NumElems/2);
+ SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Alignment);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Alignment);
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1),
+ Load2.getValue(1));
+
+ SDValue NewVec = DAG.getUNDEF(RegVT);
+ NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+ NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+ return DCI.CombineTo(N, NewVec, TF, true);
+ }
// If this is a vector EXT Load then attempt to optimize it using a
// shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
@@ -16356,7 +16387,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
assert(MemVT.isVector() && "Must load a vector from memory");
unsigned NumElems = RegVT.getVectorNumElements();
- unsigned RegSz = RegVT.getSizeInBits();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll
new file mode 100644
index 0000000000..d85c32eaa7
--- /dev/null
+++ b/test/CodeGen/X86/sandybridge-loads.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+;CHECK: wideloads
+;CHECK: vmovaps
+;CHECK: vinsertf128
+;CHECK: vmovups
+;CHECK-NOT: vinsertf128
+;CHECK: ret
+
+define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+ %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
+ %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
+ %m0 = fcmp olt <8 x float> %v1, %v0
+ %v2 = load <8 x float>* %c, align 16
+ %m1 = fcmp olt <8 x float> %v2, %v0
+ %mand = and <8 x i1> %m1, %m0
+ %r = zext <8 x i1> %mand to <8 x i32>
+ store <8 x i32> %r, <8 x i32>* undef, align 16
+ ret void
+}
+
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index abb4b39bd6..ea231aff5b 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
;CHECK: and_masks
-;CHECK: vmovups
+;CHECK: vmovaps
;CHECK: vcmpltp
;CHECK: vcmpltp
;CHECK: vandps