aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNadav Rotem <nrotem@apple.com>2012-11-03 00:39:56 +0000
committerNadav Rotem <nrotem@apple.com>2012-11-03 00:39:56 +0000
commitb4b04c3fa0a5da15424de7818e9f72811495c65b (patch)
tree4819e241823db653ae482ef043ec8d1e23e8d1b9
parent3c9c1ab7b7549dfaf22456d89bd241a5e8dfc0a4 (diff)
X86 CostModel: Add support for some of the common arithmetic instructions for SSE4, AVX and AVX2.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167347 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/Target/TargetTransformImpl.h2
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp70
-rw-r--r--lib/Target/X86/X86ISelLowering.h9
-rw-r--r--test/Analysis/CostModel/X86/arith.ll40
-rw-r--r--test/Analysis/CostModel/X86/vectorized-loop.ll2
5 files changed, 116 insertions, 7 deletions
diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h
index fa1acbea08..625be7208a 100644
--- a/include/llvm/Target/TargetTransformImpl.h
+++ b/include/llvm/Target/TargetTransformImpl.h
@@ -51,7 +51,7 @@ public:
};
class VectorTargetTransformImpl : public VectorTargetTransformInfo {
-private:
+protected:
const TargetLowering *TLI;
/// Estimate the cost of type-legalization and the legalized type.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9eea44349a..0d38ba236e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17504,3 +17504,73 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return Res;
}
+
+/// Return the estimated cost of the arithmetic instruction Opcode on type Ty.
+///
+/// When Ty is a vector type and the subtarget reports SSE4.1, a few
+/// element-count / element-width combinations are given fixed costs in the
+/// switch below. Every other case (scalars, no SSE4.1, unlisted opcodes or
+/// vector shapes) is forwarded to
+/// VectorTargetTransformImpl::getArithmeticInstrCost.
+unsigned
+X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
+ Type *Ty) const {
+ const X86Subtarget &ST =
+ TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+ // Correct some of the inaccuracies of the target-independent estimate.
+ if (Ty->isVectorTy() && ST.hasSSE41()) {
+ unsigned NumElem = Ty->getVectorNumElements();
+ unsigned SizeInBits = Ty->getScalarType()->getScalarSizeInBits();
+
+ // Only 2-, 4- and 8-element vectors of 32- or 64-bit elements are
+ // special-cased; anything else falls out of the switch to the generic code.
+ bool Is2 = (NumElem == 2);
+ bool Is4 = (NumElem == 4);
+ bool Is8 = (NumElem == 8);
+ bool Is32bits = (SizeInBits == 32);
+ bool Is64bits = (SizeInBits == 64);
+ bool HasAvx = ST.hasAVX();
+ bool HasAvx2 = ST.hasAVX2();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul: {
+ // Only AVX2 has support for 8-wide integer operations.
+ // NOTE(review): Mul on 64-bit elements is also costed at 1 here; confirm
+ // that a single-instruction 64-bit vector multiply exists on these targets.
+ if (Is32bits && (Is4 || (Is8 && HasAvx2))) return 1;
+ if (Is64bits && (Is2 || (Is4 && HasAvx2))) return 1;
+
+ // We don't have to completely scalarize unsupported ops. We can
+ // issue two half-sized operations (with some overhead).
+ // We don't need to extract the lower part of the YMM to the XMM.
+ // Extract the upper, two ops, insert the upper = 4.
+ if (Is32bits && Is8 && HasAvx) return 4;
+ if (Is64bits && Is4 && HasAvx) return 4;
+ break;
+ }
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul: {
+ // AVX has support for 8-wide float operations.
+ if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
+ if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
+ break;
+ }
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // AVX has support for 8-wide integer bitwise operations.
+ // NOTE(review): the shift opcodes share this cost with the bitwise ops;
+ // confirm that 256-bit integer shifts are really cost 1 without AVX2.
+ if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
+ if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
+ break;
+ }
+ }
+ }
+
+ // No X86-specific answer: defer to the target-independent implementation.
+ return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
+}
+
+/// Return the estimated cost of the vector instruction Opcode applied to
+/// element Index of the vector type Val.
+///
+/// Returns 0 when the element type is floating point and Index is 0;
+/// otherwise the query is forwarded to
+/// VectorTargetTransformImpl::getVectorInstrCost.
+unsigned
+X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const {
+ // Floating point scalars are already located in index #0, so accessing
+ // that element is treated as free.
+ if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+ return 0;
+ return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
+}
+
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index d4c30369b7..3ecef983bd 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -953,13 +953,10 @@ namespace llvm {
explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
VectorTargetTransformImpl(TL) {}
+ virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+
virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const {
- // Floating point scalars are already located in index #0.
- if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
- return 0;
- return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
- }
+ unsigned Index) const;
};
}
diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll
new file mode 100644
index 0000000000..58b4a7c426
--- /dev/null
+++ b/test/Analysis/CostModel/X86/arith.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Integer add under the RUN line's -mcpu=corei7-avx: the 128-bit vectors
+; (<4 x i32>, <2 x i64>) are expected to cost 1, the 256-bit vectors
+; (<8 x i32>, <4 x i64>) are expected to cost 4.
+define i32 @add(i32 %arg) {
+ ;CHECK: cost of 1 {{.*}} add
+ %A = add <4 x i32> undef, undef
+ ;CHECK: cost of 4 {{.*}} add
+ %B = add <8 x i32> undef, undef
+ ;CHECK: cost of 1 {{.*}} add
+ %C = add <2 x i64> undef, undef
+ ;CHECK: cost of 4 {{.*}} add
+ %D = add <4 x i64> undef, undef
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+
+; Bitwise xor under the RUN line's -mcpu=corei7-avx: all four vector shapes,
+; 128-bit and 256-bit alike, are expected to cost 1.
+define i32 @xor(i32 %arg) {
+ ;CHECK: cost of 1 {{.*}} xor
+ %A = xor <4 x i32> undef, undef
+ ;CHECK: cost of 1 {{.*}} xor
+ %B = xor <8 x i32> undef, undef
+ ;CHECK: cost of 1 {{.*}} xor
+ %C = xor <2 x i64> undef, undef
+ ;CHECK: cost of 1 {{.*}} xor
+ %D = xor <4 x i64> undef, undef
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+
+; Floating-point multiply under the RUN line's -mcpu=corei7-avx: both
+; <4 x float> and <8 x float> are expected to cost 1.
+define i32 @fmul(i32 %arg) {
+ ;CHECK: cost of 1 {{.*}} fmul
+ %A = fmul <4 x float> undef, undef
+ ;CHECK: cost of 1 {{.*}} fmul
+ %B = fmul <8 x float> undef, undef
+ ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/vectorized-loop.ll b/test/Analysis/CostModel/X86/vectorized-loop.ll
index fbf20de515..7919a9ca9a 100644
--- a/test/Analysis/CostModel/X86/vectorized-loop.ll
+++ b/test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -30,10 +30,12 @@ vector.body: ; preds = %for.body.lr.ph, %ve
%5 = bitcast i32* %4 to <8 x i32>*
;CHECK: cost of 1 {{.*}} load
%6 = load <8 x i32>* %5, align 4
+ ;CHECK: cost of 4 {{.*}} mul
%7 = mul nsw <8 x i32> %6, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
%8 = getelementptr inbounds i32* %A, i64 %index
%9 = bitcast i32* %8 to <8 x i32>*
%10 = load <8 x i32>* %9, align 4
+ ;CHECK: cost of 4 {{.*}} add
%11 = add nsw <8 x i32> %10, %7
;CHECK: cost of 1 {{.*}} store
store <8 x i32> %11, <8 x i32>* %9, align 4