author     Craig Topper <craig.topper@gmail.com>   2011-12-19 07:03:25 +0000
committer  Craig Topper <craig.topper@gmail.com>   2011-12-19 07:03:25 +0000
commit     9c2ffd803af03f1728423d0d73ff87d988642633 (patch)
tree       86b625709e1c89a4f1387ec25bb570bb2902bd79 /lib
parent     099e7f647ccda915513f2b2ec53352dc756082d3 (diff)
More AVX2 intrinsic support including saturating add/sub and palignr.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@146857 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--  lib/CodeGen/CGBuiltin.cpp | 38
-rw-r--r--  lib/Headers/avx2intrin.h  | 54
2 files changed, 92 insertions, 0 deletions
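
As a quick illustration of what this patch enables, here is a minimal standalone sketch (not part of the patch; the file name, driver code, and build line are assumptions) exercising the new saturating intrinsics. Built with something like `cc -mavx2 demo.c`, signed byte addition saturates at 127 instead of wrapping, and unsigned byte subtraction clamps at 0.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Every byte of a is 127 (INT8_MAX); the saturating add keeps it at
     127 instead of wrapping to -128. */
  __m256i a    = _mm256_set1_epi8(127);
  __m256i one  = _mm256_set1_epi8(1);
  __m256i sadd = _mm256_adds_epi8(a, one);

  /* Unsigned saturating subtract: 0 - 1 clamps to 0 rather than wrapping. */
  __m256i usub = _mm256_subs_epu8(_mm256_setzero_si256(), one);

  signed char   s[32];
  unsigned char u[32];
  _mm256_storeu_si256((__m256i *)s, sadd);
  _mm256_storeu_si256((__m256i *)u, usub);
  printf("adds_epi8 -> %d, subs_epu8 -> %u\n", s[0], u[0]); /* expect 127, 0 */
  return 0;
}
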
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index ffe5fffa12..71d515646e 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -2288,6 +2288,44 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// If palignr is shifting the pair of vectors more than 32 bytes, emit zero.
return llvm::Constant::getNullValue(ConvertType(E->getType()));
}
+ case X86::BI__builtin_ia32_palignr256: {
+ unsigned shiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+
+ // If palignr is shifting the pair of input vectors less than 17 bytes,
+ // emit a shuffle instruction.
+ if (shiftVal <= 16) {
+ SmallVector<llvm::Constant*, 32> Indices;
+ // 256-bit palignr operates on 128-bit lanes so we need to handle that
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned LaneStart = l * 16;
+ unsigned LaneEnd = (l+1) * 16;
+ for (unsigned i = 0; i != 16; ++i) {
+ unsigned Idx = shiftVal + i + LaneStart;
+ if (Idx >= LaneEnd) Idx += 16; // end of lane, switch operand
+ Indices.push_back(llvm::ConstantInt::get(Int32Ty, Idx));
+ }
+ }
+
+ Value* SV = llvm::ConstantVector::get(Indices);
+ return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr");
+ }
+
+ // If palignr is shifting the pair of input vectors more than 16 but less
+ // than 32 bytes, emit a logical right shift of the destination.
+ if (shiftVal < 32) {
+ llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 4);
+
+ Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast");
+ Ops[1] = llvm::ConstantInt::get(Int32Ty, (shiftVal-16) * 8);
+
+      // Emit the shift as a single avx2 psrl.dq intrinsic call.
+ llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_avx2_psrl_dq);
+ return Builder.CreateCall(F, makeArrayRef(&Ops[0], 2), "palignr");
+ }
+
+    // If palignr is shifting the pair of vectors by 32 bytes or more, emit zero.
+ return llvm::Constant::getNullValue(ConvertType(E->getType()));
+ }
case X86::BI__builtin_ia32_movntps:
case X86::BI__builtin_ia32_movntpd:
case X86::BI__builtin_ia32_movntdq:
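
To make the lane handling in the new palignr256 case easier to follow, the hedged sketch below (the helper name palignr256_indices and the driver are hypothetical) recomputes in plain C the shuffle indices emitted for shifts of at most 16 bytes; indices of 32 and above select bytes from the second shuffle operand (Ops[0]).

#include <stdio.h>

/* Scalar model of the index loop in the palignr256 case above:
   256-bit palignr works on two independent 128-bit lanes. */
static void palignr256_indices(unsigned shiftVal, unsigned idx[32]) {
  for (unsigned l = 0; l != 2; ++l) {
    unsigned LaneStart = l * 16;
    unsigned LaneEnd   = (l + 1) * 16;
    for (unsigned i = 0; i != 16; ++i) {
      unsigned Idx = shiftVal + i + LaneStart;
      /* Ran past the end of the lane: pull the byte from the second
         shuffle operand instead (indices 32..63). */
      if (Idx >= LaneEnd)
        Idx += 16;
      idx[LaneStart + i] = Idx;
    }
  }
}

int main(void) {
  unsigned idx[32];
  palignr256_indices(4, idx);   /* e.g. an immediate of 4 */
  for (unsigned i = 0; i != 32; ++i)
    printf("%u%c", idx[i], i == 31 ? '\n' : ' ');
  return 0;
}
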
diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h
index 1cfcac5c29..e4f1e14c90 100644
--- a/lib/Headers/avx2intrin.h
+++ b/lib/Headers/avx2intrin.h
@@ -95,6 +95,35 @@ _mm256_add_epi64(__m256i a, __m256i b)
}
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi8(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_paddsb256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi16(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_paddsw256((__v16hi)a, (__v16hi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu8(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_paddusb256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu16(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_paddusw256((__v16hi)a, (__v16hi)b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
+ __m256i __a = (a); \
+ __m256i __b = (b); \
+ (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); })
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sub_epi8(__m256i a, __m256i b)
{
return (__m256i)((__v32qi)a - (__v32qi)b);
@@ -117,3 +146,28 @@ _mm256_sub_epi64(__m256i a, __m256i b)
{
return a - b;
}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi8(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_psubsb256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi16(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_psubsw256((__v16hi)a, (__v16hi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu8(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_psubusb256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu16(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_psubusw256((__v16hi)a, (__v16hi)b);
+}
+
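
For reference, a hedged usage sketch of the new _mm256_alignr_epi8 macro (the driver code and expected values are illustrative assumptions): within each 128-bit lane the result is the 32-byte concatenation of the a and b lanes, a in the high half, shifted right by n bytes, so the low bytes come from b.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char av[32], bv[32], out[32];
  for (int i = 0; i < 32; ++i) {
    av[i] = (unsigned char)i;          /* a holds 0..31    */
    bv[i] = (unsigned char)(100 + i);  /* b holds 100..131 */
  }
  __m256i a = _mm256_loadu_si256((const __m256i *)av);
  __m256i b = _mm256_loadu_si256((const __m256i *)bv);

  /* Per 128-bit lane: concatenate a:b (a high, b low), shift right by
     4 bytes, and keep the low 16 bytes of each lane. */
  __m256i r = _mm256_alignr_epi8(a, b, 4);
  _mm256_storeu_si256((__m256i *)out, r);

  /* Expect lane 0 = 104..115 then 0..3, lane 1 = 120..131 then 16..19. */
  for (int i = 0; i < 32; ++i)
    printf("%u%c", out[i], i == 31 ? '\n' : ' ');
  return 0;
}
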