diff options
author    Craig Topper <craig.topper@gmail.com>  2011-12-19 07:03:25 +0000
committer Craig Topper <craig.topper@gmail.com>  2011-12-19 07:03:25 +0000
commit    9c2ffd803af03f1728423d0d73ff87d988642633 (patch)
tree      86b625709e1c89a4f1387ec25bb570bb2902bd79 /lib
parent    099e7f647ccda915513f2b2ec53352dc756082d3 (diff)
More AVX2 intrinsic support including saturating add/sub and palignr.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@146857 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
 lib/CodeGen/CGBuiltin.cpp | 38 ++++++++++++++++++++++++++++++++++++++
 lib/Headers/avx2intrin.h  | 54 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 0 deletions(-)
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index ffe5fffa12..71d515646e 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -2288,6 +2288,44 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // If palignr is shifting the pair of vectors more than 32 bytes, emit zero. return llvm::Constant::getNullValue(ConvertType(E->getType())); } + case X86::BI__builtin_ia32_palignr256: { + unsigned shiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); + + // If palignr is shifting the pair of input vectors less than 17 bytes, + // emit a shuffle instruction. + if (shiftVal <= 16) { + SmallVector<llvm::Constant*, 32> Indices; + // 256-bit palignr operates on 128-bit lanes so we need to handle that + for (unsigned l = 0; l != 2; ++l) { + unsigned LaneStart = l * 16; + unsigned LaneEnd = (l+1) * 16; + for (unsigned i = 0; i != 16; ++i) { + unsigned Idx = shiftVal + i + LaneStart; + if (Idx >= LaneEnd) Idx += 16; // end of lane, switch operand + Indices.push_back(llvm::ConstantInt::get(Int32Ty, Idx)); + } + } + + Value* SV = llvm::ConstantVector::get(Indices); + return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr"); + } + + // If palignr is shifting the pair of input vectors more than 16 but less + // than 32 bytes, emit a logical right shift of the destination. + if (shiftVal < 32) { + llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 4); + + Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast"); + Ops[1] = llvm::ConstantInt::get(Int32Ty, (shiftVal-16) * 8); + + // create i32 constant + llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_avx2_psrl_dq); + return Builder.CreateCall(F, makeArrayRef(&Ops[0], 2), "palignr"); + } + + // If palignr is shifting the pair of vectors more than 32 bytes, emit zero. 
+ return llvm::Constant::getNullValue(ConvertType(E->getType())); + } case X86::BI__builtin_ia32_movntps: case X86::BI__builtin_ia32_movntpd: case X86::BI__builtin_ia32_movntdq: diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h index 1cfcac5c29..e4f1e14c90 100644 --- a/lib/Headers/avx2intrin.h +++ b/lib/Headers/avx2intrin.h @@ -95,6 +95,35 @@ _mm256_add_epi64(__m256i a, __m256i b) } static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_adds_epi8(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_paddsb256((__v32qi)a, (__v32qi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_adds_epi16(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_paddsw256((__v16hi)a, (__v16hi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_adds_epu8(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_paddusb256((__v32qi)a, (__v32qi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_adds_epu16(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_paddusw256((__v16hi)a, (__v16hi)b); +} + +#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \ + __m256i __a = (a); \ + __m256i __b = (b); \ + (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); }) + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_sub_epi8(__m256i a, __m256i b) { return (__m256i)((__v32qi)a - (__v32qi)b); @@ -117,3 +146,28 @@ _mm256_sub_epi64(__m256i a, __m256i b) { return a - b; } + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_subs_epi8(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_psubsb256((__v32qi)a, (__v32qi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_subs_epi16(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_psubsw256((__v16hi)a, (__v16hi)b); +} + +static __inline__ __m256i 
__attribute__((__always_inline__, __nodebug__)) +_mm256_subs_epu8(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_psubusb256((__v32qi)a, (__v32qi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_subs_epu16(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_psubusw256((__v16hi)a, (__v16hi)b); +} + |