Diffstat (limited to 'lib')
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.cpp | 10
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.cpp      | 27
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.h        | 14
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp             | 63
-rw-r--r--  lib/Target/X86/X86ISelLowering.h               |  1
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td        |  2
-rw-r--r--  lib/Target/X86/X86InstrSSE.td                  |  4
7 files changed, 121 insertions(+), 0 deletions(-)
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 4e28dfe7fa..b2f246e441 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -205,6 +205,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKHPMask(4, ShuffleMask);
Src1Name = getRegName(MI->getOperand(0).getReg());
break;
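+ // e.g. vpermilps $0x1b, %ymm1, %ymm0 is annotated as
+ // ymm0 = ymm1[3,2,1,0,7,6,5,4].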
+ case X86::VPERMILPSYri:
+ DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+ case X86::VPERMILPDYri:
+ DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ break;
}
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index cd06060748..c1ff0e5011 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -187,4 +187,31 @@ void DecodeUNPCKLPMask(EVT VT,
}
}
+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask) {
+ DecodeVPERMILMask(MVT::getVectorVT(MVT::i32, NElts), Imm, ShuffleMask);
+}
+
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask) {
+ DecodeVPERMILMask(MVT::getVectorVT(MVT::i64, NElts), Imm, ShuffleMask);
+}
+
+// DecodeVPERMILMask - Decodes VPERMIL permutes for 128-bit vectors with
+// 32/64-bit elements, and for 256-bit vectors treated as two 128-bit lanes.
+// For 32-bit elements the same 2-bit field selects into both lanes, so the
+// high-lane pattern must repeat the low-lane one; 64-bit elements each get
+// their own control bit.
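+// For example, for v8f32 and Imm == 0x1B (0b00011011) each lane decodes to
+// <3,2,1,0>, producing the full mask <3,2,1,0,7,6,5,4>; for v4f64 and
+// Imm == 0b0101 the mask is <1,0,3,2>.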
+void DecodeVPERMILMask(EVT VT, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned LaneSize = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = 0; i != LaneSize; ++i) {
+ unsigned Idx;
+ if (LaneSize == 2) // 64-bit elements: one control bit per element.
+ Idx = (Imm >> (l*LaneSize+i)) & 0x1;
+ else // 32-bit elements: a 2-bit field per element, shared by both lanes.
+ Idx = (Imm >> (i*2)) & 0x3;
+ ShuffleMask.push_back(Idx + l*LaneSize);
+ }
+ }
+}
+
} // llvm namespace
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index b18f670330..4a5214028f 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -82,6 +82,20 @@ void DecodeUNPCKLPDMask(unsigned NElts,
void DecodeUNPCKLPMask(EVT VT,
SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask);
+
+// DecodeVPERMILMask - Decodes VPERMIL permutes for 128-bit vectors with
+// 32/64-bit elements, and for 256-bit vectors treated as two 128-bit lanes.
+// For 32-bit elements the same 2-bit field selects into both lanes, so the
+// high-lane pattern must repeat the low-lane one; 64-bit elements each get
+// their own control bit.
+void DecodeVPERMILMask(EVT VT, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask);
+
} // llvm namespace
#endif
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c4e44f999d..1c827f0dd3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2747,6 +2747,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
case X86ISD::PUNPCKHQDQ:
+ case X86ISD::VPERMIL:
return true;
}
return false;
@@ -2772,6 +2773,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
+ case X86ISD::VPERMIL:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
@@ -3422,6 +3424,54 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
return ::isMOVLMask(M, N->getValueType(0));
}
+/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
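+/// For example, for v8f32 the mask <1,0,3,2,5,4,7,6> is accepted (the high
+/// lane repeats the low lane at offset 4), while <0,1,2,3,5,4,6,7> is
+/// rejected since Mask[4]-Mask[0] != 4.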
+static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+
+ // Match any permutation of 128-bit vector with 32/64-bit types
+ if (NumLanes == 1) {
+ if (NumElts == 4 || NumElts == 2)
+ return true;
+ return false;
+ }
+
+ // Only match 256-bit with 32/64-bit types
+ if (NumElts != 8 && NumElts != 4)
+ return false;
+
+ // The mask on the high lane should be the same as the low. Actually,
+ // they can differ if any of the corresponding index in a lane is undef.
+ int LaneSize = NumElts/NumLanes;
+ for (int i = 0; i < LaneSize; ++i) {
+ int HighElt = i+LaneSize;
+ if (Mask[i] < 0 || Mask[HighElt] < 0)
+ continue;
+
+ if (Mask[HighElt]-Mask[i] != LaneSize)
+ return false;
+ }
+
+ return true;
+}
+
+/// getShuffleVPERMILImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMIL* instructions.
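+/// For example, the v8f32 mask <1,0,3,2,5,4,7,6> encodes its low lane at two
+/// bits per element, yielding Imm == 0b10110001 == 0xB1.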
+static unsigned getShuffleVPERMILImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+
+ int LaneSize = NumElts/NumLanes;
+ unsigned Mask = 0;
+ for (int i = 0; i != LaneSize; ++i) {
+ int MaskElt = SVOp->getMaskElt(i);
+ if (MaskElt < 0 && NumLanes > 1) // Undef: try the high-lane counterpart.
+ MaskElt = SVOp->getMaskElt(i+LaneSize) - LaneSize;
+ if (MaskElt < 0) // Still undef: any value works, use 0.
+ MaskElt = 0;
+ Mask |= MaskElt << (i*2);
+ }
+
+ return Mask;
+}
+
/// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
/// x86 movss wants: the lowest element must come from vector 2 and the other
/// elements must come from vector 1, in order.
@@ -4097,6 +4147,10 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
+ case X86ISD::VPERMIL:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
default:
assert("not implemented for target shuffle node");
return SDValue();
@@ -6043,6 +6097,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (NumElems == 4)
return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+ // Handle VPERMIL permutations
+ if (isVPERMILMask(M, VT)) {
+ unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
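+ // Only v8f32 is lowered for now; it is the only type with a VPERMILPSY
+ // selection pattern (see X86InstrSSE.td).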
+ if (VT == MVT::v8f32)
+ return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
+ }
+
return SDValue();
}
@@ -9660,6 +9721,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
+ case X86ISD::VPERMIL: return "X86ISD::VPERMIL";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
@@ -12465,6 +12527,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::VPERMIL:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 376aa8a440..298d45110f 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -271,6 +271,7 @@ namespace llvm {
PUNPCKHWD,
PUNPCKHDQ,
PUNPCKHQDQ,
+ VPERMIL,
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
// according to %al. An operator is needed so that this can be expanded
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index b00109c9fa..f792f53b7b 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -150,6 +150,8 @@ def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;
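+// VPERMIL takes a single source vector and an i8 immediate control mask.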
+def X86VPermil : SDNode<"X86ISD::VPERMIL", SDTShuff2OpI>;
+
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ae26a80fe2..73e465fe29 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5529,6 +5529,10 @@ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
// The AVX versions of some but not all of them are described here, and more
// should come in the near future.
+// Shuffle with VPERMIL instructions
+def : Pat<(v8f32 (X86VPermil VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPSYri VR256:$src1, imm:$imm)>;
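+// For example, a v8f32 shuffle with mask <3,2,1,0,7,6,5,4> is selected as
+// (VPERMILPSYri VR256:$src1, 0x1B).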
+
// Shuffle with PSHUFD instruction folding loads. The first two patterns match
// SSE2 loads, which are always promoted to v2i64. The last one should match
// the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD