2 files changed, 26 insertions, 1 deletions
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5e88fcbb0e..08946070b4 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7792,6 +7792,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
     SmallVector<int, 8> NewMask;
     ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
 
+    // If the source shuffle has more than one user then do not try to optimize
+    // it because it may generate a more complex shuffle node. However, if the
+    // source shuffle is also a swizzle (a single source shuffle), our
+    // transformation is still likely to reduce the number of shuffles and only
+    // generate a simple shuffle node.
+    if (N0.getOperand(1).getOpcode() != ISD::UNDEF && !N0.hasOneUse())
+      return SDValue();
+
     EVT InVT = N0.getValueType();
     int InNumElts = InVT.getVectorNumElements();
 
@@ -7808,7 +7816,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
 
       NewMask.push_back(Idx);
     }
-
+    assert(NewMask.size() == VT.getVectorNumElements() && "Invalid mask size");
     return DAG.getVectorShuffle(VT, N->getDebugLoc(), OtherSV->getOperand(0),
                                 OtherSV->getOperand(1), &NewMask[0]);
   }
diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
index 11b702e3d1..224556deda 100644
--- a/test/CodeGen/X86/SwizzleShuff.ll
+++ b/test/CodeGen/X86/SwizzleShuff.ll
@@ -12,3 +12,20 @@ define void @pull_bitcast (<4 x i8>* %pA, <4 x i8>* %pB) {
   store <4 x i8> %C, <4 x i8>* %pA
   ret void
 }
+
+; CHECK: multi_use_swizzle
+; CHECK: mov
+; CHECK-NEXT: shuf
+; CHECK-NEXT: shuf
+; CHECK-NEXT: shuf
+; CHECK-NEXT: xor
+; CHECK-NEXT: ret
+define <4 x i32> @multi_use_swizzle (<4 x i32>* %pA, <4 x i32>* %pB) { ; regression test: a two-source shuffle that has two swizzle users
+  %A = load <4 x i32>* %pA
+  %B = load <4 x i32>* %pB
+  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 6> ; two-source shuffle (mask picks lanes from both %A and %B); used by %S1 and %S2
+  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 2> ; first swizzle of %S (second operand is undef)
+  %S2 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 2> ; second swizzle of %S, giving %S more than one use
+  %R = xor <4 x i32> %S1, %S2 ; both swizzles feed the returned value
+  ret <4 x i32> %R
+}