author     Evan Cheng <evan.cheng@apple.com>  2012-12-10 23:21:26 +0000
committer  Evan Cheng <evan.cheng@apple.com>  2012-12-10 23:21:26 +0000
commit     376642ed620ecae05b68c7bc81f79aeb2065abe0 (patch)
tree       9757b2568050b3ab58af15c234df3bc9f66202b0 /lib/CodeGen/SelectionDAG/SelectionDAG.cpp
parent     2b475922e6169098606006a69d765160caa77848 (diff)
Some enhancements for memcpy / memset inline expansion.
1. Teach it to use overlapping unaligned load / store to copy / set the trailing bytes. e.g. On x86, use two pairs of movups / movaps for 17 - 31 byte copies.
2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is. e.g. x86 and ARM.
3. When memcpy from a constant string, do *not* replace the load with a constant if it's not possible to materialize an integer immediate with a single instruction (this required a new target hook: TLI.isIntImmLegal()).
4. Use unaligned load / stores more aggressively if target hooks indicate they are "fast".
5. Update ARM target hooks to use unaligned load / stores. e.g. vld1.8 / vst1.8. Also increase the threshold to something reasonable (8 for memset, 4 pairs for memcpy).

This significantly improves Dhrystone, up to 50% on ARM iOS devices.

rdar://12760078

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169791 91177308-0d34-0410-b5e6-96231b3b80d8
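(Editorial note, not part of the commit.) Point 1 is easiest to see outside SelectionDAG: with fast unaligned 16-byte accesses, any 17 - 31 byte copy can be done with two possibly-overlapping 16-byte moves instead of a shrinking sequence of 8/4/2/1-byte tail copies. A minimal C++ sketch of the idea, assuming 17 <= Size <= 32:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Two 16-byte moves cover the whole range; the second overlaps the first by
// 32 - Size bytes, which is harmless for memcpy because the overlapping bytes
// are simply written again with the same values.
static void copyTrailingOverlap(uint8_t *Dst, const uint8_t *Src, size_t Size) {
  std::memcpy(Dst, Src, 16);                          // bytes [0, 16)
  std::memcpy(Dst + Size - 16, Src + Size - 16, 16);  // bytes [Size-16, Size)
}
```

In the patch itself this shows up as the `VTSize = Size` path in FindOptimalMemOpLowering and the offset adjustments in the memcpy / memset emitters below.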
Diffstat (limited to 'lib/CodeGen/SelectionDAG/SelectionDAG.cpp')
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp  82
1 file changed, 64 insertions(+), 18 deletions(-)
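(Editorial note, not part of the commit.) The offset adjustment the emitters in the diff below perform for that final overlapping operation is just a slide-back by `VTSize - Size`. A self-contained sketch with hypothetical numbers for a 20-byte copy lowered as two 16-byte operations:

```cpp
#include <cassert>

int main() {
  unsigned VTSize = 16;   // width of the last, overlapping operation
  unsigned Size   = 4;    // bytes still uncovered when it is emitted
  unsigned DstOff = 16;   // where a non-overlapping scheme would write next
  if (VTSize > Size)
    DstOff -= VTSize - Size;  // slide back by 12; bytes [4, 20) get written
  assert(DstOff == 4);
  return 0;
}
```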
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 7dd57d5420..29339405aa 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3373,7 +3373,7 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
unsigned NumVTBytes = VT.getSizeInBits() / 8;
unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));
- uint64_t Val = 0;
+ APInt Val(NumBytes*8, 0);
if (TLI.isLittleEndian()) {
for (unsigned i = 0; i != NumBytes; ++i)
Val |= (uint64_t)(unsigned char)Str[i] << i*8;
@@ -3382,7 +3382,9 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
}
- return DAG.getConstant(Val, VT);
+ if (TLI.isIntImmLegal(Val, VT))
+ return DAG.getConstant(Val, VT);
+ return SDValue(0, 0);
}
/// getMemBasePlusOffset - Returns base and offset node for the
@@ -3422,6 +3424,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
unsigned DstAlign, unsigned SrcAlign,
bool IsZeroVal,
bool MemcpyStrSrc,
+ bool AllowOverlap,
SelectionDAG &DAG,
const TargetLowering &TLI) {
assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
@@ -3461,24 +3464,47 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
unsigned NumMemOps = 0;
while (Size != 0) {
+ if (++NumMemOps > Limit)
+ return false;
+
unsigned VTSize = VT.getSizeInBits() / 8;
while (VTSize > Size) {
// For now, only use non-vector load / store's for the left-over pieces.
+ EVT NewVT;
+ unsigned NewVTSize;
if (VT.isVector() || VT.isFloatingPoint()) {
- VT = MVT::i64;
- while (!TLI.isTypeLegal(VT))
- VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
- VTSize = VT.getSizeInBits() / 8;
+ NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
+ while (!TLI.isOperationLegalOrCustom(ISD::STORE, NewVT)) {
+ if (NewVT == MVT::i64 &&
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64)) {
+ // i64 is usually not legal on 32-bit targets, but f64 may be.
+ NewVT = MVT::f64;
+ break;
+ }
+ NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
+ }
+ NewVTSize = NewVT.getSizeInBits() / 8;
} else {
// This can result in a type that is not legal on the target, e.g.
// 1 or 2 bytes on PPC.
- VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
- VTSize >>= 1;
+ NewVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+ NewVTSize = VTSize >> 1;
+ }
+
+ // If the new VT cannot cover all of the remaining bits, then consider
+ // issuing a (or a pair of) unaligned and overlapping load / store.
+ // FIXME: Only does this for 64-bit or more since we don't have proper
+ // cost model for unaligned load / store.
+ bool Fast;
+ if (AllowOverlap && VTSize >= 8 && NewVTSize < Size &&
+ TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast)
+ VTSize = Size;
+ else {
+ VT = NewVT;
+ VTSize = NewVTSize;
}
}
- if (++NumMemOps > Limit)
- return false;
MemOps.push_back(VT);
Size -= VTSize;
}
@@ -3523,7 +3549,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
(isZeroStr ? 0 : SrcAlign),
- true, CopyFromStr, DAG, TLI))
+ true, CopyFromStr, true, DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -3545,6 +3571,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value, Store;
+ if (VTSize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ assert(i == NumMemOps-1 && i != 0);
+ SrcOff -= VTSize - Size;
+ DstOff -= VTSize - Size;
+ }
+
if (CopyFromStr &&
(isZeroStr || (VT.isInteger() && !VT.isVector()))) {
// It's unlikely a store of a vector immediate can be done in a single
@@ -3553,11 +3587,14 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
// FIXME: Handle other cases where store of vector immediate is done in
// a single instruction.
Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
- Store = DAG.getStore(Chain, dl, Value,
- getMemBasePlusOffset(Dst, DstOff, DAG),
- DstPtrInfo.getWithOffset(DstOff), isVol,
- false, Align);
- } else {
+ if (Value.getNode())
+ Store = DAG.getStore(Chain, dl, Value,
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstPtrInfo.getWithOffset(DstOff), isVol,
+ false, Align);
+ }
+
+ if (!Store.getNode()) {
// The type might not be legal for the target. This should only happen
// if the type is smaller than a legal type, as on PPC, so the right
// thing to do is generate a LoadExt/StoreTrunc pair. These simplify
@@ -3577,6 +3614,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
OutChains.push_back(Store);
SrcOff += VTSize;
DstOff += VTSize;
+ Size -= VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
@@ -3613,7 +3651,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
- SrcAlign, true, false, DAG, TLI))
+ SrcAlign, true, false, false, DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -3689,7 +3727,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
Size, (DstAlignCanChange ? 0 : Align), 0,
- IsZeroVal, false, DAG, TLI))
+ IsZeroVal, false, true, DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
@@ -3716,6 +3754,13 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ if (VTSize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ assert(i == NumMemOps-1 && i != 0);
+ DstOff -= VTSize - Size;
+ }
// If this store is smaller than the largest store see whether we can get
// the smaller value for free with a truncate.
@@ -3734,6 +3779,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
isVol, false, Align);
OutChains.push_back(Store);
DstOff += VT.getSizeInBits() / 8;
+ Size -= VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
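(Editorial note, not part of the commit.) The TLI.isIntImmLegal() call added in getMemsetStringVal above relies on a new TargetLowering hook that lives outside this file, so its declaration does not appear in this diff. The sketch below only mirrors the shape implied by the call site (an APInt immediate and an EVT), with a permissive default that a target such as ARM would override to reject immediates needing more than one instruction to materialize:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/ValueTypes.h"

// Hypothetical stand-in for the real hook; the class name and default
// behaviour are assumptions, not copied from TargetLowering.h.
class TargetLoweringSketch {
public:
  virtual ~TargetLoweringSketch() {}
  // Return true if Imm of type VT can be materialized with one instruction.
  virtual bool isIntImmLegal(const llvm::APInt &Imm, llvm::EVT VT) const {
    (void)Imm; (void)VT;
    return true;  // accept every immediate unless a target says otherwise
  }
};
```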