Make the argument passing stuff in the FCA (first-class aggregate) case smarter still, by avoiding making the FCA at all when the types exactly line up. For example, before we made:

  %struct.DeclGroup = type { i64, i64 }

  define i64 @_Z3foo9DeclGroup(i64, i64) nounwind {
  entry:
    %D = alloca %struct.DeclGroup, align 8 ; <%struct.DeclGroup*> [#uses=3]
    %2 = insertvalue %struct.DeclGroup undef, i64 %0, 0 ; <%struct.DeclGroup> [#uses=1]
    %3 = insertvalue %struct.DeclGroup %2, i64 %1, 1 ; <%struct.DeclGroup> [#uses=1]
    store %struct.DeclGroup %3, %struct.DeclGroup* %D
    %tmp = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 0 ; <i64*> [#uses=1]
    %tmp1 = load i64* %tmp ; <i64> [#uses=1]
    %tmp2 = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 1 ; <i64*> [#uses=1]
    %tmp3 = load i64* %tmp2 ; <i64> [#uses=1]
    %add = add nsw i64 %tmp1, %tmp3 ; <i64> [#uses=1]
    ret i64 %add
  }

... which has the pointless insertvalue, which fastisel hates. Now we make:

  %struct.DeclGroup = type { i64, i64 }

  define i64 @_Z3foo9DeclGroup(i64, i64) nounwind {
  entry:
    %D = alloca %struct.DeclGroup, align 8 ; <%struct.DeclGroup*> [#uses=4]
    %2 = getelementptr %struct.DeclGroup* %D, i32 0, i32 0 ; <i64*> [#uses=1]
    store i64 %0, i64* %2
    %3 = getelementptr %struct.DeclGroup* %D, i32 0, i32 1 ; <i64*> [#uses=1]
    store i64 %1, i64* %3
    %tmp = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 0 ; <i64*> [#uses=1]
    %tmp1 = load i64* %tmp ; <i64> [#uses=1]
    %tmp2 = getelementptr inbounds %struct.DeclGroup* %D, i32 0, i32 1 ; <i64*> [#uses=1]
    %tmp3 = load i64* %tmp2 ; <i64> [#uses=1]
    %add = add nsw i64 %tmp1, %tmp3 ; <i64> [#uses=1]
    ret i64 %add
  }

This only kicks in when x86-64 ABI lowering decides it likes us.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@107104 91177308-0d34-0410-b5e6-96231b3b80d8
author: Chris Lattner <sabre@nondot.org> 2010-06-29 00:06:42 +0000
committer: Chris Lattner <sabre@nondot.org> 2010-06-29 00:06:42 +0000
commit: 309c59f6d3a4fd883fdf87334271df2c55338aae (patch)
tree: 3da3e772937453a52cb9695d195ea89c14dd63bd /lib/CodeGen/CGCall.cpp
parent: 4d072932287eb074a4168804cac1acb18a51d5e8 (diff)
1 files changed, 46 insertions, 21 deletions
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index eb517edd81..4d72d91cb7 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -894,29 +894,41 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
       continue;
 
     case ABIArgInfo::Coerce: {
+      // FIXME: This is very wasteful; EmitParmDecl is just going to drop the
+      // result in a new alloca anyway, so we could just store into that
+      // directly if we broke the abstraction down more.
+      llvm::Value *V = CreateMemTemp(Ty, "coerce");
+      
       // If the coerce-to type is a first class aggregate, we flatten it and
       // pass the elements. Either way is semantically identical, but fast-isel
       // and the optimizer generally likes scalar values better than FCAs.
-      llvm::Value *FormalArg;
       if (const llvm::StructType *STy =
             dyn_cast<llvm::StructType>(ArgI.getCoerceToType())) {
-        // Reconstruct the FCA here.
-        // FIXME: If we have a direct match, do nice gep/store series.
-        FormalArg = llvm::UndefValue::get(STy);
-        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
-          assert(AI != Fn->arg_end() && "Argument mismatch!");
-          FormalArg = Builder.CreateInsertValue(FormalArg, AI++, i);
+        // If the argument and alloca types match up, we don't have to build the
+        // FCA at all, emit a series of GEPs and stores, which is better for
+        // fast isel.
+        if (STy == cast<llvm::PointerType>(V->getType())->getElementType()) {
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+            assert(AI != Fn->arg_end() && "Argument mismatch!");
+            llvm::Value *EltPtr = Builder.CreateConstGEP2_32(V, 0, i);
+            Builder.CreateStore(AI++, EltPtr);
+          }
+        } else {
+          // Reconstruct the FCA here so we can do a coerced store.
+          llvm::Value *FormalArg = llvm::UndefValue::get(STy);
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+            assert(AI != Fn->arg_end() && "Argument mismatch!");
+            FormalArg = Builder.CreateInsertValue(FormalArg, AI++, i);
+          }
+          CreateCoercedStore(FormalArg, V, /*DestIsVolatile=*/false, *this);
         }
       } else {
+        // Simple case, just do a coerced store of the argument into the alloca.
         assert(AI != Fn->arg_end() && "Argument mismatch!");
-        FormalArg = AI++;
+        CreateCoercedStore(AI++, V, /*DestIsVolatile=*/false, *this);
       }
       
-      // FIXME: This is very wasteful; EmitParmDecl is just going to drop the
-      // result in a new alloca anyway, so we could just store into that
-      // directly if we broke the abstraction down more.
-      llvm::Value *V = CreateMemTemp(Ty, "coerce");
-      CreateCoercedStore(FormalArg, V, /*DestIsVolatile=*/false, *this);
+      
       // Match to what EmitParmDecl is expecting for this type.
       if (!CodeGenFunction::hasAggregateLLVMType(Ty)) {
         V = EmitLoadOfScalar(V, false, Ty);
@@ -1116,19 +1128,32 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       } else
         SrcPtr = RV.getAggregateAddr();
       
-      llvm::Value *SrcVal = 
-        CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(), *this);
-      
       // If the coerce-to type is a first class aggregate, we flatten it and
       // pass the elements. Either way is semantically identical, but fast-isel
       // and the optimizer generally likes scalar values better than FCAs.
       if (const llvm::StructType *STy =
-            dyn_cast<llvm::StructType>(SrcVal->getType())) {
-        // Extract the elements of the value to pass in.
-        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
-          Args.push_back(Builder.CreateExtractValue(SrcVal, i));
+            dyn_cast<llvm::StructType>(ArgInfo.getCoerceToType())) {
+        // If the argument and alloca types match up, we don't have to build the
+        // FCA at all, emit a series of GEPs and loads, which is better for
+        // fast isel.
+        if (STy ==cast<llvm::PointerType>(SrcPtr->getType())->getElementType()){
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+            llvm::Value *EltPtr = Builder.CreateConstGEP2_32(SrcPtr, 0, i);
+            Args.push_back(Builder.CreateLoad(EltPtr));
+          }
+        } else {
+          // Otherwise, do a coerced load the entire FCA and handle the pieces.
+          llvm::Value *SrcVal = 
+            CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(), *this);
+
+          // Extract the elements of the value to pass in.
+          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+            Args.push_back(Builder.CreateExtractValue(SrcVal, i));
+        }
       } else {
-        Args.push_back(SrcVal);
+        // In the simple case, just pass the coerced loaded value.
+        Args.push_back(CreateCoercedLoad(SrcPtr, ArgInfo.getCoerceToType(),
+                                         *this));
       }
       
       break;
author	Chris Lattner <sabre@nondot.org>	2010-06-29 00:06:42 +0000
committer	Chris Lattner <sabre@nondot.org>	2010-06-29 00:06:42 +0000
commit	309c59f6d3a4fd883fdf87334271df2c55338aae (patch)
tree	3da3e772937453a52cb9695d195ea89c14dd63bd /lib/CodeGen/CGCall.cpp
parent	4d072932287eb074a4168804cac1acb18a51d5e8 (diff)