 lib/CodeGen/CGCall.cpp           | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
 test/CodeGen/byval-memcpy-elim.c | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 5 deletions(-)
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index 712ae89a48..5e9ecd574b 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -1263,12 +1263,51 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                             Alignment, I->Ty);
         else
           StoreComplexToAddr(RV.getComplexVal(), Args.back(), false);
-      } else if (I->NeedsCopy && !ArgInfo.getIndirectByVal()) {
-        Args.push_back(CreateMemTemp(I->Ty));
-        EmitAggregateCopy(Args.back(), RV.getAggregateAddr(), I->Ty,
-                          RV.isVolatileQualified());
       } else {
-        Args.push_back(RV.getAggregateAddr());
+        // We want to avoid creating an unnecessary temporary+copy here;
+        // however, we need one in two cases:
+        // 1. If the argument is not byval, and we are required to copy the
+        //    source. (This case doesn't occur on any common architecture.)
+        // 2. If the argument is byval, RV is not sufficiently aligned, and
+        //    we cannot force it to be sufficiently aligned.
+        // FIXME: This code is ugly because we don't know the required
+        // alignment when RV is generated.
+        llvm::AllocaInst *AI =
+            dyn_cast<llvm::AllocaInst>(RV.getAggregateAddr());
+        bool NeedsAggCopy = false;
+        if (I->NeedsCopy && !ArgInfo.getIndirectByVal())
+          NeedsAggCopy = true;
+        if (ArgInfo.getIndirectByVal()) {
+          if (AI) {
+            // The source is an alloca; we can force appropriate alignment.
+            if (ArgInfo.getIndirectAlign() > AI->getAlignment())
+              AI->setAlignment(ArgInfo.getIndirectAlign());
+          } else if (llvm::Argument *A =
+                         dyn_cast<llvm::Argument>(RV.getAggregateAddr())) {
+            // Check if the source is an appropriately aligned byval argument.
+            if (!A->hasByValAttr() ||
+                A->getParamAlignment() < ArgInfo.getIndirectAlign())
+              NeedsAggCopy = true;
+          } else {
+            // We don't know what the input is; force a temporary+copy if
+            // the type alignment is not sufficient.
+            assert(!I->NeedsCopy && "Temporary must be AllocaInst");
+            if (ArgInfo.getIndirectAlign() > Alignment)
+              NeedsAggCopy = true;
+          }
+        }
+        if (NeedsAggCopy) {
+          // Create an aligned temporary, and copy to it.
+          AI = CreateMemTemp(I->Ty);
+          if (ArgInfo.getIndirectAlign() > AI->getAlignment())
+            AI->setAlignment(ArgInfo.getIndirectAlign());
+          Args.push_back(AI);
+          EmitAggregateCopy(AI, RV.getAggregateAddr(), I->Ty,
+                            RV.isVolatileQualified());
+        } else {
+          // Skip the extra memcpy call.
+          Args.push_back(RV.getAggregateAddr());
+        }
       }
       break;
     }
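The comment block in the hunk above enumerates when a temporary+copy is still required. As a reading aid, here is a minimal standalone C++ sketch of that decision; the names (AggregateSource, needsAggCopy) and the simplified alignment model are hypothetical and are not the clang/LLVM API:

#include <cstdio>

// Where the aggregate being passed currently lives.
enum class SourceKind { Alloca, ByValArgument, Unknown };

struct AggregateSource {
  SourceKind Kind;
  unsigned Alignment; // current known alignment of the source memory
};

// Decide whether an aligned temporary+copy is needed before the call.
// RequiredAlign is the alignment the byval ABI demands; TypeAlign is the
// type's natural alignment, which is all that arbitrary memory guarantees.
bool needsAggCopy(AggregateSource &Src, unsigned RequiredAlign,
                  unsigned TypeAlign) {
  switch (Src.Kind) {
  case SourceKind::Alloca:
    // An alloca's alignment can simply be raised in place; no copy needed.
    if (RequiredAlign > Src.Alignment)
      Src.Alignment = RequiredAlign;
    return false;
  case SourceKind::ByValArgument:
    // A byval argument is already a caller-made copy; reuse it only if the
    // caller aligned it at least as strictly as the callee requires.
    return Src.Alignment < RequiredAlign;
  case SourceKind::Unknown:
    // Arbitrary memory: copy whenever the ABI wants more than the type's
    // natural alignment.
    return RequiredAlign > TypeAlign;
  }
  return true;
}

int main() {
  // Mirrors test2 in the test below: a source known only to the type's
  // 4-byte natural alignment, where the ABI wants 8, forces the copy.
  AggregateSource Unknown{SourceKind::Unknown, 4};
  std::printf("copy needed: %d\n", needsAggCopy(Unknown, 8, 4));
}

The asymmetry the patch exploits is that an alloca's alignment can be raised retroactively, while the alignment of an incoming pointer is fixed by the caller.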
diff --git a/test/CodeGen/byval-memcpy-elim.c b/test/CodeGen/byval-memcpy-elim.c
index 8aa08fb07b..76cdafb5e8 100644
--- a/test/CodeGen/byval-memcpy-elim.c
+++ b/test/CodeGen/byval-memcpy-elim.c
@@ -18,3 +18,36 @@ void test1a(struct Test1S, struct Test2S);
 void test1(struct Test1S *A, struct Test2S *B) {
   test1a(*A, *B);
 }
+
+// The above gets trickier when the byval argument requires higher alignment
+// than the natural alignment of the type in question.
+// rdar://9483886
+
+// Make sure we do generate a memcpy when we cannot guarantee alignment.
+struct Test3S {
+  int a,b,c,d,e,f,g,h,i,j,k,l;
+};
+void test2a(struct Test3S q);
+// CHECK: define void @test2(
+// CHECK: alloca %struct.Test3S, align 8
+// CHECK: memcpy
+// CHECK: call void @test2a
+void test2(struct Test3S *q) {
+  test2a(*q);
+}
+
+// But make sure we don't generate a memcpy when we can guarantee alignment.
+void fooey(void);
+// CHECK: define void @test3(
+// CHECK: alloca %struct.Test3S, align 8
+// CHECK: call void @fooey
+// CHECK-NOT: memcpy
+// CHECK: call void @test2a
+// CHECK-NOT: memcpy
+// CHECK: call void @test2a
+void test3(struct Test3S a) {
+  struct Test3S b = a;
+  fooey();
+  test2a(a);
+  test2a(b);
+}
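For context on why struct Test3S exercises the alignment path: its natural alignment is only that of int, while the byval convention the test's CHECK lines expect passes the aggregate with 8-byte alignment, so an arbitrary *q in test2 cannot be assumed sufficiently aligned. A small standalone C++ illustration follows; it is not part of the test file, and the printed values assume a typical x86-64 target:

#include <cstdio>

struct Test3S {
  int a, b, c, d, e, f, g, h, i, j, k, l;
};

int main() {
  // On common x86-64 targets this prints 48 and 4: the struct's natural
  // alignment (4) is below the 8 bytes the byval slot requires, which is
  // why test2 must emit a defensive memcpy, while test3's sources (a
  // suitably aligned byval argument and a realignable alloca) need none.
  std::printf("sizeof(Test3S)  = %zu\n", sizeof(Test3S));
  std::printf("alignof(Test3S) = %zu\n", alignof(Test3S));
}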