2 files changed, 36 insertions, 5 deletions
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index 9060337180..22292603f6 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -3221,16 +3221,15 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, int *VFPRegs,
 
   // Support byval for ARM.
   // The ABI alignment for APCS is 4-byte and for AAPCS at least 4-byte and at most 8-byte.
-  // Byval can't handle the case where type alignment is bigger than ABI alignment.
-  // We also increase the threshold for byval due to its overhead.
+  // We realign the indirect argument if type alignment is bigger than ABI alignment.
   uint64_t ABIAlign = 4;
   uint64_t TyAlign = getContext().getTypeAlign(Ty) / 8;
   if (getABIKind() == ARMABIInfo::AAPCS_VFP ||
       getABIKind() == ARMABIInfo::AAPCS)
     ABIAlign = std::min(std::max(TyAlign, (uint64_t)4), (uint64_t)8);
-  if (getContext().getTypeSizeInChars(Ty) > CharUnits::fromQuantity(64*8) &&
-      TyAlign <= ABIAlign) {
-    return ABIArgInfo::getIndirect(0, /*ByVal=*/true);
+  if (getContext().getTypeSizeInChars(Ty) > CharUnits::fromQuantity(64)) {
+    return ABIArgInfo::getIndirect(0, /*ByVal=*/true,
+           /*Realign=*/TyAlign <= ABIAlign ? false : true);
   }
 
   // Otherwise, pass by coercing to a structure of the appropriate size.
diff --git a/test/CodeGen/arm-arguments.c b/test/CodeGen/arm-arguments.c
index 8aa33f696d..63ecd4c599 100644
--- a/test/CodeGen/arm-arguments.c
+++ b/test/CodeGen/arm-arguments.c
@@ -191,3 +191,35 @@ void g34(struct s34 *s) { f34(*s); }
 // AAPCS: %[[a:.*]] = alloca { [1 x i32] }
 // AAPCS: %[[gep:.*]] = getelementptr { [1 x i32] }* %[[a]], i32 0, i32 0
 // AAPCS: load [1 x i32]* %[[gep]]
+
+// rdar://12596507
+struct s35
+{
+   float v[18]; //make sure byval is on.
+} __attribute__((aligned(16)));
+typedef struct s35 s35_with_align;
+
+typedef __attribute__((neon_vector_type(4))) float float32x4_t;
+static __attribute__((__always_inline__, __nodebug__)) float32x4_t vaddq_f32(
+       float32x4_t __a, float32x4_t __b) {
+ return __a + __b;
+}
+float32x4_t f35(int i, s35_with_align s1, s35_with_align s2) {
+  float32x4_t v = vaddq_f32(*(float32x4_t *)&s1,
+                            *(float32x4_t *)&s2);
+  return v;
+}
+// APCS-GNU: define <4 x float> @f35(i32 %i, %struct.s35* byval, %struct.s35* byval)
+// APCS-GNU: %[[a:.*]] = alloca %struct.s35, align 16
+// APCS-GNU: %[[b:.*]] = bitcast %struct.s35* %[[a]] to i8*
+// APCS-GNU: %[[c:.*]] = bitcast %struct.s35* %0 to i8*
+// APCS-GNU: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[b]], i8* %[[c]]
+// APCS-GNU: %[[d:.*]] = bitcast %struct.s35* %[[a]] to <4 x float>*
+// APCS-GNU: load <4 x float>* %[[d]], align 16
+// AAPCS: define arm_aapcscc <4 x float> @f35(i32 %i, %struct.s35* byval, %struct.s35* byval)
+// AAPCS: %[[a:.*]] = alloca %struct.s35, align 16
+// AAPCS: %[[b:.*]] = bitcast %struct.s35* %[[a]] to i8*
+// AAPCS: %[[c:.*]] = bitcast %struct.s35* %0 to i8*
+// AAPCS: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[b]], i8* %[[c]]
+// AAPCS: %[[d:.*]] = bitcast %struct.s35* %[[a]] to <4 x float>*
+// AAPCS: load <4 x float>* %[[d]], align 16