aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBob Wilson <bob.wilson@apple.com>2010-08-27 17:13:24 +0000
committerBob Wilson <bob.wilson@apple.com>2010-08-27 17:13:24 +0000
commit7a9ef44b3b4ddd2dd9a7f92fc8b46c5e5bed6a81 (patch)
treecc09249c520967314da61ae75a6a6ef2d2625172
parent660cab32fe5105bcaa17daa4704c24065ac0a7e6 (diff)
Add alignment arguments to all the NEON load/store intrinsics.
Update all the tests using those intrinsics and add support for auto-upgrading bitcode files with the old versions of the intrinsics.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112271 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/IntrinsicsARM.td54
-rw-r--r--lib/VMCore/AutoUpgrade.cpp67
-rw-r--r--test/Bitcode/neon-intrinsics.ll34
-rw-r--r--test/Bitcode/neon-intrinsics.ll.bcbin820 -> 2884 bytes
-rw-r--r--test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll22
-rw-r--r--test/CodeGen/ARM/2010-05-21-BuildVector.ll4
-rw-r--r--test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll4
-rw-r--r--test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll4
-rw-r--r--test/CodeGen/ARM/reg_sequence.ll65
-rw-r--r--test/CodeGen/ARM/spill-q.ll8
-rw-r--r--test/CodeGen/ARM/vld1.ll43
-rw-r--r--test/CodeGen/ARM/vld2.ll36
-rw-r--r--test/CodeGen/ARM/vld3.ll36
-rw-r--r--test/CodeGen/ARM/vld4.ll36
-rw-r--r--test/CodeGen/ARM/vldlane.ll84
-rw-r--r--test/CodeGen/ARM/vst1.ll40
-rw-r--r--test/CodeGen/ARM/vst2.ll36
-rw-r--r--test/CodeGen/ARM/vst3.ll36
-rw-r--r--test/CodeGen/ARM/vst4.ll36
-rw-r--r--test/CodeGen/ARM/vstlane.ll84
-rw-r--r--test/CodeGen/Thumb2/crash.ll6
-rw-r--r--test/CodeGen/Thumb2/machine-licm-vdup.ll8
-rw-r--r--test/CodeGen/Thumb2/machine-licm.ll8
-rw-r--r--test/CodeGen/Thumb2/thumb2-spill-q.ll8
24 files changed, 436 insertions, 323 deletions
diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td
index 37d813151c..7be283b29c 100644
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@@ -339,62 +339,76 @@ def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic;
let TargetPrefix = "arm" in {
// De-interleaving vector loads from N-element structures.
+ // Source operands are the address and alignment.
def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
- [llvm_ptr_ty], [IntrReadArgMem]>;
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
- [llvm_ptr_ty], [IntrReadArgMem]>;
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>],
- [llvm_ptr_ty], [IntrReadArgMem]>;
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- [llvm_ptr_ty], [IntrReadArgMem]>;
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
// Vector load N-element structure to one lane.
+ // Source operands are: the address, the N input vectors (since only one
+ // lane is assigned), the lane number, and the alignment.
def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_ptr_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty],
- [IntrReadArgMem]>;
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadArgMem]>;
def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>],
[llvm_ptr_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>,
- llvm_i32_ty], [IntrReadArgMem]>;
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_ptr_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty],
- [IntrReadArgMem]>;
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadArgMem]>;
// Interleaving vector stores from N-element structures.
+ // Source operands are: the address, the N vectors, and the alignment.
def int_arm_neon_vst1 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty],
- [IntrReadWriteArgMem]>;
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_arm_neon_vst2 : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>], [IntrReadWriteArgMem]>;
+ LLVMMatchType<0>, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
def int_arm_neon_vst3 : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrReadWriteArgMem]>;
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_arm_neon_vst4 : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>], [IntrReadWriteArgMem]>;
+ LLVMMatchType<0>, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
// Vector store N-element structure from one lane.
+ // Source operands are: the address, the N vectors, the lane number, and
+ // the alignment.
def int_arm_neon_vst2lane : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, llvm_i32_ty],
- [IntrReadWriteArgMem]>;
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_arm_neon_vst3lane : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, LLVMMatchType<0>,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
def int_arm_neon_vst4lane : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty],
- [IntrReadWriteArgMem]>;
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
}
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index f76d0d254d..052fd2d5b1 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -85,6 +85,39 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
NewFn = 0;
return true;
}
+ // Old versions of NEON ld/st intrinsics are missing alignment arguments.
+ bool isVLd = (Name.compare(14, 3, "vld", 3) == 0);
+ bool isVSt = (Name.compare(14, 3, "vst", 3) == 0);
+ if (isVLd || isVSt) {
+ unsigned NumVecs = Name.at(17) - '0';
+ if (NumVecs == 0 || NumVecs > 4)
+ return false;
+ bool isLaneOp = (Name.compare(18, 5, "lane.", 5) == 0);
+ if (!isLaneOp && Name.at(18) != '.')
+ return false;
+ unsigned ExpectedArgs = 2; // for the address and alignment
+ if (isVSt || isLaneOp)
+ ExpectedArgs += NumVecs;
+ if (isLaneOp)
+ ExpectedArgs += 1; // for the lane number
+ unsigned NumP = FTy->getNumParams();
+ if (NumP != ExpectedArgs - 1)
+ return false;
+
+ // Change the name of the old (bad) intrinsic, because
+ // its type is incorrect, but we cannot overload that name.
+ F->setName("");
+
+ // One argument is missing: add the alignment argument.
+ std::vector<const Type*> NewParams;
+ for (unsigned p = 0; p < NumP; ++p)
+ NewParams.push_back(FTy->getParamType(p));
+ NewParams.push_back(Type::getInt32Ty(F->getContext()));
+ FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(),
+ NewParams, false);
+ NewFn = cast<Function>(M->getOrInsertFunction(Name, NewFTy));
+ return true;
+ }
}
break;
case 'b':
@@ -189,7 +222,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
NewFnName = "llvm.memset.p0i8.i64";
}
if (NewFnName) {
- const FunctionType *FTy = F->getFunctionType();
NewFn = cast<Function>(M->getOrInsertFunction(NewFnName,
FTy->getReturnType(),
FTy->getParamType(0),
@@ -578,6 +610,39 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
switch (NewFn->getIntrinsicID()) {
default: llvm_unreachable("Unknown function for CallInst upgrade.");
+ case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane: {
+ // Add a default alignment argument of 1.
+ SmallVector<Value*, 8> Operands(CS.arg_begin(), CS.arg_end());
+ Operands.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+ CallInst *NewCI = CallInst::Create(NewFn, Operands.begin(), Operands.end(),
+ CI->getName(), CI);
+ NewCI->setTailCall(CI->isTailCall());
+ NewCI->setCallingConv(CI->getCallingConv());
+
+ // Handle any uses of the old CallInst.
+ if (!CI->use_empty())
+ // Replace all uses of the old call with the new call which has the
+ // correct type.
+ CI->replaceAllUsesWith(NewCI);
+
+ // Clean up the old call now that it has been completely upgraded.
+ CI->eraseFromParent();
+ break;
+ }
+
case Intrinsic::x86_mmx_psll_d:
case Intrinsic::x86_mmx_psll_q:
case Intrinsic::x86_mmx_psll_w:
diff --git a/test/Bitcode/neon-intrinsics.ll b/test/Bitcode/neon-intrinsics.ll
index 73ca7075d8..fe76514ee0 100644
--- a/test/Bitcode/neon-intrinsics.ll
+++ b/test/Bitcode/neon-intrinsics.ll
@@ -27,3 +27,37 @@
; CHECK: vmovlu32
; CHECK-NOT: arm.neon.vmovlu.v2i64
; CHECK: zext <2 x i32>
+
+; vld* and vst* intrinsic calls need an alignment argument (defaulted to 1)
+
+; CHECK: vld1i8
+; CHECK: i32 1
+; CHECK: vld2Qi16
+; CHECK: i32 1
+; CHECK: vld3i32
+; CHECK: i32 1
+; CHECK: vld4Qf
+; CHECK: i32 1
+
+; CHECK: vst1i8
+; CHECK: i32 1
+; CHECK: vst2Qi16
+; CHECK: i32 1
+; CHECK: vst3i32
+; CHECK: i32 1
+; CHECK: vst4Qf
+; CHECK: i32 1
+
+; CHECK: vld2laneQi16
+; CHECK: i32 1
+; CHECK: vld3lanei32
+; CHECK: i32 1
+; CHECK: vld4laneQf
+; CHECK: i32 1
+
+; CHECK: vst2laneQi16
+; CHECK: i32 1
+; CHECK: vst3lanei32
+; CHECK: i32 1
+; CHECK: vst4laneQf
+; CHECK: i32 1
diff --git a/test/Bitcode/neon-intrinsics.ll.bc b/test/Bitcode/neon-intrinsics.ll.bc
index 93eeabc822..c324aeef18 100644
--- a/test/Bitcode/neon-intrinsics.ll.bc
+++ b/test/Bitcode/neon-intrinsics.ll.bc
Binary files differ
diff --git a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
index ff60fa8c49..e47c038393 100644
--- a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
+++ b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
@@ -5,32 +5,32 @@
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
- %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
%tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
- %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
%tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
- %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
%tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
- %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
%tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
- %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
%tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
- %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
%tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
%tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1]
%tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1]
%tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1]
%tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2]
- call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd)
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
%tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1]
%tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1]
%tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1]
@@ -38,8 +38,8 @@ define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A
%tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1]
%tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1]
%tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2]
- call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh)
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
%tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1]
- tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef)
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
ret <8 x i8> %tmp4
}
diff --git a/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
index ce959d1b91..cd1c9c8c04 100644
--- a/test/CodeGen/ARM/2010-05-21-BuildVector.ll
+++ b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
@@ -36,8 +36,8 @@ entry:
%tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
%19 = fmul <4 x float> %tmp5, %2
%20 = bitcast float* %fltp to i8*
- tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19)
+ tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1)
ret void
}
-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
diff --git a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
index e4f20990be..6f48796231 100644
--- a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
+++ b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
@@ -12,8 +12,8 @@ entry:
%tmp9 = trunc i128 %tmp8 to i64 ; <i64> [#uses=1]
%tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1]
%tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1]
- tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i) nounwind
+ tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
ret void
}
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
diff --git a/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll b/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
index 0c5b180cf8..ffc47ebdf1 100644
--- a/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
+++ b/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
@@ -16,10 +16,10 @@ target triple = "thumbv7-apple-darwin10"
define i32 @test(i8* %arg) nounwind {
entry:
- %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg)
+ %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1)
%1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> <i32 1, i32 2>
store <2 x i64> %1, <2 x i64>* undef, align 16
ret i32 undef
}
-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*) nounwind readonly
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 0f3b3a3be8..729d570276 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -23,7 +23,7 @@ entry:
%2 = getelementptr inbounds %struct.int32x4_t* %vT1ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1]
%3 = load <4 x i32>* %2, align 16 ; <<4 x i32>> [#uses=1]
%4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1]
- %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1]
+ %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
%6 = bitcast <8 x i16> %5 to <2 x double> ; <<2 x double>> [#uses=2]
%7 = extractelement <2 x double> %6, i32 0 ; <double> [#uses=1]
%8 = bitcast double %7 to <4 x i16> ; <<4 x i16>> [#uses=1]
@@ -37,7 +37,7 @@ entry:
%16 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %14, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
%17 = shufflevector <4 x i16> %15, <4 x i16> %16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
%18 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
- tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17)
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17, i32 1)
ret void
}
@@ -57,17 +57,17 @@ entry:
%2 = getelementptr inbounds %struct.int16x8_t* %vT1ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
%3 = load <8 x i16>* %2, align 16 ; <<8 x i16>> [#uses=1]
%4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1]
- %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1]
+ %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
%6 = getelementptr inbounds i16* %i_ptr, i32 8 ; <i16*> [#uses=1]
%7 = bitcast i16* %6 to i8* ; <i8*> [#uses=1]
- %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7) ; <<8 x i16>> [#uses=1]
+ %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7, i32 1) ; <<8 x i16>> [#uses=1]
%9 = mul <8 x i16> %1, %5 ; <<8 x i16>> [#uses=1]
%10 = mul <8 x i16> %3, %8 ; <<8 x i16>> [#uses=1]
%11 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
- tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9)
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9, i32 1)
%12 = getelementptr inbounds i16* %o_ptr, i32 8 ; <i16*> [#uses=1]
%13 = bitcast i16* %12 to i8* ; <i8*> [#uses=1]
- tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10)
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10, i32 1)
ret void
}
@@ -77,14 +77,14 @@ define <8 x i8> @t3(i8* %A, i8* %B) nounwind {
; CHECK: vmul.i8
; CHECK-NOT: vmov
; CHECK: vst3.8
- %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 ; <<8 x i8>> [#uses=1]
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 ; <<8 x i8>> [#uses=1]
%tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 1 ; <<8 x i8>> [#uses=1]
%tmp5 = sub <8 x i8> %tmp3, %tmp4
%tmp6 = add <8 x i8> %tmp2, %tmp3 ; <<8 x i8>> [#uses=1]
%tmp7 = mul <8 x i8> %tmp4, %tmp2
- tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7)
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7, i32 1)
ret <8 x i8> %tmp4
}
@@ -97,10 +97,10 @@ entry:
; CHECK-NOT: vmov
; CHECK: bne
%tmp1 = bitcast i32* %in to i8* ; <i8*> [#uses=1]
- %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
%tmp3 = getelementptr inbounds i32* %in, i32 8 ; <i32*> [#uses=1]
%tmp4 = bitcast i32* %tmp3 to i8* ; <i8*> [#uses=1]
- %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
%tmp8 = bitcast i32* %out to i8* ; <i8*> [#uses=1]
br i1 undef, label %return1, label %return2
@@ -116,7 +116,7 @@ return1:
%tmp39 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
%tmp6 = add <4 x i32> %tmp52, %tmp ; <<4 x i32>> [#uses=1]
%tmp7 = add <4 x i32> %tmp57, %tmp39 ; <<4 x i32>> [#uses=1]
- tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7)
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7, i32 1)
ret void
return2:
@@ -128,7 +128,7 @@ return2:
%tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
%tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
%tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
- tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101)
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101, i32 1)
call void @llvm.trap()
unreachable
}
@@ -143,7 +143,7 @@ define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
; CHECK: vadd.i16
%tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1]
%tmp1 = load <8 x i16>* %B ; <<8 x i16>> [#uses=2]
- %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
%tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 ; <<8 x i16>> [#uses=1]
%tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 ; <<8 x i16>> [#uses=1]
%tmp5 = add <8 x i16> %tmp3, %tmp4 ; <<8 x i16>> [#uses=1]
@@ -156,7 +156,7 @@ define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
; CHECK: vmov d1, d0
; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
%tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
- %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
%tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1]
%tmp5 = add <8 x i8> %tmp3, %tmp4 ; <<8 x i8>> [#uses=1]
@@ -174,14 +174,14 @@ entry:
; CHECK: vuzp.32 q0, q1
; CHECK: vst1.32
%0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
- %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
%tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1]
%tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1]
%2 = bitcast i32* %optr to i8* ; <i8*> [#uses=2]
- tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60)
- %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0) ; <<4 x i32>> [#uses=1]
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60, i32 1)
+ %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0, i32 1) ; <<4 x i32>> [#uses=1]
%4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; <<4 x i32>> [#uses=1]
- tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4)
+ tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4, i32 1)
ret void
}
@@ -304,42 +304,43 @@ bb14: ; preds = %bb6
; This test crashes the coalescer because live variables were not updated properly.
define <8 x i8> @t11(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
- %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
%tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
- %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
%tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
%tmp2bd = add <8 x i8> zeroinitializer, %tmp2d ; <<8 x i8>> [#uses=1]
%tmp2abcd = mul <8 x i8> zeroinitializer, %tmp2bd ; <<8 x i8>> [#uses=1]
%tmp2ef = sub <8 x i8> zeroinitializer, %tmp2f ; <<8 x i8>> [#uses=1]
%tmp2efgh = mul <8 x i8> %tmp2ef, undef ; <<8 x i8>> [#uses=2]
- call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh)
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh, i32 1)
%tmp2 = sub <8 x i8> %tmp2efgh, %tmp2abcd ; <<8 x i8>> [#uses=1]
%tmp7 = mul <8 x i8> undef, %tmp2 ; <<8 x i8>> [#uses=1]
- tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7)
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7, i32 1)
ret <8 x i8> undef
}
-declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly
-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+nounwind
-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
-declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*, i32) nounwind readonly
-declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
-declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
-declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll
index 792ef79982..ae1ba2f738 100644
--- a/test/CodeGen/ARM/spill-q.ll
+++ b/test/CodeGen/ARM/spill-q.ll
@@ -7,7 +7,7 @@
%quux = type { i32 (...)**, %baz*, i32 }
%quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }
-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
define void @aaa(%quuz* %this, i8* %block) {
; CHECK: aaa:
@@ -15,11 +15,11 @@ define void @aaa(%quuz* %this, i8* %block) {
; CHECK: vst1.64 {{.*}}sp, :128
; CHECK: vld1.64 {{.*}}sp, :128
entry:
- %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+ %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 6.300000e+01, float* undef, align 4
- %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+ %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 0.000000e+00, float* undef, align 4
- %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+ %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* unde