-rw-r--r--  lib/Target/ARM/ARMNaClRewritePass.cpp     | 272
-rw-r--r--  test/NaCl/ARM/neon-vld1-sandboxing.ll     |  92
-rw-r--r--  test/NaCl/ARM/neon-vld2-sandboxing.ll     | 102
-rw-r--r--  test/NaCl/ARM/neon-vld3-sandboxing.ll     |  79
-rw-r--r--  test/NaCl/ARM/neon-vld4-sandboxing.ll     |  80
-rw-r--r--  test/NaCl/ARM/neon-vlddup-sandboxing.ll   | 151
-rw-r--r--  test/NaCl/ARM/neon-vldlane-sandboxing.ll  | 319
-rw-r--r--  test/NaCl/ARM/neon-vst1-sandboxing.ll     |  12
-rw-r--r--  test/NaCl/ARM/neon-vst2-sandboxing.ll     |   8
-rw-r--r--  test/NaCl/ARM/neon-vst3-sandboxing.ll     |   2
-rw-r--r--  test/NaCl/ARM/neon-vst4-sandboxing.ll     |   2
-rw-r--r--  test/NaCl/ARM/neon-vstlane-sandboxing.ll  |  32
12 files changed, 1140 insertions(+), 11 deletions(-)
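
This commit extends the NaCl ARM rewrite pass to sandbox NEON loads: the
base-address register of every dangerous load is first masked with
bic rN, rN, #3221225472 (0xC0000000), clearing the top two address bits so
the access stays within the sandbox's low 1 GiB region. The new tests
disassemble the generated object code and check that the bic immediately
precedes each vld. As a minimal illustration of the invariant the CHECK
lines enforce (a model written for this note, not code from the patch):

    #include <cstdint>

    // Model of the SFI address mask the tests look for: bic with
    // 0xC0000000 (3221225472) clears bits 31:30, so the resulting
    // address always lies below 0x40000000 (1 GiB).
    static inline uint32_t naclMaskAddress(uint32_t Addr) {
      return Addr & ~UINT32_C(0xC0000000);
    }
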
diff --git a/lib/Target/ARM/ARMNaClRewritePass.cpp b/lib/Target/ARM/ARMNaClRewritePass.cpp
index f7f64601d7..c54afe742c 100644
--- a/lib/Target/ARM/ARMNaClRewritePass.cpp
+++ b/lib/Target/ARM/ARMNaClRewritePass.cpp
@@ -617,6 +617,278 @@ static bool IsDangerousStore(const MachineInstr &MI, int *AddrIdx) {
break;
//
+ // NEON loads
+ //
+
+ // VLD1
+ case ARM::VLD1d8:
+ case ARM::VLD1d16:
+ case ARM::VLD1d32:
+ case ARM::VLD1d64:
+ case ARM::VLD1q8:
+ case ARM::VLD1q16:
+ case ARM::VLD1q32:
+ case ARM::VLD1q64:
+ case ARM::VLD1d8wb_fixed:
+ case ARM::VLD1d16wb_fixed:
+ case ARM::VLD1d32wb_fixed:
+ case ARM::VLD1d64wb_fixed:
+ case ARM::VLD1q8wb_fixed:
+ case ARM::VLD1q16wb_fixed:
+ case ARM::VLD1q32wb_fixed:
+ case ARM::VLD1q64wb_fixed:
+ case ARM::VLD1d8wb_register:
+ case ARM::VLD1d16wb_register:
+ case ARM::VLD1d32wb_register:
+ case ARM::VLD1d64wb_register:
+ case ARM::VLD1q8wb_register:
+ case ARM::VLD1q16wb_register:
+ case ARM::VLD1q32wb_register:
+ case ARM::VLD1q64wb_register:
+
+ // VLD1T
+ case ARM::VLD1d8T:
+ case ARM::VLD1d16T:
+ case ARM::VLD1d32T:
+ case ARM::VLD1d64T:
+ *AddrIdx = 1;
+ break;
+ case ARM::VLD1d8Twb_fixed:
+ case ARM::VLD1d16Twb_fixed:
+ case ARM::VLD1d32Twb_fixed:
+ case ARM::VLD1d64Twb_fixed:
+ case ARM::VLD1d8Twb_register:
+ case ARM::VLD1d16Twb_register:
+ case ARM::VLD1d32Twb_register:
+ case ARM::VLD1d64Twb_register:
+ *AddrIdx = 2;
+ break;
+
+ // VLD1Q
+ case ARM::VLD1d8Q:
+ case ARM::VLD1d16Q:
+ case ARM::VLD1d32Q:
+ case ARM::VLD1d64Q:
+ *AddrIdx = 1;
+ break;
+ case ARM::VLD1d8Qwb_fixed:
+ case ARM::VLD1d16Qwb_fixed:
+ case ARM::VLD1d32Qwb_fixed:
+ case ARM::VLD1d64Qwb_fixed:
+ case ARM::VLD1d8Qwb_register:
+ case ARM::VLD1d16Qwb_register:
+ case ARM::VLD1d32Qwb_register:
+ case ARM::VLD1d64Qwb_register:
+ *AddrIdx = 2;
+ break;
+
+ // VLD1LN
+ case ARM::VLD1LNd8:
+ case ARM::VLD1LNd16:
+ case ARM::VLD1LNd32:
+ case ARM::VLD1LNd8_UPD:
+ case ARM::VLD1LNd16_UPD:
+ case ARM::VLD1LNd32_UPD:
+
+ // VLD1DUP
+ case ARM::VLD1DUPd8:
+ case ARM::VLD1DUPd16:
+ case ARM::VLD1DUPd32:
+ case ARM::VLD1DUPq8:
+ case ARM::VLD1DUPq16:
+ case ARM::VLD1DUPq32:
+ case ARM::VLD1DUPd8wb_fixed:
+ case ARM::VLD1DUPd16wb_fixed:
+ case ARM::VLD1DUPd32wb_fixed:
+ case ARM::VLD1DUPq8wb_fixed:
+ case ARM::VLD1DUPq16wb_fixed:
+ case ARM::VLD1DUPq32wb_fixed:
+ case ARM::VLD1DUPd8wb_register:
+ case ARM::VLD1DUPd16wb_register:
+ case ARM::VLD1DUPd32wb_register:
+ case ARM::VLD1DUPq8wb_register:
+ case ARM::VLD1DUPq16wb_register:
+ case ARM::VLD1DUPq32wb_register:
+
+ // VLD2
+ case ARM::VLD2d8:
+ case ARM::VLD2d16:
+ case ARM::VLD2d32:
+ case ARM::VLD2b8:
+ case ARM::VLD2b16:
+ case ARM::VLD2b32:
+ case ARM::VLD2q8:
+ case ARM::VLD2q16:
+ case ARM::VLD2q32:
+ *AddrIdx = 1;
+ break;
+
+ case ARM::VLD2d8wb_fixed:
+ case ARM::VLD2d16wb_fixed:
+ case ARM::VLD2d32wb_fixed:
+ case ARM::VLD2b8wb_fixed:
+ case ARM::VLD2b16wb_fixed:
+ case ARM::VLD2b32wb_fixed:
+ case ARM::VLD2q8wb_fixed:
+ case ARM::VLD2q16wb_fixed:
+ case ARM::VLD2q32wb_fixed:
+ case ARM::VLD2d8wb_register:
+ case ARM::VLD2d16wb_register:
+ case ARM::VLD2d32wb_register:
+ case ARM::VLD2b8wb_register:
+ case ARM::VLD2b16wb_register:
+ case ARM::VLD2b32wb_register:
+ case ARM::VLD2q8wb_register:
+ case ARM::VLD2q16wb_register:
+ case ARM::VLD2q32wb_register:
+ *AddrIdx = 2;
+ break;
+
+ // VLD2LN
+ case ARM::VLD2LNd8:
+ case ARM::VLD2LNd16:
+ case ARM::VLD2LNd32:
+ case ARM::VLD2LNq16:
+ case ARM::VLD2LNq32:
+ *AddrIdx = 2;
+ break;
+
+ case ARM::VLD2LNd8_UPD:
+ case ARM::VLD2LNd16_UPD:
+ case ARM::VLD2LNd32_UPD:
+ case ARM::VLD2LNq16_UPD:
+ case ARM::VLD2LNq32_UPD:
+ *AddrIdx = 3;
+ break;
+
+ // VLD2DUP
+ case ARM::VLD2DUPd8:
+ case ARM::VLD2DUPd16:
+ case ARM::VLD2DUPd32:
+ case ARM::VLD2DUPd8x2:
+ case ARM::VLD2DUPd16x2:
+ case ARM::VLD2DUPd32x2:
+ *AddrIdx = 1;
+ break;
+
+ case ARM::VLD2DUPd8wb_fixed:
+ case ARM::VLD2DUPd16wb_fixed:
+ case ARM::VLD2DUPd32wb_fixed:
+ case ARM::VLD2DUPd8wb_register:
+ case ARM::VLD2DUPd16wb_register:
+ case ARM::VLD2DUPd32wb_register:
+ case ARM::VLD2DUPd8x2wb_fixed:
+ case ARM::VLD2DUPd16x2wb_fixed:
+ case ARM::VLD2DUPd32x2wb_fixed:
+ case ARM::VLD2DUPd8x2wb_register:
+ case ARM::VLD2DUPd16x2wb_register:
+ case ARM::VLD2DUPd32x2wb_register:
+ *AddrIdx = 2;
+ break;
+
+ // VLD3
+ case ARM::VLD3d8:
+ case ARM::VLD3d16:
+ case ARM::VLD3d32:
+ case ARM::VLD3q8:
+ case ARM::VLD3q16:
+ case ARM::VLD3q32:
+ case ARM::VLD3d8_UPD:
+ case ARM::VLD3d16_UPD:
+ case ARM::VLD3d32_UPD:
+ case ARM::VLD3q8_UPD:
+ case ARM::VLD3q16_UPD:
+ case ARM::VLD3q32_UPD:
+
+ // VLD3LN
+ case ARM::VLD3LNd8:
+ case ARM::VLD3LNd16:
+ case ARM::VLD3LNd32:
+ case ARM::VLD3LNq16:
+ case ARM::VLD3LNq32:
+ *AddrIdx = 3;
+ break;
+
+ case ARM::VLD3LNd8_UPD:
+ case ARM::VLD3LNd16_UPD:
+ case ARM::VLD3LNd32_UPD:
+ case ARM::VLD3LNq16_UPD:
+ case ARM::VLD3LNq32_UPD:
+ *AddrIdx = 4;
+ break;
+
+ // VLD3DUP
+ case ARM::VLD3DUPd8:
+ case ARM::VLD3DUPd16:
+ case ARM::VLD3DUPd32:
+ case ARM::VLD3DUPq8:
+ case ARM::VLD3DUPq16:
+ case ARM::VLD3DUPq32:
+ *AddrIdx = 3;
+ break;
+
+ case ARM::VLD3DUPd8_UPD:
+ case ARM::VLD3DUPd16_UPD:
+ case ARM::VLD3DUPd32_UPD:
+ case ARM::VLD3DUPq8_UPD:
+ case ARM::VLD3DUPq16_UPD:
+ case ARM::VLD3DUPq32_UPD:
+ *AddrIdx = 4;
+ break;
+
+ // VLD4
+ case ARM::VLD4d8:
+ case ARM::VLD4d16:
+ case ARM::VLD4d32:
+ case ARM::VLD4q8:
+ case ARM::VLD4q16:
+ case ARM::VLD4q32:
+ *AddrIdx = 4;
+ break;
+
+ case ARM::VLD4d8_UPD:
+ case ARM::VLD4d16_UPD:
+ case ARM::VLD4d32_UPD:
+ case ARM::VLD4q8_UPD:
+ case ARM::VLD4q16_UPD:
+ case ARM::VLD4q32_UPD:
+ *AddrIdx = 5;
+ break;
+
+ // VLD4LN
+ case ARM::VLD4LNd8:
+ case ARM::VLD4LNd16:
+ case ARM::VLD4LNd32:
+ case ARM::VLD4LNq16:
+ case ARM::VLD4LNq32:
+ *AddrIdx = 4;
+ break;
+
+ case ARM::VLD4LNd8_UPD:
+ case ARM::VLD4LNd16_UPD:
+ case ARM::VLD4LNd32_UPD:
+ case ARM::VLD4LNq16_UPD:
+ case ARM::VLD4LNq32_UPD:
+ *AddrIdx = 5;
+ break;
+
+ case ARM::VLD4DUPd8:
+ case ARM::VLD4DUPd16:
+ case ARM::VLD4DUPd32:
+ case ARM::VLD4DUPq16:
+ case ARM::VLD4DUPq32:
+ *AddrIdx = 4;
+ break;
+
+ case ARM::VLD4DUPd8_UPD:
+ case ARM::VLD4DUPd16_UPD:
+ case ARM::VLD4DUPd32_UPD:
+ case ARM::VLD4DUPq16_UPD:
+ case ARM::VLD4DUPq32_UPD:
+ *AddrIdx = 5;
+ break;
+
+ //
// NEON stores
//
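
Each case above records AddrIdx, the machine-operand index of the
base-address register. That index sits past all register results, so it
grows with the number of loaded D/Q registers, and most writeback
(wb_fixed/wb_register/_UPD) variants report one slot higher to skip the
extra writeback def. Below is a hedged sketch of how a rewrite could
consume this index — the helper name and the exact BuildMI operand order
are illustrative assumptions (the usual ARM target headers are implied, as
in the pass itself), not the pass's actual code:

    #include "llvm/CodeGen/MachineInstrBuilder.h"
    using namespace llvm;

    // Sketch: insert "bic rBase, rBase, #0xC0000000" (printed by the
    // disassembler as #3221225472) right before a dangerous load whose
    // base-address operand index was reported through AddrIdx.
    static void SandboxAddress(MachineBasicBlock &MBB, MachineInstr &MI,
                               const TargetInstrInfo *TII, int AddrIdx) {
      unsigned Base = MI.getOperand(AddrIdx).getReg();
      BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::BICri), Base)
          .addReg(Base)
          .addImm(0xC0000000)           // SFI mask
          .addImm(ARMCC::AL).addReg(0)  // predicate: always execute
          .addReg(0);                   // cc_out: flags not set
    }
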
diff --git a/test/NaCl/ARM/neon-vld1-sandboxing.ll b/test/NaCl/ARM/neon-vld1-sandboxing.ll
new file mode 100644
index 0000000000..52395a559f
--- /dev/null
+++ b/test/NaCl/ARM/neon-vld1-sandboxing.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-load -sfi-store -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s
+
+define <8 x i8> @vld1i8(i8* %A) nounwind {
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 16)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.8 {{{d[0-9]+}}}, [r0, :64]
+ ret <8 x i8> %tmp1
+}
+
+define <4 x i16> @vld1i16(i16* %A) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.16 {{{d[0-9]+}}}, [r0]
+ ret <4 x i16> %tmp1
+}
+
+define <2 x i32> @vld1i32(i32* %A) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 1)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.32 {{{d[0-9]+}}}, [r0]
+ ret <2 x i32> %tmp1
+}
+
+; Insert dummy leading arguments purely to push %A further down the
+; rN chain, checking that sandboxing picks the correct base register
+; rather than just the default r0.
+define <1 x i64> @vld1i64(i32 %foo, i32 %bar, i32 %baz,
+ i64* %A) nounwind {
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %tmp0, i32 1)
+; CHECK: bic r3, r3, #3221225472
+; CHECK-NEXT: vld1.64 {{{d[0-9]+}}}, [r3]
+ ret <1 x i64> %tmp1
+}
+
+define <16 x i8> @vld1Qi8(i8* %A) nounwind {
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.8 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0, :64]
+ ret <16 x i8> %tmp1
+}
+
+define <8 x i16> @vld1Qi16(i16* %A) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 32)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.16 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0, :128]
+ ret <8 x i16> %tmp1
+}
+
+define <4 x i32> @vld1Qi32(i32* %A) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %tmp0, i32 1)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0]
+ ret <4 x i32> %tmp1
+}
+
+define <2 x i64> @vld1Qi64(i64* %A) nounwind {
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %tmp0, i32 1)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0]
+ ret <2 x i64> %tmp1
+}
+
+declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32) nounwind readonly
+declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) nounwind readonly
+declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) nounwind readonly
+declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly
+declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) nounwind readonly
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
+
+define <4 x i16> @vld1i16_update(i16** %ptr) nounwind {
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1)
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.16 {{{d[0-9]+}}}, [r1]!
+ %tmp2 = getelementptr i16* %A, i32 4
+ store i16* %tmp2, i16** %ptr
+ ret <4 x i16> %tmp1
+}
+
diff --git a/test/NaCl/ARM/neon-vld2-sandboxing.ll b/test/NaCl/ARM/neon-vld2-sandboxing.ll
new file mode 100644
index 0000000000..ffec745e5f
--- /dev/null
+++ b/test/NaCl/ARM/neon-vld2-sandboxing.ll
@@ -0,0 +1,102 @@
+; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-load -sfi-store -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
+%struct.__neon_int64x1x2_t = type { <1 x i64>, <1 x i64> }
+
+%struct.__neon_int8x16x2_t = type { <16 x i8>, <16 x i8> }
+%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8*, i32) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8*, i32) nounwind readonly
+
+declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
+
+define <8 x i8> @vld2i8(i8* %A) nounwind {
+ %tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A, i32 8)
+ %tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1
+ %tmp4 = add <8 x i8> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :64]
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vld2i16(i16* %A) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0, i32 32)
+ %tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 1
+ %tmp4 = add <4 x i16> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :128]
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vld2i32(i32* %A) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8* %tmp0, i32 1)
+ %tmp2 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 1
+ %tmp4 = add <2 x i32> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+ ret <2 x i32> %tmp4
+}
+
+define <16 x i8> @vld2Qi8(i8* %A) nounwind {
+ %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 8)
+ %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
+ %tmp4 = add <16 x i8> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :64]
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vld2Qi16(i16* %A) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0, i32 16)
+ %tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 1
+ %tmp4 = add <8 x i16> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :128]
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vld2Qi32(i32* %A) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0, i32 64)
+ %tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 1
+ %tmp4 = add <4 x i32> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :256]
+ ret <4 x i32> %tmp4
+}
+
+; Check for a post-increment updating load with register increment.
+define <16 x i8> @vld2Qi8_update(i8** %ptr, i32 %inc) nounwind {
+ %A = load i8** %ptr
+ %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 16)
+ %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
+ %tmp4 = add <16 x i8> %tmp2, %tmp3
+; CHECK: bic r2, r2, #3221225472
+; CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r2, :128], r1
+ %tmp5 = getelementptr i8* %A, i32 %inc
+ store i8* %tmp5, i8** %ptr
+ ret <16 x i8> %tmp4
+}
diff --git a/test/NaCl/ARM/neon-vld3-sandboxing.ll b/test/NaCl/ARM/neon-vld3-sandboxing.ll
new file mode 100644
index 0000000000..49e38b9c77
--- /dev/null
+++ b/test/NaCl/ARM/neon-vld3-sandboxing.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s
+
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
+%struct.__neon_int64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> }
+
+%struct.__neon_int8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8*, i32) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8*, i32) nounwind readonly
+
+declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8*, i32) nounwind readonly
+
+define <8 x i8> @vld3i8(i32 %foobar, i32 %ba, i8* %A) nounwind {
+ %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 32)
+ %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2
+ %tmp4 = add <8 x i8> %tmp2, %tmp3
+; CHECK: bic r2, r2, #3221225472
+; CHECK-NEXT: vld3.8 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r2, :64]
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vld3i16(i16* %A) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 1)
+ %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
+ %tmp4 = add <4 x i16> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.16 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0]
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vld3i32(i32* %A) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8* %tmp0, i32 1)
+ %tmp2 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 2
+ %tmp4 = add <2 x i32> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.32 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0]
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vld3i64(i64* %A) nounwind {
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16)
+ %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
+ %tmp4 = add <1 x i64> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0, :64]
+ ret <1 x i64> %tmp4
+}
+
+
+define <16 x i8> @vld3Qi8(i8* %A) nounwind {
+ %tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A, i32 32)
+ %tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2
+ %tmp4 = add <16 x i8> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.8 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0, :64]!
+ ret <16 x i8> %tmp4
+}
+
diff --git a/test/NaCl/ARM/neon-vld4-sandboxing.ll b/test/NaCl/ARM/neon-vld4-sandboxing.ll
new file mode 100644
index 0000000000..14d903c09e
--- /dev/null
+++ b/test/NaCl/ARM/neon-vld4-sandboxing.ll
@@ -0,0 +1,80 @@
+; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s
+
+%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
+%struct.__neon_int64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
+
+%struct.__neon_int8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+
+declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8*, i32) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8*, i32) nounwind readonly
+
+declare %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8*, i32) nounwind readonly
+
+define <8 x i8> @vld4i8(i8* %A) nounwind {
+ %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 8)
+ %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
+ %tmp4 = add <8 x i8> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.8 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0, :64]
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vld4i16(i16* %A) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0, i32 16)
+ %tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 2
+ %tmp4 = add <4 x i16> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.16 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0, :128]
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vld4i32(i32* %A) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0, i32 32)
+ %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2
+ %tmp4 = add <2 x i32> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.32 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0, :256]
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vld4i64(i64* %A) nounwind {
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64)
+ %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
+ %tmp4 = add <1 x i64> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0, :256]
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vld4Qi8(i8* %A) nounwind {
+ %tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A, i32 64)
+ %tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2
+ %tmp4 = add <16 x i8> %tmp2, %tmp3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.8 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0, :256]!
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.8 {{{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}}, [r0, :256]
+ ret <16 x i8> %tmp4
+}
+
diff --git a/test/NaCl/ARM/neon-vlddup-sandboxing.ll b/test/NaCl/ARM/neon-vlddup-sandboxing.ll
new file mode 100644
index 0000000000..cd77ace644
--- /dev/null
+++ b/test/NaCl/ARM/neon-vlddup-sandboxing.ll
@@ -0,0 +1,151 @@
+; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int4x16x2_t = type { <4 x i16>, <4 x i16> }
+%struct.__neon_int2x32x2_t = type { <2 x i32>, <2 x i32> }
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
+declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+
+%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+
+define <8 x i8> @vld1dupi8(i32 %foo, i32 %bar,
+ i8* %A) nounwind {
+ %tmp1 = load i8* %A, align 8
+ %tmp2 = insertelement <8 x i8> undef, i8 %tmp1, i32 0
+ %tmp3 = shufflevector <8 x i8> %tmp2, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK: bic r2, r2, #3221225472
+; CHECK-NEXT: vld1.8 {{{d[0-9]+\[\]}}}, [r2]
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vld1dupi16(i16* %A) nounwind {
+ %tmp1 = load i16* %A, align 8
+ %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.16 {{{d[0-9]+\[\]}}}, [r0, :16]
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vld1dupi32(i32* %A) nounwind {
+ %tmp1 = load i32* %A, align 8
+ %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.32 {{{d[0-9]+\[\]}}}, [r0, :32]
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vld1dupQi8(i8* %A) nounwind {
+ %tmp1 = load i8* %A, align 8
+ %tmp2 = insertelement <16 x i8> undef, i8 %tmp1, i32 0
+ %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.8 {{{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}}, [r0]
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @vld2dupi8(i8* %A) nounwind {
+ %tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld2.8 {{{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}}, [r0]
+ %tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
+ %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
+ %tmp5 = add <8 x i8> %tmp2, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vld2dupi16(i8* %A) nounwind {
+ %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld2.16 {{{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}}, [r0]
+ %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
+ %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp5 = add <4 x i16> %tmp2, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @vld2dupi32(i8* %A) nounwind {
+ %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld2.32 {{{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}}, [r0, :64]
+ %tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
+ %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp5 = add <2 x i32> %tmp2, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <4 x i16> @vld3dupi16(i8* %A) nounwind {
+ %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.16 {{{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}}, [r0]
+ %tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
+ %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 2
+ %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp7 = add <4 x i16> %tmp2, %tmp4
+ %tmp8 = add <4 x i16> %tmp7, %tmp6
+ ret <4 x i16> %tmp8
+}
+
+define <2 x i32> @vld4dupi32(i8* %A) nounwind {
+ %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.32 {{{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}}, [r0, :64]
+ %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
+ %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 2
+ %tmp6 = shufflevector <2 x i32> %tmp5, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp7 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 3
+ %tmp8 = shufflevector <2 x i32> %tmp7, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp9 = add <2 x i32> %tmp2, %tmp4
+ %tmp10 = add <2 x i32> %tmp6, %tmp8
+ %tmp11 = add <2 x i32> %tmp9, %tmp10
+ ret <2 x i32> %tmp11
+}
+
+; Check for a post-increment updating load.
+define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
+ %A = load i16** %ptr
+ %A2 = bitcast i16* %A to i8*
+ %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld4.16 {{{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}, {{d[0-9]+\[\]}}}, [r1]!
+ %tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
+ %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 2
+ %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp7 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 3
+ %tmp8 = shufflevector <4 x i16> %tmp7, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp9 = add <4 x i16> %tmp2, %tmp4
+ %tmp10 = add <4 x i16> %tmp6, %tmp8
+ %tmp11 = add <4 x i16> %tmp9, %tmp10
+ %tmp12 = getelementptr i16* %A, i32 4
+ store i16* %tmp12, i16** %ptr
+ ret <4 x i16> %tmp11
+}
diff --git a/test/NaCl/ARM/neon-vldlane-sandboxing.ll b/test/NaCl/ARM/neon-vldlane-sandboxing.ll
new file mode 100644
index 0000000000..716da93298
--- /dev/null
+++ b/test/NaCl/ARM/neon-vldlane-sandboxing.ll
@@ -0,0 +1,319 @@
+; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
+
+%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
+
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
+
+%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
+
+%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
+
+%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+
+declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
+
+define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %B
+ %tmp2 = load i8* %A, align 8
+ %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.8 {{{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %B
+ %tmp2 = load i16* %A, align 8
+ %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.16 {{{d[0-9]+\[[0-9]\]}}}, [r0, :16]
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %B
+ %tmp2 = load i32* %A, align 8
+ %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.32 {{{d[0-9]+\[[0-9]\]}}}, [r0, :32]
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %B
+ %tmp2 = load i8* %A, align 8
+ %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.8 {{{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %B
+ %tmp2 = load i16* %A, align 8
+ %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.16 {{{d[0-9]+\[[0-9]\]}}}, [r0, :16]
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %B
+ %tmp2 = load i32* %A, align 8
+ %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld1.32 {{{d[0-9]+\[[0-9]\]}}}, [r0, :32]
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %B
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld2.8 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0, :16]
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <4 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
+ %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld2.16 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0, :32]
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @vld2lanei32(i32 %foo, i32 %bar, i32 %baz,
+ i32* %A, <2 x i32>* %B) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <2 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+; CHECK: bic r3, r3, #3221225472
+; CHECK-NEXT: vld2.32 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r3]
+ ret <2 x i32> %tmp5
+}
+
+define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <8 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld2.16 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <4 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
+ %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld2.32 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0, :64]
+ ret <4 x i32> %tmp5
+}
+
+define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %B
+ %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
+ %tmp6 = add <8 x i8> %tmp3, %tmp4
+ %tmp7 = add <8 x i8> %tmp5, %tmp6
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.8 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <8 x i8> %tmp7
+}
+
+define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <4 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
+ %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
+ %tmp6 = add <4 x i16> %tmp3, %tmp4
+ %tmp7 = add <4 x i16> %tmp5, %tmp6
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.16 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <4 x i16> %tmp7
+}
+
+define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <2 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
+ %tmp6 = add <2 x i32> %tmp3, %tmp4
+ %tmp7 = add <2 x i32> %tmp5, %tmp6
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.32 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <2 x i32> %tmp7
+}
+
+define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <8 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
+ %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
+ %tmp6 = add <8 x i16> %tmp3, %tmp4
+ %tmp7 = add <8 x i16> %tmp5, %tmp6
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.16 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <4 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
+ %tmp6 = add <4 x i32> %tmp3, %tmp4
+ %tmp7 = add <4 x i32> %tmp5, %tmp6
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld3.32 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <4 x i32> %tmp7
+}
+
+define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %B
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
+ %tmp7 = add <8 x i8> %tmp3, %tmp4
+ %tmp8 = add <8 x i8> %tmp5, %tmp6
+ %tmp9 = add <8 x i8> %tmp7, %tmp8
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.8 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0, :32]
+ ret <8 x i8> %tmp9
+}
+
+define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <4 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
+ %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
+ %tmp7 = add <4 x i16> %tmp3, %tmp4
+ %tmp8 = add <4 x i16> %tmp5, %tmp6
+ %tmp9 = add <4 x i16> %tmp7, %tmp8
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.16 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <4 x i16> %tmp9
+}
+
+define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <2 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
+ %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
+ %tmp7 = add <2 x i32> %tmp3, %tmp4
+ %tmp8 = add <2 x i32> %tmp5, %tmp6
+ %tmp9 = add <2 x i32> %tmp7, %tmp8
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.32 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0, :64]
+ ret <2 x i32> %tmp9
+}
+
+define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <8 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
+ %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
+ %tmp7 = add <8 x i16> %tmp3, %tmp4
+ %tmp8 = add <8 x i16> %tmp5, %tmp6
+ %tmp9 = add <8 x i16> %tmp7, %tmp8
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.16 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0, :64]
+ ret <8 x i16> %tmp9
+}
+
+define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <4 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
+ %tmp7 = add <4 x i32> %tmp3, %tmp4
+ %tmp8 = add <4 x i32> %tmp5, %tmp6
+ %tmp9 = add <4 x i32> %tmp7, %tmp8
+; CHECK: bic r0, r0, #3221225472
+; CHECK-NEXT: vld4.32 {{{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}, {{d[0-9]+\[[0-9]\]}}}, [r0]
+ ret <4 x i32> %tmp9
+}
+
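
The remaining hunks touch the existing store-sandboxing tests only as a
side effect of load sandboxing: each stored vector is produced by an IR
load (e.g. %tmp1 = load <16 x i8>* %B) that lowers to a vld1.32, which now
receives its own mask, so the tests gain a bic r1 / vld1.32 ... [r1] CHECK
pair ahead of the existing checks on the store itself.
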
diff --git a/test/NaCl/ARM/neon-vst1-sandboxing.ll b/test/NaCl/ARM/neon-vst1-sandboxing.ll
index 8fd580bb49..ec5712ee94 100644
--- a/test/NaCl/ARM/neon-vst1-sandboxing.ll
+++ b/test/NaCl/ARM/neon-vst1-sandboxing.ll
@@ -47,15 +47,19 @@ define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind {
define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 8)
; CHECK: bic r0, r0, #3221225472
-; CHECK-NEXT: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [r0, :64]
+; CHECK-NEXT: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :64]
ret void
}
define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.16 {{{d[0-9]+, d[0-9]+}}}, [r0, :128]
@@ -65,6 +69,8 @@ define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind {
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst1.v4i32(i8* %tmp0, <4 x i32> %tmp1, i32 1)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0]
@@ -74,6 +80,8 @@ define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind {
define void @vst1Qf(float* %A, <4 x float>* %B) nounwind {
%tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst1.v4f32(i8* %tmp0, <4 x float> %tmp1, i32 1)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0]
@@ -83,6 +91,8 @@ define void @vst1Qf(float* %A, <4 x float>* %B) nounwind {
define void @vst1Qi64(i64* %A, <2 x i64>* %B) nounwind {
%tmp0 = bitcast i64* %A to i8*
%tmp1 = load <2 x i64>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst1.v2i64(i8* %tmp0, <2 x i64> %tmp1, i32 1)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0]
diff --git a/test/NaCl/ARM/neon-vst2-sandboxing.ll b/test/NaCl/ARM/neon-vst2-sandboxing.ll
index e87373c174..431f68612c 100644
--- a/test/NaCl/ARM/neon-vst2-sandboxing.ll
+++ b/test/NaCl/ARM/neon-vst2-sandboxing.ll
@@ -38,6 +38,8 @@ define void @vst2f(float* %A, <2 x float>* %B) nounwind {
define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst2.8 {{{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}}, [r0, :64]
@@ -47,6 +49,8 @@ define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst2.16 {{{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}}, [r0, :128]
@@ -56,6 +60,8 @@ define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind {
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst2.32 {{{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}}, [r0, :256]
@@ -65,6 +71,8 @@ define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind {
define void @vst2Qf(float* %A, <4 x float>* %B) nounwind {
%tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst2.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst2.32 {{{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}}, [r0]
diff --git a/test/NaCl/ARM/neon-vst3-sandboxing.ll b/test/NaCl/ARM/neon-vst3-sandboxing.ll
index b496c0c592..95f85bbeb6 100644
--- a/test/NaCl/ARM/neon-vst3-sandboxing.ll
+++ b/test/NaCl/ARM/neon-vst3-sandboxing.ll
@@ -32,6 +32,8 @@ define void @vst3Qi16_update(i16** %ptr, <8 x i16>* %B) nounwind {
%A = load i16** %ptr
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
; CHECK: bic r1, r1, #3221225472
; CHECK-NEXT: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
diff --git a/test/NaCl/ARM/neon-vst4-sandboxing.ll b/test/NaCl/ARM/neon-vst4-sandboxing.ll
index 032f194231..2b0eb31b3d 100644
--- a/test/NaCl/ARM/neon-vst4-sandboxing.ll
+++ b/test/NaCl/ARM/neon-vst4-sandboxing.ll
@@ -32,6 +32,8 @@ define void @vst4Qf_update(float** %ptr, <4 x float>* %B) nounwind {
%A = load float** %ptr
%tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
; CHECK: bic r1, r1, #3221225472
; CHECK-NEXT: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
diff --git a/test/NaCl/ARM/neon-vstlane-sandboxing.ll b/test/NaCl/ARM/neon-vstlane-sandboxing.ll
index 5b4dc63a14..8da70115f9 100644
--- a/test/NaCl/ARM/neon-vstlane-sandboxing.ll
+++ b/test/NaCl/ARM/neon-vstlane-sandboxing.ll
@@ -3,8 +3,8 @@
define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %B
- %tmp2 = extractelement <8 x i8> %tmp1, i32 3
- store i8 %tmp2, i8* %A, align 8
+ %tmp2 = extractelement <8 x i8> %tmp1, i32 3
+ store i8 %tmp2, i8* %A, align 8
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.8 {d{{[0-9]+}}[3]}, [r0]
ret void
@@ -12,8 +12,8 @@ define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %B
- %tmp2 = extractelement <4 x i16> %tmp1, i32 2
- store i16 %tmp2, i16* %A, align 8
+ %tmp2 = extractelement <4 x i16> %tmp1, i32 2
+ store i16 %tmp2, i16* %A, align 8
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.16 {d{{[0-9]+}}[2]}, [r0, :16]
ret void
@@ -21,8 +21,8 @@ define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind {
define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %B
- %tmp2 = extractelement <2 x i32> %tmp1, i32 1
- store i32 %tmp2, i32* %A, align 8
+ %tmp2 = extractelement <2 x i32> %tmp1, i32 1
+ store i32 %tmp2, i32* %A, align 8
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.32 {d{{[0-9]+}}[1]}, [r0, :32]
ret void
@@ -30,8 +30,10 @@ define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind {
define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %B
- %tmp2 = extractelement <16 x i8> %tmp1, i32 9
- store i8 %tmp2, i8* %A, align 8
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+ %tmp2 = extractelement <16 x i8> %tmp1, i32 9
+ store i8 %tmp2, i8* %A, align 8
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.8 {d{{[0-9]+}}[1]}, [r0]
ret void
@@ -39,8 +41,10 @@ define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %B
- %tmp2 = extractelement <8 x i16> %tmp1, i32 5
- store i16 %tmp2, i16* %A, align 8
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+ %tmp2 = extractelement <8 x i16> %tmp1, i32 5
+ store i16 %tmp2, i16* %A, align 8
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst1.16 {d{{[0-9]+}}[1]}, [r0, :16]
ret void
@@ -75,6 +79,8 @@ define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst2.16 {d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0]
@@ -84,6 +90,8 @@ define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst2.32 {d{{[0-9]+}}[0], d{{[0-9]+}}[0]}, [r0, :64]
@@ -145,6 +153,8 @@ define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
%tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst4.16 {d{{[0-9]+}}[3], d{{[0-9]+}}[3], d{{[0-9]+}}[3], d{{[0-9]+}}[3]}, [r0, :64]
@@ -154,6 +164,8 @@ define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
%tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>* %B
+; CHECK: bic r1, r1, #3221225472
+; CHECK-NEXT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
; CHECK: bic r0, r0, #3221225472
; CHECK-NEXT: vst4.32 {d{{[0-9]+}}[0], d{{[0-9]+}}[0], d{{[0-9]+}}[0], d{{[0-9]+}}[0]}, [r0]