author     Bob Wilson <bob.wilson@apple.com>    2009-06-22 23:27:02 +0000
committer  Bob Wilson <bob.wilson@apple.com>    2009-06-22 23:27:02 +0000
commit     5bafff36c798608a189c517d37527e4a38863071 (patch)
tree       79bd2abbc5253e6f00db07023cf7d829cbcdee5a
parent     5de83afcdc3f4f0edf8caacba523f5d05ee48048 (diff)
Add support for ARM's Advanced SIMD (NEON) instruction set.
This is still a work in progress, but most of the NEON instruction set
is supported.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73919 91177308-0d34-0410-b5e6-96231b3b80d8
102 files changed, 10300 insertions, 125 deletions
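Before the diff itself, one editorial note: the patch's new intrinsics encode NEON's variable-shift convention, where a single per-lane count selects the direction: positive counts shift left and negative counts shift right (see the "Vector Shifts" comment in the IntrinsicsARM.td hunk below). The following sketch is only a scalar model of that convention for in-range counts; the helper names are invented for illustration and are not part of the patch.

```cpp
// Scalar model of the per-lane shift-count convention described in the
// "Vector Shifts" comment added to IntrinsicsARM.td: positive counts shift
// left, negative counts shift right; the "s"/"u" suffix picks the signed or
// unsigned interpretation of the elements for right shifts.
// Helper names are hypothetical; out-of-range counts, which NEON also
// defines, are deliberately not covered by this sketch.
#include <cassert>
#include <cstdint>

static int32_t model_vshifts_lane(int32_t value, int8_t count) {
  assert(count > -32 && count < 32 && "sketch covers in-range counts only");
  if (count >= 0)
    return static_cast<int32_t>(static_cast<uint32_t>(value) << count);
  return value >> -count;  // arithmetic right shift by |count|
}

static uint32_t model_vshiftu_lane(uint32_t value, int8_t count) {
  assert(count > -32 && count < 32 && "sketch covers in-range counts only");
  if (count >= 0)
    return value << count;
  return value >> -count;  // logical right shift by |count|
}

int main() {
  assert(model_vshifts_lane(-64, -3) == -8);   // right shift expressed as a count of -3
  assert(model_vshiftu_lane(64u, 3) == 512u);  // ordinary left shift
  return 0;
}
```

This is also why LowerShift() in the ARMISelLowering.cpp hunk negates the shift-amount vector before emitting the vshifts/vshiftu intrinsic for ISD::SRA and ISD::SRL.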
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td index bce3ce098f..5ed2f7734a 100644 --- a/include/llvm/Intrinsics.td +++ b/include/llvm/Intrinsics.td @@ -116,6 +116,7 @@ def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64 def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32 def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64 def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32 +def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float def llvm_v2f64_ty : LLVMType<v2f64>; // 2 x double diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td index e574938c72..a73dc45802 100644 --- a/include/llvm/IntrinsicsARM.td +++ b/include/llvm/IntrinsicsARM.td @@ -19,3 +19,298 @@ let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.". def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">, Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; } + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) + +let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.". + + // The following classes do not correspond directly to GCC builtins. + class Neon_1Arg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class Neon_1Arg_Float_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class Neon_1Arg_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMExtendedElementVectorType<0>], [IntrNoMem]>; + class Neon_1Arg_Long_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMTruncatedElementVectorType<0>], [IntrNoMem]>; + class Neon_2Arg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class Neon_2Arg_Float_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class Neon_2Arg_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMExtendedElementVectorType<0>, + LLVMExtendedElementVectorType<0>], + [IntrNoMem]>; + class Neon_2Arg_Long_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMTruncatedElementVectorType<0>, + LLVMTruncatedElementVectorType<0>], + [IntrNoMem]>; + class Neon_2Arg_Wide_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>], + [IntrNoMem]>; + class Neon_3Arg_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class Neon_3Arg_Long_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, + LLVMTruncatedElementVectorType<0>, + LLVMTruncatedElementVectorType<0>], + [IntrNoMem]>; + class Neon_CvtFxToFP_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + class Neon_CvtFPToFx_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; +} + +// Arithmetic ops + +let Properties = [IntrNoMem, Commutative] in { + + // Vector Add. 
+ def int_arm_neon_vhadds : Neon_2Arg_Intrinsic; + def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic; + def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic; + def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic; + def int_arm_neon_vqadds : Neon_2Arg_Intrinsic; + def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic; + def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic; + def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic; + def int_arm_neon_vaddls : Neon_2Arg_Long_Intrinsic; + def int_arm_neon_vaddlu : Neon_2Arg_Long_Intrinsic; + def int_arm_neon_vaddws : Neon_2Arg_Wide_Intrinsic; + def int_arm_neon_vaddwu : Neon_2Arg_Wide_Intrinsic; + + // Vector Multiply. + def int_arm_neon_vmulp : Neon_2Arg_Intrinsic; + def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic; + def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic; + def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic; + def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic; + def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic; + def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic; + + // Vector Multiply and Accumulate/Subtract. + def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic; + def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic; + def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic; + def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic; + def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic; + def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic; + + // Vector Maximum. + def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic; + def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic; + def int_arm_neon_vmaxf : Neon_2Arg_Float_Intrinsic; + + // Vector Minimum. + def int_arm_neon_vmins : Neon_2Arg_Intrinsic; + def int_arm_neon_vminu : Neon_2Arg_Intrinsic; + def int_arm_neon_vminf : Neon_2Arg_Float_Intrinsic; + + // Vector Reciprocal Step. + def int_arm_neon_vrecps : Neon_2Arg_Float_Intrinsic; + + // Vector Reciprocal Square Root Step. + def int_arm_neon_vrsqrts : Neon_2Arg_Float_Intrinsic; +} + +// Vector Subtract. +def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic; +def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic; +def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic; +def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic; +def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic; +def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic; +def int_arm_neon_vsubls : Neon_2Arg_Long_Intrinsic; +def int_arm_neon_vsublu : Neon_2Arg_Long_Intrinsic; +def int_arm_neon_vsubws : Neon_2Arg_Wide_Intrinsic; +def int_arm_neon_vsubwu : Neon_2Arg_Wide_Intrinsic; + +// Vector Absolute Compare. +let TargetPrefix = "arm" in { + def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty], + [llvm_v2f32_ty, llvm_v2f32_ty], + [IntrNoMem]>; + def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; + def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty], + [llvm_v2f32_ty, llvm_v2f32_ty], + [IntrNoMem]>; + def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty], + [IntrNoMem]>; +} + +// Vector Absolute Differences. +def int_arm_neon_vabds : Neon_2Arg_Intrinsic; +def int_arm_neon_vabdu : Neon_2Arg_Intrinsic; +def int_arm_neon_vabdf : Neon_2Arg_Float_Intrinsic; +def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic; +def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic; + +// Vector Absolute Difference and Accumulate. +def int_arm_neon_vabas : Neon_3Arg_Intrinsic; +def int_arm_neon_vabau : Neon_3Arg_Intrinsic; +def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic; +def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic; + +// Vector Pairwise Add. 
+def int_arm_neon_vpaddi : Neon_2Arg_Intrinsic; +def int_arm_neon_vpaddf : Neon_2Arg_Float_Intrinsic; + +// Vector Pairwise Add Long. +// Note: This is different than the other "long" NEON intrinsics because +// the result vector has half as many elements as the source vector. +// The source and destination vector types must be specified separately. +let TargetPrefix = "arm" in { + def int_arm_neon_vpaddls : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], + [IntrNoMem]>; + def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], + [IntrNoMem]>; +} + +// Vector Pairwise Add and Accumulate Long. +// Note: This is similar to vpaddl but the destination vector also appears +// as the first argument. +let TargetPrefix = "arm" in { + def int_arm_neon_vpadals : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, llvm_anyint_ty], + [IntrNoMem]>; + def int_arm_neon_vpadalu : Intrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, llvm_anyint_ty], + [IntrNoMem]>; +} + +// Vector Pairwise Maximum and Minimum. +def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic; +def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic; +def int_arm_neon_vpmaxf : Neon_2Arg_Float_Intrinsic; +def int_arm_neon_vpmins : Neon_2Arg_Intrinsic; +def int_arm_neon_vpminu : Neon_2Arg_Intrinsic; +def int_arm_neon_vpminf : Neon_2Arg_Float_Intrinsic; + +// Vector Shifts: +// +// The various saturating and rounding vector shift operations need to be +// represented by intrinsics in LLVM, and even the basic VSHL variable shift +// operation cannot be safely translated to LLVM's shift operators. VSHL can +// be used for both left and right shifts, or even combinations of the two, +// depending on the signs of the shift amounts. It also has well-defined +// behavior for shift amounts that LLVM leaves undefined. Only basic shifts +// by constants can be represented with LLVM's shift operators. +// +// The shift counts for these intrinsics are always vectors, even for constant +// shifts, where the constant is replicated. For consistency with VSHL (and +// other variable shift instructions), left shifts have positive shift counts +// and right shifts have negative shift counts. This convention is also used +// for constant right shift intrinsics, and to help preserve sanity, the +// intrinsic names use "shift" instead of either "shl" or "shr". Where +// applicable, signed and unsigned versions of the intrinsics are +// distinguished with "s" and "u" suffixes. A few NEON shift instructions, +// such as VQSHLU, take signed operands but produce unsigned results; these +// use a "su" suffix. + +// Vector Shift. +def int_arm_neon_vshifts : Neon_2Arg_Intrinsic; +def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic; +def int_arm_neon_vshiftls : Neon_2Arg_Long_Intrinsic; +def int_arm_neon_vshiftlu : Neon_2Arg_Long_Intrinsic; +def int_arm_neon_vshiftn : Neon_2Arg_Narrow_Intrinsic; + +// Vector Rounding Shift. +def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic; +def int_arm_neon_vrshiftu : Neon_2Arg_Intrinsic; +def int_arm_neon_vrshiftn : Neon_2Arg_Narrow_Intrinsic; + +// Vector Saturating Shift. +def int_arm_neon_vqshifts : Neon_2Arg_Intrinsic; +def int_arm_neon_vqshiftu : Neon_2Arg_Intrinsic; +def int_arm_neon_vqshiftsu : Neon_2Arg_Intrinsic; +def int_arm_neon_vqshiftns : Neon_2Arg_Narrow_Intrinsic; +def int_arm_neon_vqshiftnu : Neon_2Arg_Narrow_Intrinsic; +def int_arm_neon_vqshiftnsu : Neon_2Arg_Narrow_Intrinsic; + +// Vector Saturating Rounding Shift. 
+def int_arm_neon_vqrshifts : Neon_2Arg_Intrinsic; +def int_arm_neon_vqrshiftu : Neon_2Arg_Intrinsic; +def int_arm_neon_vqrshiftns : Neon_2Arg_Narrow_Intrinsic; +def int_arm_neon_vqrshiftnu : Neon_2Arg_Narrow_Intrinsic; +def int_arm_neon_vqrshiftnsu : Neon_2Arg_Narrow_Intrinsic; + +// Vector Shift and Insert. +def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic; + +// Vector Absolute Value and Saturating Absolute Value. +def int_arm_neon_vabs : Neon_1Arg_Intrinsic; +def int_arm_neon_vabsf : Neon_1Arg_Float_Intrinsic; +def int_arm_neon_vqabs : Neon_1Arg_Intrinsic; + +// Vector Saturating Negate. +def int_arm_neon_vqneg : Neon_1Arg_Intrinsic; + +// Vector Count Leading Sign/Zero Bits. +def int_arm_neon_vcls : Neon_1Arg_Intrinsic; +def int_arm_neon_vclz : Neon_1Arg_Intrinsic; + +// Vector Count One Bits. +def int_arm_neon_vcnt : Neon_1Arg_Intrinsic; + +// Vector Reciprocal Estimate. +def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic; +def int_arm_neon_vrecpef : Neon_1Arg_Float_Intrinsic; + +// Vector Reciprocal Square Root Estimate. +def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic; +def int_arm_neon_vrsqrtef : Neon_1Arg_Float_Intrinsic; + +// Vector Conversions Between Floating-point and Fixed-point. +def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic; +def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic; +def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic; +def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic; + +// Narrowing and Lengthening Vector Moves. +def int_arm_neon_vmovn : Neon_1Arg_Narrow_Intrinsic; +def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic; +def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic; +def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic; +def int_arm_neon_vmovls : Neon_1Arg_Long_Intrinsic; +def int_arm_neon_vmovlu : Neon_1Arg_Long_Intrinsic; + +let TargetPrefix = "arm" in { + + // De-interleaving vector loads from N-element structures. + def int_arm_neon_vld3i : Intrinsic<[llvm_anyint_ty], + [llvm_ptr_ty], [IntrReadArgMem]>; + def int_arm_neon_vld3f : Intrinsic<[llvm_anyfloat_ty], + [llvm_ptr_ty], [IntrReadArgMem]>; + def int_arm_neon_vld4i : Intrinsic<[llvm_anyint_ty], + [llvm_ptr_ty], [IntrReadArgMem]>; + def int_arm_neon_vld4f : Intrinsic<[llvm_anyfloat_ty], + [llvm_ptr_ty], [IntrReadArgMem]>; + + // Interleaving vector stores from N-element structures. + def int_arm_neon_vst3i : Intrinsic<[llvm_void_ty], + [llvm_anyint_ty, llvm_ptr_ty], + [IntrWriteArgMem]>; + def int_arm_neon_vst3f : Intrinsic<[llvm_void_ty], + [llvm_anyfloat_ty, llvm_ptr_ty], + [IntrWriteArgMem]>; + def int_arm_neon_vst4i : Intrinsic<[llvm_void_ty], + [llvm_anyint_ty, llvm_ptr_ty], + [IntrWriteArgMem]>; + def int_arm_neon_vst4f : Intrinsic<[llvm_void_ty], + [llvm_anyfloat_ty, llvm_ptr_ty], + [IntrWriteArgMem]>; +} diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 47151e667c..8a4c741faf 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -24,19 +24,29 @@ def CC_ARM_APCS : CallingConv<[ CCIfType<[i8, i16], CCPromoteToType<i32>>, - // f64 is passed in pairs of GPRs, possibly split onto the stack - CCIfType<[f64], CCCustom<"CC_ARM_APCS_Custom_f64">>, + // Handle all vector types as either f64 or v2f64. 
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack + CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType<i32>>, CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, CCIfType<[i32], CCAssignToStack<4, 4>>, - CCIfType<[f64], CCAssignToStack<8, 4>> + CCIfType<[f64], CCAssignToStack<8, 4>>, + CCIfType<[v2f64], CCAssignToStack<16, 4>> ]>; def RetCC_ARM_APCS : CallingConv<[ CCIfType<[f32], CCBitConvertToType<i32>>, - CCIfType<[f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>, + + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>, CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> @@ -59,7 +69,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[ CCAssignToReg<[R0, R1, R2, R3]>>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[f64], CCAssignToStack<8, 8>> + CCIfType<[f64], CCAssignToStack<8, 8>>, + CCIfType<[v2f64], CCAssignToStack<16, 8>> ]>; def RetCC_ARM_AAPCS_Common : CallingConv<[ @@ -72,13 +83,21 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[ //===----------------------------------------------------------------------===// def CC_ARM_AAPCS : CallingConv<[ - CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType<i32>>, CCDelegateTo<CC_ARM_AAPCS_Common> ]>; def RetCC_ARM_AAPCS : CallingConv<[ - CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + + CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType<i32>>, CCDelegateTo<RetCC_ARM_AAPCS_Common> ]>; @@ -88,6 +107,10 @@ def RetCC_ARM_AAPCS : CallingConv<[ //===----------------------------------------------------------------------===// def CC_ARM_AAPCS_VFP : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15]>>, @@ -95,6 +118,10 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ ]>; def RetCC_ARM_AAPCS_VFP : CallingConv<[ + // Handle all vector types as either f64 or v2f64. 
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15]>>, diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index d3c3047fc9..ee9dadff15 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -32,6 +32,9 @@ #include "llvm/Support/Debug.h" using namespace llvm; +static const unsigned arm_dsubreg_0 = 5; +static const unsigned arm_dsubreg_1 = 6; + //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. @@ -918,6 +921,65 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl, MVT::Other, Ops, 3); } + + case ISD::CONCAT_VECTORS: { + MVT VT = Op.getValueType(); + assert(VT.is128BitVector() && Op.getNumOperands() == 2 && + "unexpected CONCAT_VECTORS"); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDNode *Result = + CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT); + if (N0.getOpcode() != ISD::UNDEF) + Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT, + SDValue(Result, 0), N0, + CurDAG->getTargetConstant(arm_dsubreg_0, + MVT::i32)); + if (N1.getOpcode() != ISD::UNDEF) + Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT, + SDValue(Result, 0), N1, + CurDAG->getTargetConstant(arm_dsubreg_1, + MVT::i32)); + return Result; + } + + case ISD::VECTOR_SHUFFLE: { + MVT VT = Op.getValueType(); + + // Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in + // ARMInstrNEON.td but it is awkward because the shuffle mask needs to be + // transformed first into a lane number and then to both a subregister + // index and an adjusted lane number.) If the source operand is a + // SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP. + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + if (VT.is128BitVector() && SVOp->isSplat() && + Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR && + Op.getOperand(1).getOpcode() == ISD::UNDEF) { + unsigned LaneVal = SVOp->getSplatIndex(); + + MVT HalfVT; + unsigned Opc = 0; + switch (VT.getVectorElementType().getSimpleVT()) { + default: assert(false && "unhandled VDUP splat type"); + case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break; + case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break; + case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break; + case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break; + } + + // The source operand needs to be changed to a subreg of the original + // 128-bit operand, and the lane number needs to be adjusted accordingly. + unsigned NumElts = VT.getVectorNumElements() / 2; + unsigned SRVal = (LaneVal < NumElts ? 
arm_dsubreg_0 : arm_dsubreg_1); + SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32); + SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32); + SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG, + dl, HalfVT, N->getOperand(0), SR); + return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane); + } + + break; + } } return SelectCode(Op); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 2443625d4b..29d3da2b79 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -56,6 +56,52 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State); +void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, + MVT PromotedBitwiseVT) { + if (VT != PromotedLdStVT) { + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); + + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); + } + + MVT ElemTy = VT.getVectorElementType(); + if (ElemTy != MVT::i64 && ElemTy != MVT::f64) + setOperationAction(ISD::VSETCC, VT, Custom); + if (ElemTy == MVT::i8 || ElemTy == MVT::i16) + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + if (VT.isInteger()) { + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + } + + // Promote all bit-wise operations. + if (VT.isInteger() && VT != PromotedBitwiseVT) { + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); + } +} + +void ARMTargetLowering::addDRTypeForNEON(MVT VT) { + addRegisterClass(VT, ARM::DPRRegisterClass); + addTypeForNEON(VT, MVT::f64, MVT::v2i32); +} + +void ARMTargetLowering::addQRTypeForNEON(MVT VT) { + addRegisterClass(VT, ARM::QPRRegisterClass); + addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); +} + ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) : TargetLowering(TM), ARMPCLabelIndex(0) { Subtarget = &TM.getSubtarget<ARMSubtarget>(); @@ -152,6 +198,30 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setTruncStoreAction(MVT::f64, MVT::f32, Expand); } + + if (Subtarget->hasNEON()) { + addDRTypeForNEON(MVT::v2f32); + addDRTypeForNEON(MVT::v8i8); + addDRTypeForNEON(MVT::v4i16); + addDRTypeForNEON(MVT::v2i32); + addDRTypeForNEON(MVT::v1i64); + + addQRTypeForNEON(MVT::v4f32); + addQRTypeForNEON(MVT::v2f64); + addQRTypeForNEON(MVT::v16i8); + addQRTypeForNEON(MVT::v8i16); + addQRTypeForNEON(MVT::v4i32); + addQRTypeForNEON(MVT::v2i64); + + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); + } + computeRegisterProperties(); // ARM does not have f32 extending load. 
@@ -352,6 +422,36 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::FMDRR: return "ARMISD::FMDRR"; case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; + + case ARMISD::VCEQ: return "ARMISD::VCEQ"; + case ARMISD::VCGE: return "ARMISD::VCGE"; + case ARMISD::VCGEU: return "ARMISD::VCGEU"; + case ARMISD::VCGT: return "ARMISD::VCGT"; + case ARMISD::VCGTU: return "ARMISD::VCGTU"; + case ARMISD::VTST: return "ARMISD::VTST"; + + case ARMISD::VSHL: return "ARMISD::VSHL"; + case ARMISD::VSHRs: return "ARMISD::VSHRs"; + case ARMISD::VSHRu: return "ARMISD::VSHRu"; + case ARMISD::VSHLLs: return "ARMISD::VSHLLs"; + case ARMISD::VSHLLu: return "ARMISD::VSHLLu"; + case ARMISD::VSHLLi: return "ARMISD::VSHLLi"; + case ARMISD::VSHRN: return "ARMISD::VSHRN"; + case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; + case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; + case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; + case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; + case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; + case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; + case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; + case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; + case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; + case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; + case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; + case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; + case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; + case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; + case ARMISD::VDUPLANEQ: return "ARMISD::VDUPLANEQ"; } } @@ -423,63 +523,93 @@ static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, #include "ARMGenCallingConv.inc" // APCS f64 is in register pairs, possibly split to stack -static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - static const unsigned HiRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; - static const unsigned LoRegList[] = { ARM::R1, - ARM::R2, - ARM::R3, - ARM::NoRegister }; - - unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 4); - if (Reg == 0) - return false; // we didn't handle it +static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { + static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + // Try to get the first register. + if (unsigned Reg = State.AllocateReg(RegList, 4)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else { + // For the 2nd half of a v2f64, do not fail. + if (CanFail) + return false; - unsigned i; - for (i = 0; i < 4; ++i) - if (HiRegList[i] == Reg) - break; + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 4), + LocVT, LocInfo)); + return true; + } - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo)); - if (LoRegList[i] != ARM::NoRegister) - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - MVT::i32, LocInfo)); + // Try to get the second register. 
+ if (unsigned Reg = State.AllocateReg(RegList, 4)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); else State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, State.AllocateStack(4, 4), - MVT::i32, LocInfo)); + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; return true; // we handled it } // AAPCS f64 is in aligned register pairs -static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + CCState &State, bool CanFail) { static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); - if (Reg == 0) - return false; // we didn't handle it + if (Reg == 0) { + // For the 2nd half of a v2f64, do not just fail. + if (CanFail) + return false; + + // Put the whole thing on the stack. + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, + State.AllocateStack(8, 8), + LocVT, LocInfo)); + return true; + } unsigned i; for (i = 0; i < 2; ++i) if (HiRegList[i] == Reg) break; - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - MVT::i32, LocInfo)); + LocVT, LocInfo)); + return true; +} + +static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) + return false; + if (LocVT == MVT::v2f64 && + !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) + return false; return true; // we handled it } -static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, CCState &State) { static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; @@ -492,9 +622,20 @@ static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (HiRegList[i] == Reg) break; - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - MVT::i32, LocInfo)); + LocVT, LocInfo)); + return true; +} + +static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; + if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) + return false; return true; // we handled it } @@ -558,7 +699,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, SDValue Val; if (VA.needsCustom()) { - // Handle f64 as 
custom. + // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Lo.getValue(1); @@ -569,6 +710,24 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, Chain = Hi.getValue(1); InFlag = Hi.getValue(2); Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi); + + if (VA.getLocVT() == MVT::v2f64) { + SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, + DAG.getConstant(0, MVT::i32)); + + VA = RVLocs[++i]; // skip ahead to next loc + Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi); + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, + DAG.getConstant(1, MVT::i32)); + } } else { Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InFlag); @@ -625,6 +784,31 @@ ARMTargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, PseudoSourceValue::getStack(), LocMemOffset); } +void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG, + SDValue Chain, SDValue &Arg, + RegsToPassVector &RegsToPass, + CCValAssign &VA, CCValAssign &NextVA, + SDValue &StackPtr, + SmallVector<SDValue, 8> &MemOpChains, + ISD::ArgFlagsTy Flags) { + DebugLoc dl = TheCall->getDebugLoc(); + + SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Arg); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd)); + + if (NextVA.isRegLoc()) + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1))); + else { + assert(NextVA.isMemLoc()); + if (StackPtr.getNode() == 0) + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + + MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, NextVA, + Chain, fmrrd.getValue(1), Flags)); + } +} + /// LowerCALL - Lowering a ISD::CALL node into a callseq_start <- /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. @@ -651,7 +835,7 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { SDValue StackPtr = DAG.getRegister(ARM::SP, MVT::i32); - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + RegsToPassVector RegsToPass; SmallVector<SDValue, 8> MemOpChains; // Walk the register/memloc assignments, inserting copies/loads. 
In the case @@ -681,22 +865,32 @@ SDValue ARMTargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { break; } - // f64 is passed in i32 pairs and must be combined + // f64 and v2f64 are passed in i32 pairs and must be split into pieces if (VA.needsCustom()) { - SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl, - DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1); - RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd)); - VA = ArgLocs[++i]; // skip ahead to next loc - if (VA.isRegLoc()) - RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(1))); - else { - assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); - - MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA, - Chain, fmrrd.getValue(1), - Flags)); + if (VA.getLocVT() == MVT::v2f64) { + SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, MVT::i32)); + SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, MVT::i32)); + + PassF64ArgInRegs(TheCall, DAG, Chain, Op0, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + + VA = ArgLocs[++i]; // skip ahead to next loc + if (VA.isRegLoc()) { + PassF64ArgInRegs(TheCall, DAG, Chain, Op1, RegsToPass, + VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + } else { + assert(VA.isMemLoc()); + if (StackPtr.getNode() == 0) + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + + MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA, + Chain, Op1, Flags)); + } + } else { + PassF64ArgInRegs(TheCall, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); @@ -864,9 +1058,28 @@ SDValue ARMTargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { break; } - // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is - // available. if (VA.needsCustom()) { + if (VA.getLocVT() == MVT::v2f64) { + // Extract the first half and return it in two registers. + SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, MVT::i32)); + SDValue HalfGPRs = DAG.getNode(ARMISD::FMRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Half); + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(1), Flag); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + + // Extract the 2nd half and fall through to handle it as an f64 value. + Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, MVT::i32)); + } + // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is + // available. 
SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag); @@ -1117,6 +1330,40 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, } SDValue +ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, + DebugLoc dl) { + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + TargetRegisterClass *RC; + if (AFI->isThumbFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; + + // Transform the arguments stored in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); + + SDValue ArgValue2; + if (NextVA.isMemLoc()) { + unsigned ArgSize = NextVA.getLocVT().getSizeInBits()/8; + MachineFrameInfo *MFI = MF.getFrameInfo(); + int FI = MFI->CreateFixedObject(ArgSize, NextVA.getLocMemOffset()); + + // Create load node to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0); + } else { + Reg = MF.addLiveIn(NextVA.getLocReg(), RC); + ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); + } + + return DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, ArgValue, ArgValue2); +} + +SDValue ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1141,47 +1388,45 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { // Arguments stored in registers. if (VA.isRegLoc()) { MVT RegVT = VA.getLocVT(); - TargetRegisterClass *RC; - if (AFI->isThumbFunction()) - RC = ARM::tGPRRegisterClass; - else - RC = ARM::GPRRegisterClass; - if (FloatABIType == FloatABI::Hard) { - if (RegVT == MVT::f32) - RC = ARM::SPRRegisterClass; - else if (RegVT == MVT::f64) - RC = ARM::DPRRegisterClass; - } else if (RegVT == MVT::f64) { - // f64 is passed in pairs of GPRs and must be combined. + SDValue ArgValue; + if (VA.needsCustom()) { + // f64 and vector types are split up into multiple registers or + // combinations of registers and stack slots. RegVT = MVT::i32; - } else if (!((RegVT == MVT::i32) || (RegVT == MVT::f32))) - assert(0 && "RegVT not supported by FORMAL_ARGUMENTS Lowering"); - // Transform the arguments stored in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT); + if (VA.getLocVT() == MVT::v2f64) { + SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], + Root, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], + Root, DAG, dl); + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue1, DAG.getIntPtrConstant(0)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, + ArgValue, ArgValue2, DAG.getIntPtrConstant(1)); + } else + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Root, DAG, dl); - // f64 is passed in i32 pairs and must be combined. 
- if (VA.needsCustom()) { - SDValue ArgValue2; + } else { + TargetRegisterClass *RC; + if (FloatABIType == FloatABI::Hard && RegVT == MVT::f32) + RC = ARM::SPRRegisterClass; + else if (FloatABIType == FloatABI::Hard && RegVT == MVT::f64) + RC = ARM::DPRRegisterClass; + else if (AFI->isThumbFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; - VA = ArgLocs[++i]; // skip ahead to next loc - if (VA.isMemLoc()) { - // must be APCS to split like this - unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; - int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset()); - - // Create load node to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0); - } else { - Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); - } + assert((RegVT == MVT::i32 || RegVT == MVT::f32 || + (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)) && + "RegVT not supported by FORMAL_ARGUMENTS Lowering"); - ArgValue = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, - ArgValue, ArgValue2); + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted @@ -1638,8 +1883,78 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); } -static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { - assert(N->getValueType(0) == MVT::i64 && +/// getZeroVector - Returns a vector of specified type with all zero elements. +/// +static SDValue getZeroVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + // Zero vectors are used to represent vector negation and in those cases + // will be implemented with the NEON VNEG instruction. However, VNEG does + // not support i64 elements, so sometimes the zero vectors will need to be + // explicitly constructed. For those cases, and potentially other uses in + // the future, always build zero vectors as <4 x i32> or <2 x i32> bitcasted + // to their dest type. This ensures they get CSE'd. + SDValue Vec; + SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + if (VT.getSizeInBits() == 64) + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); + else + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); +} + +/// getOnesVector - Returns a vector of specified type with all bits set. +/// +static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest + // type. This ensures they get CSE'd. + SDValue Vec; + SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); + if (VT.getSizeInBits() == 64) + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); + else + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); +} + +static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + MVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); + + // Lower vector shifts on NEON to use VSHL. 
+ if (VT.isVector()) { + assert(ST->hasNEON() && "unexpected vector shift"); + + // Left shifts translate directly to the vshiftu intrinsic. + if (N->getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32), + N->getOperand(0), N->getOperand(1)); + + assert((N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + + // NEON uses the same intrinsics for both left and right shifts. For + // right shifts, the shift amounts are negative, so negate the vector of + // shift amounts. + MVT ShiftVT = N->getOperand(1).getValueType(); + SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, + getZeroVector(ShiftVT, DAG, dl), + N->getOperand(1)); + Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? + Intrinsic::arm_neon_vshifts : + Intrinsic::arm_neon_vshiftu); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(vshiftInt, MVT::i32), + N->getOperand(0), NegatedCount); + } + + assert(VT == MVT::i64 && (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && "Unknown shift to lower!"); @@ -1652,7 +1967,6 @@ static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { if (ST->isThumb()) return SDValue(); // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. - DebugLoc dl = N->getDebugLoc(); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), @@ -1670,6 +1984,273 @@ static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } +static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { + SDValue TmpOp0, TmpOp1; + bool Invert = false; + bool Swap = false; + unsigned Opc = 0; + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getValueType(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + DebugLoc dl = Op.getDebugLoc(); + + if (Op.getOperand(1).getValueType().isFloatingPoint()) { + switch (SetCCOpcode) { + default: assert(0 && "Illegal FP comparison"); break; + case ISD::SETUNE: + case ISD::SETNE: Invert = true; // Fallthrough + case ISD::SETOEQ: + case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETOLT: + case ISD::SETLT: Swap = true; // Fallthrough + case ISD::SETOGT: + case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETOLE: + case ISD::SETLE: Swap = true; // Fallthrough + case ISD::SETOGE: + case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETUGE: Swap = true; // Fallthrough + case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; + case ISD::SETUGT: Swap = true; // Fallthrough + case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; + case ISD::SETUEQ: Invert = true; // Fallthrough + case ISD::SETONE: + // Expand this to (OLT | OGT). + TmpOp0 = Op0; + TmpOp1 = Op1; + Opc = ISD::OR; + Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1); + break; + case ISD::SETUO: Invert = true; // Fallthrough + case ISD::SETO: + // Expand this to (OLT | OGE). + TmpOp0 = Op0; + TmpOp1 = Op1; + Opc = ISD::OR; + Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1); + break; + } + } else { + // Integer comparisons. 
+ switch (SetCCOpcode) { + default: assert(0 && "Illegal integer comparison"); break; + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = ARMISD::VCEQ; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = ARMISD::VCGT; break; + case ISD::SETLE: Swap = true; + case ISD::SETGE: Opc = ARMISD::VCGE; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = ARMISD::VCGTU; break; + case ISD::SETULE: Swap = true; + case ISD::SETUGE: Opc = ARMISD::VCGEU; break; + } + + // Detect VTST (Vector Test Bits) = vicmp ne (and (op0, op1), zero). + if (Opc == ARMISD::VCEQ) { + + SDValue AndOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + AndOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) + AndOp = Op1; + + // Ignore bitconvert. + if (AndOp.getNode() && AndOp.getOpcode() == ISD::BIT_CONVERT) + AndOp = AndOp.getOperand(0); + + if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { + Opc = ARMISD::VTST; + Op0 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(0)); + Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(1)); + Invert = !Invert; + } + } + } + + if (Swap) + std::swap(Op0, Op1); + + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + + return Result; +} + +/// isVMOVSplat - Check if the specified splat value corresponds to an immediate +/// VMOV instruction, and if so, return the constant being splatted. +static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG) { + switch (SplatBitSize) { + case 8: + // Any 1-byte value is OK. + assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); + return DAG.getTargetConstant(SplatBits, MVT::i8); + + case 16: + // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. + if ((SplatBits & ~0xff) == 0 || + (SplatBits & ~0xff00) == 0) + return DAG.getTargetConstant(SplatBits, MVT::i16); + break; + + case 32: + // NEON's 32-bit VMOV supports splat values where: + // * only one byte is nonzero, or + // * the least significant byte is 0xff and the second byte is nonzero, or + // * the least significant 2 bytes are 0xff and the third is nonzero. + if ((SplatBits & ~0xff) == 0 || + (SplatBits & ~0xff00) == 0 || + (SplatBits & ~0xff0000) == 0 || + (SplatBits & ~0xff000000) == 0) + return DAG.getTargetConstant(SplatBits, MVT::i32); + + if ((SplatBits & ~0xffff) == 0 && + ((SplatBits | SplatUndef) & 0xff) == 0xff) + return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32); + + if ((SplatBits & ~0xffffff) == 0 && + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) + return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32); + + // Note: there are a few 32-bit splat values (specifically: 00ffff00, + // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not + // VMOV.I32. A (very) minor optimization would be to replicate the value + // and fall through here to test for a valid 64-bit splat. But, then the + // caller would also need to check and handle the change in size. + break; + + case 64: { + // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 
+ uint64_t BitMask = 0xff; + uint64_t Val = 0; + for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { + if (((SplatBits | SplatUndef) & BitMask) == BitMask) + Val |= BitMask; + else if ((SplatBits & BitMask) != 0) + return SDValue(); + BitMask <<= 8; + } + return DAG.getTargetConstant(Val, MVT::i64); + } + + default: + assert(0 && "unexpected size for isVMOVSplat"); + break; + } + + return SDValue(); +} + +/// getVMOVImm - If this is a build_vector of constants which can be +/// formed by using a VMOV instruction of the specified element size, +/// return the constant being splatted. The ByteSize field indicates the +/// number of bytes of each element [1248]. +SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ByteSize * 8)) + return SDValue(); + + if (SplatBitSize > ByteSize * 8) + return SDValue(); + + return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG); +} + +static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) { + // Canonicalize all-zeros and all-ones vectors. + ConstantSDNode *ConstVal = dyn_cast<ConstantSDNode>(Val.getNode()); + if (ConstVal->isNullValue()) + return getZeroVector(VT, DAG, dl); + if (ConstVal->isAllOnesValue()) + return getOnesVector(VT, DAG, dl); + + MVT CanonicalVT; + if (VT.is64BitVector()) { + switch (Val.getValueType().getSizeInBits()) { + case 8: CanonicalVT = MVT::v8i8; break; + case 16: CanonicalVT = MVT::v4i16; break; + case 32: CanonicalVT = MVT::v2i32; break; + case 64: CanonicalVT = MVT::v1i64; break; + default: assert(0 && "unexpected splat element type"); break; + } + } else { + assert(VT.is128BitVector() && "unknown splat vector size"); + switch (Val.getValueType().getSizeInBits()) { + case 8: CanonicalVT = MVT::v16i8; break; + case 16: CanonicalVT = MVT::v8i16; break; + case 32: CanonicalVT = MVT::v4i32; break; + case 64: CanonicalVT = MVT::v2i64; break; + default: assert(0 && "unexpected splat element type"); break; + } + } + + // Build a canonical splat for this value. + SmallVector<SDValue, 8> Ops; + Ops.assign(CanonicalVT.getVectorNumElements(), Val); + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, &Ops[0], + Ops.size()); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Res); +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. 
+static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); + DebugLoc dl = Op.getDebugLoc(); + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, DAG); + if (Val.getNode()) + return BuildSplat(Val, Op.getValueType(), DAG, dl); + } + + return SDValue(); +} + +static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { + return Op; +} + +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { + return Op; +} + +static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + assert((VT == MVT::i8 || VT == MVT::i16) && + "unexpected type for custom-lowering vector extract"); + SDValue Vec = Op.getOperand(0); + SDValue Lane = Op.getOperand(1); + Op = DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); + Op = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Op, DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op) { + if (Op.getValueType().is128BitVector() && Op.getNumOperands() == 2) + return Op; + return SDValue(); +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { switch (Op.getOpcode()) { default: assert(0 && "Don't know how to custom lower this!"); abort(); @@ -1695,8 +2276,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG); + case ISD::SHL: case ISD::SRL: - case ISD::SRA: return ExpandSRx(Op.getNode(), DAG,Subtarget); + case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); + case ISD::VSETCC: return LowerVSETCC(Op, DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op); } return SDValue(); } @@ -1715,7 +2303,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::SRL: case ISD::SRA: { - SDValue Res = ExpandSRx(N, DAG, Subtarget); + SDValue Res = LowerShift(N, DAG, Subtarget); if (Res.getNode()) Results.push_back(Res); return; @@ -1900,6 +2488,294 @@ static SDValue PerformFMRRDCombine(SDNode *N, return SDValue(); } +/// getVShiftImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BIT_CONVERT) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (! BVN || ! 
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (! getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, MVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (! getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (isIntrinsic) + Cnt = -Cnt; + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); +} + +/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. +static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + // Don't do anything for most intrinsics. + break; + + // Vector shifts: check for immediate versions and lower them. + // Note: This is done during DAG combining instead of DAG legalizing because + // the build_vectors for 64-bit vector element shift counts are generally + // not legal, and it is hard to see their values after they get legalized to + // loads from a constant pool. + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + case Intrinsic::arm_neon_vshiftn: + case Intrinsic::arm_neon_vrshifts: + case Intrinsic::arm_neon_vrshiftu: + case Intrinsic::arm_neon_vrshiftn: + case Intrinsic::arm_neon_vqshifts: + case Intrinsic::arm_neon_vqshiftu: + case Intrinsic::arm_neon_vqshiftsu: + case Intrinsic::arm_neon_vqshiftns: + case Intrinsic::arm_neon_vqshiftnu: + case Intrinsic::arm_neon_vqshiftnsu: + case Intrinsic::arm_neon_vqrshiftns: + case Intrinsic::arm_neon_vqrshiftnu: + case Intrinsic::arm_neon_vqrshiftnsu: { + MVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + unsigned VShiftOpc = 0; + + switch (IntNo) { + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { + VShiftOpc = ARMISD::VSHL; + break; + } + if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? 
+ ARMISD::VSHRs : ARMISD::VSHRu); + break; + } + return SDValue(); + + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + if (isVShiftLImm(N->getOperand(2), VT, true, Cnt)) + break; + assert(0 && "invalid shift count for vshll intrinsic"); + abort(); + + case Intrinsic::arm_neon_vrshifts: + case Intrinsic::arm_neon_vrshiftu: + if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) + break; + return SDValue(); + + case Intrinsic::arm_neon_vqshifts: + case Intrinsic::arm_neon_vqshiftu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) + break; + return SDValue(); + + case Intrinsic::arm_neon_vqshiftsu: + if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) + break; + assert(0 && "invalid shift count for vqshlu intrinsic"); + abort(); + + case Intrinsic::arm_neon_vshiftn: + case Intrinsic::arm_neon_vrshiftn: + case Intrinsic::arm_neon_vqshiftns: + case Intrinsic::arm_neon_vqshiftnu: + case Intrinsic::arm_neon_vqshiftnsu: + case Intrinsic::arm_neon_vqrshiftns: + case Intrinsic::arm_neon_vqrshiftnu: + case Intrinsic::arm_neon_vqrshiftnsu: + // Narrowing shifts require an immediate right shift. + if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) + break; + assert(0 && "invalid shift count for narrowing vector shift intrinsic"); + abort(); + + default: + assert(0 && "unhandled vector shift"); + } + + switch (IntNo) { + case Intrinsic::arm_neon_vshifts: + case Intrinsic::arm_neon_vshiftu: + // Opcode already set above. + break; + case Intrinsic::arm_neon_vshiftls: + case Intrinsic::arm_neon_vshiftlu: + if (Cnt == VT.getVectorElementType().getSizeInBits()) + VShiftOpc = ARMISD::VSHLLi; + else + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ? + ARMISD::VSHLLs : ARMISD::VSHLLu); + break; + case Intrinsic::arm_neon_vshiftn: + VShiftOpc = ARMISD::VSHRN; break; + case Intrinsic::arm_neon_vrshifts: + VShiftOpc = ARMISD::VRSHRs; break; + case Intrinsic::arm_neon_vrshiftu: + VShiftOpc = ARMISD::VRSHRu; break; + case Intrinsic::arm_neon_vrshiftn: + VShiftOpc = ARMISD::VRSHRN; break; + case Intrinsic::arm_neon_vqshifts: + VShiftOpc = ARMISD::VQSHLs; break; + case Intrinsic::arm_neon_vqshiftu: + VShiftOpc = ARMISD::VQSHLu; break; + case Intrinsic::arm_neon_vqshiftsu: + VShiftOpc = ARMISD::VQSHLsu; break; + case Intrinsic::arm_neon_vqshiftns: + VShiftOpc = ARMISD::VQSHRNs; break; + case Intrinsic::arm_neon_vqshiftnu: + VShiftOpc = ARMISD::VQSHRNu; break; + case Intrinsic::arm_neon_vqshiftnsu: + VShiftOpc = ARMISD::VQSHRNsu; break; + case Intrinsic::arm_neon_vqrshiftns: + VShiftOpc = ARMISD::VQRSHRNs; break; + case Intrinsic::arm_neon_vqrshiftnu: + VShiftOpc = ARMISD::VQRSHRNu; break; + case Intrinsic::arm_neon_vqrshiftnsu: + VShiftOpc = ARMISD::VQRSHRNsu; break; + } + + return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); + } + + case Intrinsic::arm_neon_vshiftins: { + MVT VT = N->getOperand(1).getValueType(); + int64_t Cnt; + unsigned VShiftOpc = 0; + + if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) + VShiftOpc = ARMISD::VSLI; + else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) + VShiftOpc = ARMISD::VSRI; + else { + assert(0 && "invalid shift count for vsli/vsri intrinsic"); + abort(); + } + + return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0), + N->getOperand(1), N->getOperand(2), + DAG.getConstant(Cnt, MVT::i32)); + } + + case Intrinsic::arm_neon_vqrshifts: + case Intrinsic::arm_neon_vqrshiftu: + // No immediate versions of these to check for. 
+ break; + } + + return SDValue(); +} + +/// PerformShiftCombine - Checks for immediate versions of vector shifts and +/// lowers them. As with the vector shift intrinsics, this is done during DAG +/// combining instead of DAG legalizing because the build_vectors for 64-bit +/// vector element shift counts are generally not legal, and it is hard to see +/// their values after they get legalized to loads from a constant pool. +static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + MVT VT = N->getValueType(0); + + // Nothing to be done for scalar shifts. + if (! VT.isVector()) + return SDValue(); + + assert(ST->hasNEON() && "unexpected vector shift"); + int64_t Cnt; + + switch (N->getOpcode()) { + default: assert(0 && "unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) + return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + break; + + case ISD::SRA: + case ISD::SRL: + if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { + unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? + ARMISD::VSHRs : ARMISD::VSHRu); + return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + } + } + return SDValue(); +} + +/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, +/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. +static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + SDValue N0 = N->getOperand(0); + + // Check for sign- and zero-extensions of vector extract operations of 8- + // and 16-bit vector elements. NEON supports these directly. They are + // handled during DAG combining because type legalization will promote them + // to 32-bit types and it is messy to recognize the operations after that. 
+ if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue Vec = N0.getOperand(0); + SDValue Lane = N0.getOperand(1); + MVT VT = N->getValueType(0); + MVT EltVT = N0.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (VT == MVT::i32 && + (EltVT == MVT::i8 || EltVT == MVT::i16) && + TLI.isTypeLegal(Vec.getValueType())) { + + unsigned Opc = 0; + switch (N->getOpcode()) { + default: assert(0 && "unexpected opcode"); + case ISD::SIGN_EXTEND: + Opc = ARMISD::VGETLANEs; + break; + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + Opc = ARMISD::VGETLANEu; + break; + } + return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane); + } + } + + return SDValue(); +} + SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -1907,8 +2783,17 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ADD: return PerformADDCombine(N, DCI); case ISD::SUB: return PerformSUBCombine(N, DCI); case ARMISD::FMRRD: return PerformFMRRDCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: + return PerformIntrinsicCombine(N, DCI.DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + return PerformShiftCombine(N, DCI.DAG, Subtarget); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + return PerformExtendCombine(N, DCI.DAG, Subtarget); } - return SDValue(); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 8f53e396ea..631e37f90b 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -67,10 +67,65 @@ namespace llvm { EH_SJLJ_SETJMP, // SjLj exception handling setjmp EH_SJLJ_LONGJMP, // SjLj exception handling longjmp - THREAD_POINTER + THREAD_POINTER, + + VCEQ, // Vector compare equal. + VCGE, // Vector compare greater than or equal. + VCGEU, // Vector compare unsigned greater than or equal. + VCGT, // Vector compare greater than. + VCGTU, // Vector compare unsigned greater than. + VTST, // Vector test bits. + + // Vector shift by immediate: + VSHL, // ...left + VSHRs, // ...right (signed) + VSHRu, // ...right (unsigned) + VSHLLs, // ...left long (signed) + VSHLLu, // ...left long (unsigned) + VSHLLi, // ...left long (with maximum shift count) + VSHRN, // ...right narrow + + // Vector rounding shift by immediate: + VRSHRs, // ...right (signed) + VRSHRu, // ...right (unsigned) + VRSHRN, // ...right narrow + + // Vector saturating shift by immediate: + VQSHLs, // ...left (signed) + VQSHLu, // ...left (unsigned) + VQSHLsu, // ...left (signed to unsigned) + VQSHRNs, // ...right narrow (signed) + VQSHRNu, // ...right narrow (unsigned) + VQSHRNsu, // ...right narrow (signed to unsigned) + + // Vector saturating rounding shift by immediate: + VQRSHRNs, // ...right narrow (signed) + VQRSHRNu, // ...right narrow (unsigned) + VQRSHRNsu, // ...right narrow (signed to unsigned) + + // Vector shift and insert: + VSLI, // ...left + VSRI, // ...right + + // Vector get lane (VMOV scalar to ARM core register) + // (These are used for 8- and 16-bit element types only.) + VGETLANEu, // zero-extend vector extract element + VGETLANEs, // sign-extend vector extract element + + // Vector duplicate lane (128-bit result only; 64-bit is a shuffle) + VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector }; } + /// Define some predicates that are used for node matching. 
+ namespace ARM { + /// getVMOVImm - If this is a build_vector of constants which can be + /// formed by using a VMOV instruction of the specified element size, + /// return the constant being splatted. The ByteSize field indicates the + /// number of bytes of each element [1248]. + SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); + } + //===--------------------------------------------------------------------===// // ARMTargetLowering - ARM Implementation of the TargetLowering interface @@ -151,6 +206,21 @@ namespace llvm { /// unsigned ARMPCLabelIndex; + void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT); + void addDRTypeForNEON(MVT VT); + void addQRTypeForNEON(MVT VT); + + typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector; + void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG, + SDValue Chain, SDValue &Arg, + RegsToPassVector &RegsToPass, + CCValAssign &VA, CCValAssign &NextVA, + SDValue &StackPtr, + SmallVector<SDValue, 8> &MemOpChains, + ISD::ArgFlagsTy Flags); + SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, DebugLoc dl); + CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const; SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, const SDValue &StackPtr, const CCValAssign &VA, diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 9a1e1c2bb7..14cca7a6e5 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -49,6 +49,11 @@ def VFPMiscFrm : Format<22>; def ThumbFrm : Format<23>; +def NEONFrm : Format<24>; +def NEONGetLnFrm : Format<25>; +def NEONSetLnFrm : Format<26>; +def NEONDupFrm : Format<27>; + // Misc flag for data processing instructions that indicates whether // the instruction has a Rn register operand. class UnaryDP { bit isUnaryDataProc = 1; } @@ -737,6 +742,14 @@ class TIx2<dag outs, dag ins, string asm, list<dag> pattern> class TJTI<dag outs, dag ins, string asm, list<dag> pattern> : ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>; +// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode. +class ThumbPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb]; +} + +class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb, HasV5T]; +} //===----------------------------------------------------------------------===// @@ -857,12 +870,102 @@ class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc, //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// ARM NEON Instruction templates. +// -// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode. 
-class ThumbPat<dag pattern, dag result> : Pat<pattern, result> { - list<Predicate> Predicates = [IsThumb]; -} - -class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> { - list<Predicate> Predicates = [IsThumb, HasV5T]; -} +class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm, + string cstr, list<dag> pattern> + : InstARM<am, Size4Bytes, im, NEONFrm, cstr> { + let OutOperandList = oops; + let InOperandList = iops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [HasNEON]; +} + +class NI<dag oops, dag iops, string asm, list<dag> pattern> + : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, "", pattern> { +} + +class NDataI<dag oops, dag iops, string asm, string cstr, list<dag> pattern> + : NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, cstr, pattern> { + let Inst{31-25} = 0b1111001; +} + +// NEON "one register and a modified immediate" format. +class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, + bit op5, bit op4, + dag oops, dag iops, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, asm, cstr, pattern> { + let Inst{23} = op23; + let Inst{21-19} = op21_19; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{5} = op5; + let Inst{4} = op4; +} + +// NEON 2 vector register format. +class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, asm, cstr, pattern> { + let Inst{24-23} = op24_23; + let Inst{21-20} = op21_20; + let Inst{19-18} = op19_18; + let Inst{17-16} = op17_16; + let Inst{11-7} = op11_7; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// NEON 2 vector register with immediate. +class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op6, bit op4, + dag oops, dag iops, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{21-16} = op21_16; + let Inst{11-8} = op11_8; + let Inst{7} = op7; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// NEON 3 vector register format. +class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, string asm, string cstr, list<dag> pattern> + : NDataI<oops, iops, asm, cstr, pattern> { + let Inst{24} = op24; + let Inst{23} = op23; + let Inst{21-20} = op21_20; + let Inst{11-8} = op11_8; + let Inst{6} = op6; + let Inst{4} = op4; +} + +// NEON VMOVs between scalar and core registers. 
+class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, Format f, string opc, string asm, + list<dag> pattern> + : AI<oops, iops, f, opc, asm, pattern> { + let Inst{27-20} = opcod1; + let Inst{11-8} = opcod2; + let Inst{6-5} = opcod3; + let Inst{4} = 1; + list<Predicate> Predicates = [HasNEON]; +} +class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, string opc, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, opc, asm, + pattern>; +class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, string opc, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, opc, asm, + pattern>; +class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, + dag oops, dag iops, string opc, string asm, list<dag> pattern> + : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, opc, asm, pattern>; diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index d19fb8eea8..e8da927624 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -59,6 +59,8 @@ bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI, return false; case ARM::FCPYS: case ARM::FCPYD: + case ARM::VMOVD: + case ARM::VMOVQ: SrcReg = MI.getOperand(1).getReg(); DstReg = MI.getOperand(0).getReg(); return true; @@ -528,6 +530,8 @@ bool ARMInstrInfo::copyRegToReg(MachineBasicBlock &MBB, else if (DestRC == ARM::DPRRegisterClass) AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg) .addReg(SrcReg)); + else if (DestRC == ARM::QPRRegisterClass) + BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg); else return false; @@ -844,6 +848,10 @@ canFoldMemoryOperand(const MachineInstr *MI, case ARM::FCPYS: case ARM::FCPYD: return true; + + case ARM::VMOVD: + case ARM::VMOVQ: + return false; // FIXME } return false; diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index 13ff3fea84..9658f3bcba 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -114,6 +114,12 @@ namespace ARMII { // Thumb format ThumbFrm = 23 << FormShift, + // NEON format + NEONFrm = 24 << FormShift, + NEONGetLnFrm = 25 << FormShift, + NEONSetLnFrm = 26 << FormShift, + NEONDupFrm = 27 << FormShift, + //===------------------------------------------------------------------===// // Field shifts - such shifts are used to set field while generating // machine instructions. 
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 58e3df5907..ccdfee5563 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -93,6 +93,10 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>; def HasV5T : Predicate<"Subtarget->hasV5TOps()">; def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">; def HasV6 : Predicate<"Subtarget->hasV6Ops()">; +def HasV7 : Predicate<"Subtarget->hasV7Ops()">; +def HasVFP2 : Predicate<"Subtarget->hasVFP2()">; +def HasVFP3 : Predicate<"Subtarget->hasVFP3()">; +def HasNEON : Predicate<"Subtarget->hasNEON()">; def IsThumb : Predicate<"Subtarget->isThumb()">; def HasThumb2 : Predicate<"Subtarget->hasThumb2()">; def IsARM : Predicate<"!Subtarget->isThumb()">; @@ -1437,3 +1441,9 @@ include "ARMInstrThumb2.td" // include "ARMInstrVFP.td" + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "ARMInstrNEON.td" diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td new file mode 100644 index 0000000000..a62597bad8 --- /dev/null +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -0,0 +1,1665 @@ +//===- ARMInstrNEON.td - NEON support for ARM -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM NEON instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// NEON-specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>; + +def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>; +def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>; +def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>; +def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>; +def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>; +def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>; + +// Types for vector shift by immediates. The "SHX" version is for long and +// narrow operations where the source and destination vectors have different +// types. The "SHINS" version is for shift and insert operations. 
+def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>; + +def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>; +def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>; +def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>; +def NEONvshlls : SDNode<"ARMISD::VSHLLs", SDTARMVSHX>; +def NEONvshllu : SDNode<"ARMISD::VSHLLu", SDTARMVSHX>; +def NEONvshlli : SDNode<"ARMISD::VSHLLi", SDTARMVSHX>; +def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>; + +def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>; +def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>; +def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>; + +def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>; +def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>; +def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>; +def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>; +def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>; +def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>; + +def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>; +def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>; +def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>; + +def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>; +def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>; + +def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; +def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; +def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; + +def NEONvduplaneq : SDNode<"ARMISD::VDUPLANEQ", + SDTypeProfile<1, 2, [SDTCisVT<2, i32>]>>; + +//===----------------------------------------------------------------------===// +// NEON operand definitions +//===----------------------------------------------------------------------===// + +// addrmode_neonldstm := reg +// +/* TODO: Take advantage of vldm. +def addrmode_neonldstm : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrModeNeonLdStM", []> { + let PrintMethod = "printAddrNeonLdStMOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} +*/ + +//===----------------------------------------------------------------------===// +// NEON load / store instructions +//===----------------------------------------------------------------------===// + +/* TODO: Take advantage of vldm. +let mayLoad = 1 in { +def VLDMD : NI<(outs), + (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops), + "vldm${addr:submode} ${addr:base}, $dst1", + []>; + +def VLDMS : NI<(outs), + (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops), + "vldm${addr:submode} ${addr:base}, $dst1", + []>; +} +*/ + +// Use vldmia to load a Q register as a D register pair. +def VLDRQ : NI<(outs QPR:$dst), (ins GPR:$addr), + "vldmia $addr, ${dst:dregpair}", + [(set QPR:$dst, (v2f64 (load GPR:$addr)))]>; + +// Use vstmia to store a Q register as a D register pair. +def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr), + "vstmia $addr, ${src:dregpair}", + [(store (v2f64 QPR:$src), GPR:$addr)]>; + + +//===----------------------------------------------------------------------===// +// NEON pattern fragments +//===----------------------------------------------------------------------===// + +// Extract D sub-registers of Q registers. 
+// (arm_dsubreg_0 is 5; arm_dsubreg_1 is 6) +def SubReg_i8_reg : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(5 + N->getZExtValue() / 8, MVT::i32); +}]>; +def SubReg_i16_reg : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(5 + N->getZExtValue() / 4, MVT::i32); +}]>; +def SubReg_i32_reg : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(5 + N->getZExtValue() / 2, MVT::i32); +}]>; +def SubReg_f64_reg : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(5 + N->getZExtValue(), MVT::i32); +}]>; + +// Translate lane numbers from Q registers to D subregs. +def SubReg_i8_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 7, MVT::i32); +}]>; +def SubReg_i16_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 3, MVT::i32); +}]>; +def SubReg_i32_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 1, MVT::i32); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction Classes +//===----------------------------------------------------------------------===// + +// Basic 2-register operations, both double- and quad-register. +class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), + (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>; +class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, SDNode OpNode> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), + (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>; + +// Basic 2-register intrinsics, both double- and quad-register. +class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), + (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>; +class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), + (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; + +// Narrow 2-register intrinsics. +class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, + string OpcodeStr, ValueType TyD, ValueType TyQ, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst), + (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>; + +// Long 2-register intrinsics. (This is currently only used for VMOVL and is +// derived from N2VImm instead of N2V because of the way the size is encoded.) 
+class N2VLInt<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op6, bit op4, string OpcodeStr, ValueType TyQ, ValueType TyD, + Intrinsic IntOp> + : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, (outs QPR:$dst), + (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>; + +// Basic 3-register operations, both double- and quad-register. +class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", + [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { + let isCommutable = Commutable; +} +class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + SDNode OpNode, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", + [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { + let isCommutable = Commutable; +} + +// Basic 3-register intrinsics, both double- and quad-register. +class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { + let isCommutable = Commutable; +} +class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { + let isCommutable = Commutable; +} + +// Multiply-Add/Sub operations, both double- and quad-register. +class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), + !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", + [(set DPR:$dst, (Ty (OpNode DPR:$src1, + (Ty (MulOp DPR:$src2, DPR:$src3)))))]>; +class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), + !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", + [(set QPR:$dst, (Ty (OpNode QPR:$src1, + (Ty (MulOp QPR:$src2, QPR:$src3)))))]>; + +// Neon 3-argument intrinsics, both double- and quad-register. +// The destination register is also used as the first source operand register. 
+class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), + !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), + (OpTy DPR:$src2), (OpTy DPR:$src3))))]>; +class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), + !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), + (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; + +// Neon Long 3-argument intrinsic. The destination register is +// a quad-register and is also used as the first source operand register. +class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType TyQ, ValueType TyD, Intrinsic IntOp> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), + !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst", + [(set QPR:$dst, + (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>; + +// Narrowing 3-register intrinsics. +class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType TyD, ValueType TyQ, + Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", + [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> { + let isCommutable = Commutable; +} + +// Long 3-register intrinsics. +class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType TyQ, ValueType TyD, + Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", + [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> { + let isCommutable = Commutable; +} + +// Wide 3-register intrinsics. +class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, + string OpcodeStr, ValueType TyQ, ValueType TyD, + Intrinsic IntOp, bit Commutable> + : N3V<op24, op23, op21_20, op11_8, 0, op4, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "", + [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> { + let isCommutable = Commutable; +} + +// Pairwise long 2-register intrinsics, both double- and quad-register. 
+class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), + (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>; +class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), + (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>; + +// Pairwise long 2-register accumulate intrinsics, +// both double- and quad-register. +// The destination register is also used as the first source operand register. +class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst", + [(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>; +class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, + bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, + ValueType ResTy, ValueType OpTy, Intrinsic IntOp> + : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), + !strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst", + [(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>; + +// Shift by immediate, +// both double- and quad-register. +class N2VDSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode> + : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4, + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", + [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>; +class N2VQSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode> + : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4, + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", + [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>; + +// Long shift by immediate. +class N2VLSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op6, bit op4, string OpcodeStr, ValueType ResTy, + ValueType OpTy, SDNode OpNode> + : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, + (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", + [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src), + (i32 imm:$SIMM))))]>; + +// Narrow shift by immediate. +class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op6, bit op4, string OpcodeStr, ValueType ResTy, + ValueType OpTy, SDNode OpNode> + : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, + (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", + [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src), + (i32 imm:$SIMM))))]>; + +// Shift right by immediate and accumulate, +// both double- and quad-register. 
+class N2VDShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst", + [(set DPR:$dst, (Ty (add DPR:$src1, + (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>; +class N2VQShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst", + [(set QPR:$dst, (Ty (add QPR:$src1, + (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>; + +// Shift by immediate and insert, +// both double- and quad-register. +class N2VDShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst", + [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>; +class N2VQShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp> + : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst", + [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>; + +// Convert, with fractional bits immediate, +// both double- and quad-register. +class N2VCvtD<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4, + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", + [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>; +class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7, + bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy, + Intrinsic IntOp> + : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4, + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), + !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "", + [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>; + +//===----------------------------------------------------------------------===// +// Multiclasses +//===----------------------------------------------------------------------===// + +// Neon 3-register vector operations. + +// First with only element sizes of 8, 16 and 32 bits: +multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, SDNode OpNode, bit Commutable = 0> { + // 64-bit vector types. + def v8i8 : N3VD<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), + v8i8, v8i8, OpNode, Commutable>; + def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"), + v4i16, v4i16, OpNode, Commutable>; + def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"), + v2i32, v2i32, OpNode, Commutable>; + + // 128-bit vector types. 
+ def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), + v16i8, v16i8, OpNode, Commutable>; + def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr, "16"), + v8i16, v8i16, OpNode, Commutable>; + def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr, "32"), + v4i32, v4i32, OpNode, Commutable>; +} + +// ....then also with element size 64 bits: +multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, SDNode OpNode, bit Commutable = 0> + : N3V_QHS<op24, op23, op11_8, op4, OpcodeStr, OpNode, Commutable> { + def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"), + v1i64, v1i64, OpNode, Commutable>; + def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr, "64"), + v2i64, v2i64, OpNode, Commutable>; +} + + +// Neon Narrowing 2-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op6, bit op4, string OpcodeStr, + Intrinsic IntOp> { + def v8i8 : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4, + !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>; + def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4, + !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>; + def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4, + !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>; +} + + +// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL). +// source operand element sizes of 16, 32 and 64 bits: +multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, + bit op4, string OpcodeStr, Intrinsic IntOp> { + def v8i16 : N2VLInt<op24, op23, 0b001000, op11_8, op7, op6, op4, + !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>; + def v4i32 : N2VLInt<op24, op23, 0b010000, op11_8, op7, op6, op4, + !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>; + def v2i64 : N2VLInt<op24, op23, 0b100000, op11_8, op7, op6, op4, + !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>; +} + + +// Neon 3-register vector intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> { + // 64-bit vector types. + def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"), + v4i16, v4i16, IntOp, Commutable>; + def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"), + v2i32, v2i32, IntOp, Commutable>; + + // 128-bit vector types. 
+ def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"), + v8i16, v8i16, IntOp, Commutable>; + def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"), + v4i32, v4i32, IntOp, Commutable>; +} + +// ....then also with element size of 8 bits: +multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> + : N3VInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> { + def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), + v8i8, v8i8, IntOp, Commutable>; + def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), + v16i8, v16i8, IntOp, Commutable>; +} + +// ....then also with element size of 64 bits: +multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> + : N3VInt_QHS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> { + def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"), + v1i64, v1i64, IntOp, Commutable>; + def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, !strconcat(OpcodeStr,"64"), + v2i64, v2i64, IntOp, Commutable>; +} + + +// Neon Narrowing 3-register vector intrinsics, +// source operand element sizes of 16, 32 and 64 bits: +multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> { + def v8i8 : N3VNInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr,"16"), + v8i8, v8i16, IntOp, Commutable>; + def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"32"), + v4i16, v4i32, IntOp, Commutable>; + def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"64"), + v2i32, v2i64, IntOp, Commutable>; +} + + +// Neon Long 3-register vector intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> { + def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"), + v4i32, v4i16, IntOp, Commutable>; + def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"), + v2i64, v2i32, IntOp, Commutable>; +} + +// ....then also with element size of 8 bits: +multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> + : N3VLInt_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp, Commutable> { + def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), + v8i16, v8i8, IntOp, Commutable>; +} + + +// Neon Wide 3-register vector intrinsics, +// source operand element sizes of 8, 16 and 32 bits: +multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> { + def v8i16 : N3VWInt<op24, op23, 0b00, op11_8, op4, !strconcat(OpcodeStr, "8"), + v8i16, v8i8, IntOp, Commutable>; + def v4i32 : N3VWInt<op24, op23, 0b01, op11_8, op4, !strconcat(OpcodeStr,"16"), + v4i32, v4i16, IntOp, Commutable>; + def v2i64 : N3VWInt<op24, op23, 0b10, op11_8, op4, !strconcat(OpcodeStr,"32"), + v2i64, v2i32, IntOp, Commutable>; +} + + +// Neon Multiply-Op vector operations, +// element sizes of 8, 16 and 32 bits: +multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, SDNode OpNode> { + // 64-bit vector types. 
+ def v8i8 : N3VDMulOp<op24, op23, 0b00, op11_8, op4, + !strconcat(OpcodeStr, "8"), v8i8, mul, OpNode>; + def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, + !strconcat(OpcodeStr, "16"), v4i16, mul, OpNode>; + def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, + !strconcat(OpcodeStr, "32"), v2i32, mul, OpNode>; + + // 128-bit vector types. + def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, + !strconcat(OpcodeStr, "8"), v16i8, mul, OpNode>; + def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, + !strconcat(OpcodeStr, "16"), v8i16, mul, OpNode>; + def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, + !strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>; +} + + +// Neon 3-argument intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, + !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>; + def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, + !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>; + def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, + !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, + !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>; + def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, + !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>; + def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, + !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>; +} + + +// Neon Long 3-argument intrinsics. + +// First with only element sizes of 16 and 32 bits: +multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp> { + def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, + !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>; + def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, + !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>; +} + +// ....then also with element size of 8 bits: +multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, Intrinsic IntOp> + : N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp> { + def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4, + !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>; +} + + +// Neon 2-register vector intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, string OpcodeStr, + Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>; + def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>; + def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>; + def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>; + def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>; +} + + +// Neon Pairwise long 2-register intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + string OpcodeStr, Intrinsic IntOp> { + // 64-bit vector types. 
+ def v8i8 : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "8"), v4i16, v8i8, IntOp>; + def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "16"), v2i32, v4i16, IntOp>; + def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "32"), v1i64, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "8"), v8i16, v16i8, IntOp>; + def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "16"), v4i32, v8i16, IntOp>; + def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "32"), v2i64, v4i32, IntOp>; +} + + +// Neon Pairwise long 2-register accumulate intrinsics, +// element sizes of 8, 16 and 32 bits: +multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, + bits<5> op11_7, bit op4, + string OpcodeStr, Intrinsic IntOp> { + // 64-bit vector types. + def v8i8 : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "8"), v4i16, v8i8, IntOp>; + def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "16"), v2i32, v4i16, IntOp>; + def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "32"), v1i64, v2i32, IntOp>; + + // 128-bit vector types. + def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "8"), v8i16, v16i8, IntOp>; + def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "16"), v4i32, v8i16, IntOp>; + def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4, + !strconcat(OpcodeStr, "32"), v2i64, v4i32, IntOp>; +} + + +// Neon 2-register vector shift by immediate, +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4, + !strconcat(OpcodeStr, "8"), v8i8, OpNode>; + def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4, + !strconcat(OpcodeStr, "16"), v4i16, OpNode>; + def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4, + !strconcat(OpcodeStr, "32"), v2i32, OpNode>; + def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4, + !strconcat(OpcodeStr, "64"), v1i64, OpNode>; + + // 128-bit vector types. + def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4, + !strconcat(OpcodeStr, "8"), v16i8, OpNode>; + def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4, + !strconcat(OpcodeStr, "16"), v8i16, OpNode>; + def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4, + !strconcat(OpcodeStr, "32"), v4i32, OpNode>; + def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4, + !strconcat(OpcodeStr, "64"), v2i64, OpNode>; +} + + +// Neon Shift-Accumulate vector operations, +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, SDNode ShOp> { + // 64-bit vector types. 
+ def v8i8 : N2VDShAdd<op24, op23, 0b001000, op11_8, 0, op4, + !strconcat(OpcodeStr, "8"), v8i8, ShOp>; + def v4i16 : N2VDShAdd<op24, op23, 0b010000, op11_8, 0, op4, + !strconcat(OpcodeStr, "16"), v4i16, ShOp>; + def v2i32 : N2VDShAdd<op24, op23, 0b100000, op11_8, 0, op4, + !strconcat(OpcodeStr, "32"), v2i32, ShOp>; + def v1i64 : N2VDShAdd<op24, op23, 0b000000, op11_8, 1, op4, + !strconcat(OpcodeStr, "64"), v1i64, ShOp>; + + // 128-bit vector types. + def v16i8 : N2VQShAdd<op24, op23, 0b001000, op11_8, 0, op4, + !strconcat(OpcodeStr, "8"), v16i8, ShOp>; + def v8i16 : N2VQShAdd<op24, op23, 0b010000, op11_8, 0, op4, + !strconcat(OpcodeStr, "16"), v8i16, ShOp>; + def v4i32 : N2VQShAdd<op24, op23, 0b100000, op11_8, 0, op4, + !strconcat(OpcodeStr, "32"), v4i32, ShOp>; + def v2i64 : N2VQShAdd<op24, op23, 0b000000, op11_8, 1, op4, + !strconcat(OpcodeStr, "64"), v2i64, ShOp>; +} + + +// Neon Shift-Insert vector operations, +// element sizes of 8, 16, 32 and 64 bits: +multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr, SDNode ShOp> { + // 64-bit vector types. + def v8i8 : N2VDShIns<op24, op23, 0b001000, op11_8, 0, op4, + !strconcat(OpcodeStr, "8"), v8i8, ShOp>; + def v4i16 : N2VDShIns<op24, op23, 0b010000, op11_8, 0, op4, + !strconcat(OpcodeStr, "16"), v4i16, ShOp>; + def v2i32 : N2VDShIns<op24, op23, 0b100000, op11_8, 0, op4, + !strconcat(OpcodeStr, "32"), v2i32, ShOp>; + def v1i64 : N2VDShIns<op24, op23, 0b000000, op11_8, 1, op4, + !strconcat(OpcodeStr, "64"), v1i64, ShOp>; + + // 128-bit vector types. + def v16i8 : N2VQShIns<op24, op23, 0b001000, op11_8, 0, op4, + !strconcat(OpcodeStr, "8"), v16i8, ShOp>; + def v8i16 : N2VQShIns<op24, op23, 0b010000, op11_8, 0, op4, + !strconcat(OpcodeStr, "16"), v8i16, ShOp>; + def v4i32 : N2VQShIns<op24, op23, 0b100000, op11_8, 0, op4, + !strconcat(OpcodeStr, "32"), v4i32, ShOp>; + def v2i64 : N2VQShIns<op24, op23, 0b000000, op11_8, 1, op4, + !strconcat(OpcodeStr, "64"), v2i64, ShOp>; +} + +//===----------------------------------------------------------------------===// +// Instruction Definitions. +//===----------------------------------------------------------------------===// + +// Vector Add Operations. 
+ +// VADD : Vector Add (integer and floating-point) +defm VADD : N3V_QHSD<0, 0, 0b1000, 0, "vadd.i", add, 1>; +def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd, 1>; +def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, "vadd.f32", v4f32, v4f32, fadd, 1>; +// VADDL : Vector Add Long (Q = D + D) +defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, "vaddl.s", int_arm_neon_vaddls, 1>; +defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, "vaddl.u", int_arm_neon_vaddlu, 1>; +// VADDW : Vector Add Wide (Q = Q + D) +defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw.s", int_arm_neon_vaddws, 0>; +defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw.u", int_arm_neon_vaddwu, 0>; +// VHADD : Vector Halving Add +defm VHADDs : N3VInt_QHS<0,0,0b0000,0, "vhadd.s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1,0,0b0000,0, "vhadd.u", int_arm_neon_vhaddu, 1>; +// VRHADD : Vector Rounding Halving Add +defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, "vrhadd.s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, "vrhadd.u", int_arm_neon_vrhaddu, 1>; +// VQADD : Vector Saturating Add +defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, "vqadd.s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, "vqadd.u", int_arm_neon_vqaddu, 1>; +// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) +defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn.i", int_arm_neon_vaddhn, 1>; +// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) +defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn.i", int_arm_neon_vraddhn, 1>; + +// Vector Multiply Operations. + +// VMUL : Vector Multiply (integer, polynomial and floating-point) +defm VMUL : N3V_QHS<0, 0, 0b1001, 1, "vmul.i", mul, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v8i8, v8i8, + int_arm_neon_vmulp, 1>; +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v16i8, v16i8, + int_arm_neon_vmulp, 1>; +def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>; +def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>; +// VQDMULH : Vector Saturating Doubling Multiply Returning High Half +defm VQDMULH : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>; +// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half +defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>; +// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D) +defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>; +defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>; +def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8, + int_arm_neon_vmullp, 1>; +// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) +defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>; + +// Vector Multiply-Accumulate and Multiply-Subtract Operations. 
+ +// VMLA : Vector Multiply Accumulate (integer and floating-point) +defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>; +def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>; +def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>; +// VMLAL : Vector Multiply Accumulate Long (Q += D * D) +defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>; +defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>; +// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) +defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>; +// VMLS : Vector Multiply Subtract (integer and floating-point) +defm VMLS : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>; +def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>; +def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>; +// VMLSL : Vector Multiply Subtract Long (Q -= D * D) +defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>; +defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>; +// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) +defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>; + +// Vector Subtract Operations. + +// VSUB : Vector Subtract (integer and floating-point) +defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, "vsub.i", sub, 0>; +def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub, 0>; +def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, "vsub.f32", v4f32, v4f32, fsub, 0>; +// VSUBL : Vector Subtract Long (Q = D - D) +defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, "vsubl.s", int_arm_neon_vsubls, 1>; +defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, "vsubl.u", int_arm_neon_vsublu, 1>; +// VSUBW : Vector Subtract Wide (Q = Q - D) +defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw.s", int_arm_neon_vsubws, 0>; +defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw.u", int_arm_neon_vsubwu, 0>; +// VHSUB : Vector Halving Subtract +defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, "vhsub.s", int_arm_neon_vhsubs, 0>; +defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, "vhsub.u", int_arm_neon_vhsubu, 0>; +// VQSUB : Vector Saturating Subtract +defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, "vqsub.s", int_arm_neon_vqsubs, 0>; +defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, "vqsub.u", int_arm_neon_vqsubu, 0>; +// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) +defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn.i", int_arm_neon_vsubhn, 0>; +// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) +defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn.i", int_arm_neon_vrsubhn, 0>; + +// Vector Comparisons. 
+ +// VCEQ : Vector Compare Equal +defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, "vceq.i", NEONvceq, 1>; +def VCEQfd : N3VD<0,0,0b00,0b1110,0, "vceq.f32", v2i32, v2f32, NEONvceq, 1>; +def VCEQfq : N3VQ<0,0,0b00,0b1110,0, "vceq.f32", v4i32, v4f32, NEONvceq, 1>; +// VCGE : Vector Compare Greater Than or Equal +defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, "vcge.s", NEONvcge, 0>; +defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, "vcge.u", NEONvcgeu, 0>; +def VCGEfd : N3VD<1,0,0b00,0b1110,0, "vcge.f32", v2i32, v2f32, NEONvcge, 0>; +def VCGEfq : N3VQ<1,0,0b00,0b1110,0, "vcge.f32", v4i32, v4f32, NEONvcge, 0>; +// VCGT : Vector Compare Greater Than +defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, "vcgt.s", NEONvcgt, 0>; +defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, "vcgt.u", NEONvcgtu, 0>; +def VCGTfd : N3VD<1,0,0b10,0b1110,0, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>; +def VCGTfq : N3VQ<1,0,0b10,0b1110,0, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>; +// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) +def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v2i32, v2f32, + int_arm_neon_vacged, 0>; +def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v4i32, v4f32, + int_arm_neon_vacgeq, 0>; +// VACGT : Vector Absolute Compare Greater Than (aka VCAGT) +def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v2i32, v2f32, + int_arm_neon_vacgtd, 0>; +def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v4i32, v4f32, + int_arm_neon_vacgtq, 0>; +// VTST : Vector Test Bits +defm VTST : N3V_QHS<0, 0, 0b1000, 1, "vtst.i", NEONvtst, 1>; + +// Vector Bitwise Operations. + +// VAND : Vector Bitwise AND +def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, "vand", v2i32, v2i32, and, 1>; +def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, "vand", v4i32, v4i32, and, 1>; + +// VEOR : Vector Bitwise Exclusive OR +def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, "veor", v2i32, v2i32, xor, 1>; +def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, "veor", v4i32, v4i32, xor, 1>; + +// VORR : Vector Bitwise OR +def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, "vorr", v2i32, v2i32, or, 1>; +def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, "vorr", v4i32, v4i32, or, 1>; + +// VBIC : Vector Bitwise Bit Clear (AND NOT) +def VBICd : N3V<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2), "vbic\t$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (and DPR:$src1,(vnot DPR:$src2))))]>; +def VBICq : N3V<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2), "vbic\t$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (and QPR:$src1,(vnot QPR:$src2))))]>; + +// VORN : Vector Bitwise OR NOT +def VORNd : N3V<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2), "vorn\t$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (or DPR:$src1, (vnot DPR:$src2))))]>; +def VORNq : N3V<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2), "vorn\t$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (or QPR:$src1, (vnot QPR:$src2))))]>; + +// VMVN : Vector Bitwise NOT +def VMVNd : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, + (outs DPR:$dst), (ins DPR:$src), "vmvn\t$dst, $src", "", + [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>; +def VMVNq : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, + (outs QPR:$dst), (ins QPR:$src), "vmvn\t$dst, $src", "", + [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>; +def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>; + +// VBSL : Vector Bitwise Select +def VBSLd : N3V<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), + (ins DPR:$src1, DPR:$src2, DPR:$src3), + "vbsl\t$dst, $src2, $src3", 
"$src1 = $dst", + [(set DPR:$dst, + (v2i32 (or (and DPR:$src2, DPR:$src1), + (and DPR:$src3, (vnot DPR:$src1)))))]>; +def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), + (ins QPR:$src1, QPR:$src2, QPR:$src3), + "vbsl\t$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, + (v4i32 (or (and QPR:$src2, QPR:$src1), + (and QPR:$src3, (vnot QPR:$src1)))))]>; + +// VBIF : Vector Bitwise Insert if False +// like VBSL but with: "vbif\t$dst, $src3, $src1", "$src2 = $dst", +// VBIT : Vector Bitwise Insert if True +// like VBSL but with: "vbit\t$dst, $src2, $src1", "$src3 = $dst", +// These are not yet implemented. The TwoAddress pass will not go looking +// for equivalent operations with different register constraints; it just +// inserts copies. + +// Vector Absolute Differences. + +// VABD : Vector Absolute Difference +defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>; +defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>; +def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32, + int_arm_neon_vabdf, 0>; +def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32, + int_arm_neon_vabdf, 0>; + +// VABDL : Vector Absolute Difference Long (Q = | D - D |) +defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>; +defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, "vabdl.u", int_arm_neon_vabdlu, 0>; + +// VABA : Vector Absolute Difference and Accumulate +defm VABAs : N3VInt3_QHS<0,1,0b0101,0, "vaba.s", int_arm_neon_vabas>; +defm VABAu : N3VInt3_QHS<1,1,0b0101,0, "vaba.u", int_arm_neon_vabau>; + +// VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |) +defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, "vabal.s", int_arm_neon_vabals>; +defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal.u", int_arm_neon_vabalu>; + +// Vector Maximum and Minimum. + +// VMAX : Vector Maximum +defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>; +defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>; +def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32, + int_arm_neon_vmaxf, 1>; +def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32, + int_arm_neon_vmaxf, 1>; + +// VMIN : Vector Minimum +defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>; +defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>; +def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32, + int_arm_neon_vminf, 1>; +def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32, + int_arm_neon_vminf, 1>; + +// Vector Pairwise Operations. 
+ +// VPADD : Vector Pairwise Add +def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, "vpadd.i8", v8i8, v8i8, + int_arm_neon_vpaddi, 0>; +def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, "vpadd.i16", v4i16, v4i16, + int_arm_neon_vpaddi, 0>; +def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, "vpadd.i32", v2i32, v2i32, + int_arm_neon_vpaddi, 0>; +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, "vpadd.f32", v2f32, v2f32, + int_arm_neon_vpaddf, 0>; + +// VPADDL : Vector Pairwise Add Long +defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl.s", + int_arm_neon_vpaddls>; +defm VPADDLu : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl.u", + int_arm_neon_vpaddlu>; + +// VPADAL : Vector Pairwise Add and Accumulate Long +defm VPADALs : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpadal.s", + int_arm_neon_vpadals>; +defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpadal.u", + int_arm_neon_vpadalu>; + +// VPMAX : Vector Pairwise Maximum +def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, "vpmax.s8", v8i8, v8i8, + int_arm_neon_vpmaxs, 0>; +def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, "vpmax.s16", v4i16, v4i16, + int_arm_neon_vpmaxs, 0>; +def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, "vpmax.s32", v2i32, v2i32, + int_arm_neon_vpmaxs, 0>; +def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, "vpmax.u8", v8i8, v8i8, + int_arm_neon_vpmaxu, 0>; +def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, "vpmax.u16", v4i16, v4i16, + int_arm_neon_vpmaxu, 0>; +def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32, + int_arm_neon_vpmaxu, 0>; +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32, + int_arm_neon_vpmaxf, 0>; + +// VPMIN : Vector Pairwise Minimum +def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8, + int_arm_neon_vpmins, 0>; +def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, "vpmin.s16", v4i16, v4i16, + int_arm_neon_vpmins, 0>; +def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, "vpmin.s32", v2i32, v2i32, + int_arm_neon_vpmins, 0>; +def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, "vpmin.u8", v8i8, v8i8, + int_arm_neon_vpminu, 0>; +def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, "vpmin.u16", v4i16, v4i16, + int_arm_neon_vpminu, 0>; +def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32, + int_arm_neon_vpminu, 0>; +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32, + int_arm_neon_vpminf, 0>; + +// Vector Reciprocal and Reciprocal Square Root Estimate and Step. 
+ +// VRECPE : Vector Reciprocal Estimate +def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32", + v2i32, v2i32, int_arm_neon_vrecpe>; +def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32", + v4i32, v4i32, int_arm_neon_vrecpe>; +def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32", + v2f32, v2f32, int_arm_neon_vrecpef>; +def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32", + v4f32, v4f32, int_arm_neon_vrecpef>; + +// VRECPS : Vector Reciprocal Step +def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32, + int_arm_neon_vrecps, 1>; +def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v4f32, v4f32, + int_arm_neon_vrecps, 1>; + +// VRSQRTE : Vector Reciprocal Square Root Estimate +def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32", + v2i32, v2i32, int_arm_neon_vrsqrte>; +def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32", + v4i32, v4i32, int_arm_neon_vrsqrte>; +def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32", + v2f32, v2f32, int_arm_neon_vrsqrtef>; +def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32", + v4f32, v4f32, int_arm_neon_vrsqrtef>; + +// VRSQRTS : Vector Reciprocal Square Root Step +def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32, + int_arm_neon_vrsqrts, 1>; +def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v4f32, v4f32, + int_arm_neon_vrsqrts, 1>; + +// Vector Shifts. + +// VSHL : Vector Shift +defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, "vshl.s", int_arm_neon_vshifts, 0>; +defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, "vshl.u", int_arm_neon_vshiftu, 0>; +// VSHL : Vector Shift Left (Immediate) +defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, "vshl.i", NEONvshl>; +// VSHR : Vector Shift Right (Immediate) +defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, "vshr.s", NEONvshrs>; +defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, "vshr.u", NEONvshru>; + +// VSHLL : Vector Shift Left Long +def VSHLLs8 : N2VLSh<0, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.s8", + v8i16, v8i8, NEONvshlls>; +def VSHLLs16 : N2VLSh<0, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.s16", + v4i32, v4i16, NEONvshlls>; +def VSHLLs32 : N2VLSh<0, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.s32", + v2i64, v2i32, NEONvshlls>; +def VSHLLu8 : N2VLSh<1, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.u8", + v8i16, v8i8, NEONvshllu>; +def VSHLLu16 : N2VLSh<1, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.u16", + v4i32, v4i16, NEONvshllu>; +def VSHLLu32 : N2VLSh<1, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.u32", + v2i64, v2i32, NEONvshllu>; + +// VSHLL : Vector Shift Left Long (with maximum shift count) +def VSHLLi8 : N2VLSh<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll.i8", + v8i16, v8i8, NEONvshlli>; +def VSHLLi16 : N2VLSh<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll.i16", + v4i32, v4i16, NEONvshlli>; +def VSHLLi32 : N2VLSh<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32", + v2i64, v2i32, NEONvshlli>; + +// VSHRN : Vector Shift Right and Narrow +def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1, "vshrn.i16", + v8i8, v8i16, NEONvshrn>; +def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1, "vshrn.i32", + v4i16, v4i32, NEONvshrn>; +def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1, "vshrn.i64", + v2i32, v2i64, NEONvshrn>; + +// VRSHL : Vector Rounding Shift +defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, "vrshl.s", int_arm_neon_vrshifts, 0>; +defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, "vrshl.u", int_arm_neon_vrshiftu, 0>; +// VRSHR : Vector Rounding Shift Right +defm VRSHRs : N2VSh_QHSD<0, 
1, 0b0010, 1, "vrshr.s", NEONvrshrs>; +defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, "vrshr.u", NEONvrshru>; + +// VRSHRN : Vector Rounding Shift Right and Narrow +def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1, "vrshrn.i16", + v8i8, v8i16, NEONvrshrn>; +def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1, "vrshrn.i32", + v4i16, v4i32, NEONvrshrn>; +def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1, "vrshrn.i64", + v2i32, v2i64, NEONvrshrn>; + +// VQSHL : Vector Saturating Shift +defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, "vqshl.s", int_arm_neon_vqshifts, 0>; +defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, "vqshl.u", int_arm_neon_vqshiftu, 0>; +// VQSHL : Vector Saturating Shift Left (Immediate) +defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, "vqshl.s", NEONvqshls>; +defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, "vqshl.u", NEONvqshlu>; +// VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) +defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, "vqshlu.s", NEONvqshlsu>; + +// VQSHRN : Vector Saturating Shift Right and Narrow +def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.s16", + v8i8, v8i16, NEONvqshrns>; +def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.s32", + v4i16, v4i32, NEONvqshrns>; +def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.s64", + v2i32, v2i64, NEONvqshrns>; +def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.u16", + v8i8, v8i16, NEONvqshrnu>; +def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.u32", + v4i16, v4i32, NEONvqshrnu>; +def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.u64", + v2i32, v2i64, NEONvqshrnu>; + +// VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned) +def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1, "vqshrun.s16", + v8i8, v8i16, NEONvqshrnsu>; +def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1, "vqshrun.s32", + v4i16, v4i32, NEONvqshrnsu>; +def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1, "vqshrun.s64", + v2i32, v2i64, NEONvqshrnsu>; + +// VQRSHL : Vector Saturating Rounding Shift +defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, "vqrshl.s", + int_arm_neon_vqrshifts, 0>; +defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, "vqrshl.u", + int_arm_neon_vqrshiftu, 0>; + +// VQRSHRN : Vector Saturating Rounding Shift Right and Narrow +def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.s16", + v8i8, v8i16, NEONvqrshrns>; +def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.s32", + v4i16, v4i32, NEONvqrshrns>; +def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.s64", + v2i32, v2i64, NEONvqrshrns>; +def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.u16", + v8i8, v8i16, NEONvqrshrnu>; +def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.u32", + v4i16, v4i32, NEONvqrshrnu>; +def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.u64", + v2i32, v2i64, NEONvqrshrnu>; + +// VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned) +def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1, "vqrshrun.s16", + v8i8, v8i16, NEONvqrshrnsu>; +def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1, "vqrshrun.s32", + v4i16, v4i32, NEONvqrshrnsu>; +def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1, "vqrshrun.s64", + v2i32, v2i64, NEONvqrshrnsu>; + +// VSRA : Vector Shift Right and Accumulate +defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra.s", NEONvshrs>; +defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra.u", NEONvshru>; +// VRSRA : Vector Rounding Shift Right and Accumulate +defm VRSRAs 
: N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra.s", NEONvrshrs>; +defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra.u", NEONvrshru>; + +// VSLI : Vector Shift Left and Insert +defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli.", NEONvsli>; +// VSRI : Vector Shift Right and Insert +defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri.", NEONvsri>; + +// Vector Absolute and Saturating Absolute. + +// VABS : Vector Absolute Value +defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s", + int_arm_neon_vabs>; +def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32", + v2f32, v2f32, int_arm_neon_vabsf>; +def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32", + v4f32, v4f32, int_arm_neon_vabsf>; + +// VQABS : Vector Saturating Absolute Value +defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s", + int_arm_neon_vqabs>; + +// Vector Negate. + +def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; +def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>; + +class VNEGD<bits<2> size, string OpcodeStr, ValueType Ty> + : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src), + !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set DPR:$dst, (Ty (vneg DPR:$src)))]>; +class VNEGQ<bits<2> size, string OpcodeStr, ValueType Ty> + : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src), + !strconcat(OpcodeStr, "\t$dst, $src"), "", + [(set QPR:$dst, (Ty (vneg QPR:$src)))]>; + +// VNEG : Vector Negate +def VNEGs8d : VNEGD<0b00, "vneg.s8", v8i8>; +def VNEGs16d : VNEGD<0b01, "vneg.s16", v4i16>; +def VNEGs32d : VNEGD<0b10, "vneg.s32", v2i32>; +def VNEGs8q : VNEGQ<0b00, "vneg.s8", v16i8>; +def VNEGs16q : VNEGQ<0b01, "vneg.s16", v8i16>; +def VNEGs32q : VNEGQ<0b10, "vneg.s32", v4i32>; + +// VNEG : Vector Negate (floating-point) +def VNEGf32d : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0, + (outs DPR:$dst), (ins DPR:$src), "vneg.f32\t$dst, $src", "", + [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>; +def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, + (outs QPR:$dst), (ins QPR:$src), "vneg.f32\t$dst, $src", "", + [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; + +def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vneg_conv DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vneg_conv QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>; + +// VQNEG : Vector Saturating Negate +defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, "vqneg.s", + int_arm_neon_vqneg>; + +// Vector Bit Counting Operations. + +// VCLS : Vector Count Leading Sign Bits +defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, "vcls.s", + int_arm_neon_vcls>; +// VCLZ : Vector Count Leading Zeros +defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, "vclz.i", + int_arm_neon_vclz>; +// VCNT : Vector Count One Bits +def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8", + v8i8, v8i8, int_arm_neon_vcnt>; +def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8", + v16i8, v16i8, int_arm_neon_vcnt>; + +// Vector Move Operations. 
+ +// VMOV : Vector Move (Register) + +def VMOVD : N3V<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src), + "vmov\t$dst, $src", "", []>; +def VMOVQ : N3V<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src), + "vmov\t$dst, $src", "", []>; + +// VMOV : Vector Move (Immediate) + +// VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm. +def VMOV_get_imm8 : SDNodeXForm<build_vector, [{ + return ARM::getVMOVImm(N, 1, *CurDAG); +}]>; +def vmovImm8 : PatLeaf<(build_vector), [{ + return ARM::getVMOVImm(N, 1, *CurDAG).getNode() != 0; +}], VMOV_get_imm8>; + +// VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm. +def VMOV_get_imm16 : SDNodeXForm<build_vector, [{ + return ARM::getVMOVImm(N, 2, *CurDAG); +}]>; +def vmovImm16 : PatLeaf<(build_vector), [{ + return ARM::getVMOVImm(N, 2, *CurDAG).getNode() != 0; +}], VMOV_get_imm16>; + +// VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm. +def VMOV_get_imm32 : SDNodeXForm<build_vector, [{ + return ARM::getVMOVImm(N, 4, *CurDAG); +}]>; +def vmovImm32 : PatLeaf<(build_vector), [{ + return ARM::getVMOVImm(N, 4, *CurDAG).getNode() != 0; +}], VMOV_get_imm32>; + +// VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm. +def VMOV_get_imm64 : SDNodeXForm<build_vector, [{ + return ARM::getVMOVImm(N, 8, *CurDAG); +}]>; +def vmovImm64 : PatLeaf<(build_vector), [{ + return ARM::getVMOVImm(N, 8, *CurDAG).getNode() != 0; +}], VMOV_get_imm64>; + +// Note: Some of the cmode bits in the following VMOV instructions need to +// be encoded based on the immed values. + +def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst), + (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "", + [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>; +def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst), + (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "", + [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>; + +def VMOVv4i16 : N1ModImm<1, 0b000, 0b1000, 0, 0, 0, 1, (outs DPR:$dst), + (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "", + [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>; +def VMOVv8i16 : N1ModImm<1, 0b000, 0b1000, 0, 1, 0, 1, (outs QPR:$dst), + (ins i16imm:$SIMM), "vmov.i16\t$dst, $SIMM", "", + [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>; + +def VMOVv2i32 : N1ModImm<1, 0b000, 0b0000, 0, 0, 0, 1, (outs DPR:$dst), + (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "", + [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>; +def VMOVv4i32 : N1ModImm<1, 0b000, 0b0000, 0, 1, 0, 1, (outs QPR:$dst), + (ins i32imm:$SIMM), "vmov.i32\t$dst, $SIMM", "", + [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>; + +def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst), + (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "", + [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>; +def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst), + (ins i64imm:$SIMM), "vmov.i64\t$dst, $SIMM", "", + [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>; + +// VMOV : Vector Get Lane (move scalar to ARM core register) + +def VGETLNs8 : NVGetLane<0b11100101, 0b1011, 0b00, + (outs GPR:$dst), (ins DPR:$src, i32imm:$lane), + "vmov", ".s8\t$dst, $src[$lane]", + [(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src), + imm:$lane))]>; +def VGETLNs16 : NVGetLane<0b11100001, 0b1011, 0b01, + (outs GPR:$dst), (ins DPR:$src, i32imm:$lane), + "vmov", ".s16\t$dst, $src[$lane]", + [(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src), + imm:$lane))]>; +def VGETLNu8 : NVGetLane<0b11101101, 0b1011, 0b00, + (outs GPR:$dst), (ins DPR:$src, i32imm:$lane), + "vmov", ".u8\t$dst, 
$src[$lane]", + [(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src), + imm:$lane))]>; +def VGETLNu16 : NVGetLane<0b11101001, 0b1011, 0b01, + (outs GPR:$dst), (ins DPR:$src, i32imm:$lane), + "vmov", ".u16\t$dst, $src[$lane]", + [(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src), + imm:$lane))]>; +def VGETLNi32 : NVGetLane<0b11100001, 0b1011, 0b00, + (outs GPR:$dst), (ins DPR:$src, i32imm:$lane), + "vmov", ".32\t$dst, $src[$lane]", + [(set GPR:$dst, (extractelt (v2i32 DPR:$src), + imm:$lane))]>; +// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td +def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane), + (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src, + (SubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane))>; +def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane), + (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src, + (SubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane))>; +def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane), + (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src, + (SubReg_i8_reg imm:$lane))), + (SubReg_i8_lane imm:$lane))>; +def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane), + (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src, + (SubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane))>; +def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane), + (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src, + (SubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane))>; +//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2), +// (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>; +def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2), + (EXTRACT_SUBREG QPR:$src1, (SubReg_f64_reg imm:$src2))>; + + +// VMOV : Vector Set Lane (move ARM core register to scalar) + +let Constraints = "$src1 = $dst" in { +def VSETLNi8 : NVSetLane<0b11100100, 0b1011, 0b00, (outs DPR:$dst), + (ins DPR:$src1, GPR:$src2, i32imm:$lane), + "vmov", ".8\t$dst[$lane], $src2", + [(set DPR:$dst, (vector_insert (v8i8 DPR:$src1), + GPR:$src2, imm:$lane))]>; +def VSETLNi16 : NVSetLane<0b11100000, 0b1011, 0b01, (outs DPR:$dst), + (ins DPR:$src1, GPR:$src2, i32imm:$lane), + "vmov", ".16\t$dst[$lane], $src2", + [(set DPR:$dst, (vector_insert (v4i16 DPR:$src1), + GPR:$src2, imm:$lane))]>; +def VSETLNi32 : NVSetLane<0b11100000, 0b1011, 0b00, (outs DPR:$dst), + (ins DPR:$src1, GPR:$src2, i32imm:$lane), + "vmov", ".32\t$dst[$lane], $src2", + [(set DPR:$dst, (insertelt (v2i32 DPR:$src1), + GPR:$src2, imm:$lane))]>; +} +def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane), + (v16i8 (INSERT_SUBREG QPR:$src1, + (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1, + (SubReg_i8_reg imm:$lane))), + GPR:$src2, (SubReg_i8_lane imm:$lane)), + (SubReg_i8_reg imm:$lane)))>; +def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane), + (v8i16 (INSERT_SUBREG QPR:$src1, + (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, + (SubReg_i16_reg imm:$lane))), + GPR:$src2, (SubReg_i16_lane imm:$lane)), + (SubReg_i16_reg imm:$lane)))>; +def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane), + (v4i32 (INSERT_SUBREG QPR:$src1, + (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1, + (SubReg_i32_reg imm:$lane))), + GPR:$src2, (SubReg_i32_lane imm:$lane)), + (SubReg_i32_reg imm:$lane)))>; + +//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), +// (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>; +def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), + (INSERT_SUBREG QPR:$src1, DPR:$src2, (SubReg_f64_reg imm:$src3))>; + +// VDUP : Vector Duplicate (from ARM core register to all elements) + +def splat_lo : PatFrag<(ops node:$lhs, 
node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return SVOp->isSplat() && SVOp->getSplatIndex() == 0; +}]>; + +class VDUPD<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty> + : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src), + "vdup", !strconcat(asmSize, "\t$dst, $src"), + [(set DPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>; +class VDUPQ<bits<8> opcod1, bits<2> opcod3, string asmSize, ValueType Ty> + : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src), + "vdup", !strconcat(asmSize, "\t$dst, $src"), + [(set QPR:$dst, (Ty (splat_lo (scalar_to_vector GPR:$src), undef)))]>; + +def VDUP8d : VDUPD<0b11101100, 0b00, ".8", v8i8>; +def VDUP16d : VDUPD<0b11101000, 0b01, ".16", v4i16>; +def VDUP32d : VDUPD<0b11101000, 0b00, ".32", v2i32>; +def VDUP8q : VDUPQ<0b11101110, 0b00, ".8", v16i8>; +def VDUP16q : VDUPQ<0b11101010, 0b01, ".16", v8i16>; +def VDUP32q : VDUPQ<0b11101010, 0b00, ".32", v4i32>; + +def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src), + "vdup", ".32\t$dst, $src", + [(set DPR:$dst, (v2f32 (splat_lo + (scalar_to_vector + (f32 (bitconvert GPR:$src))), + undef)))]>; +def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src), + "vdup", ".32\t$dst, $src", + [(set QPR:$dst, (v4f32 (splat_lo + (scalar_to_vector + (f32 (bitconvert GPR:$src))), + undef)))]>; + +// VDUP : Vector Duplicate Lane (from scalar to all elements) + +def SHUFFLE_get_splat_lane : SDNodeXForm<vector_shuffle, [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return CurDAG->getTargetConstant(SVOp->getSplatIndex(), MVT::i32); +}]>; + +def splat_lane : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return SVOp->isSplat(); +}], SHUFFLE_get_splat_lane>; + +class VDUPLND<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, ValueType Ty> + : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0, + (outs DPR:$dst), (ins DPR:$src, i32imm:$lane), + !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "", + [(set DPR:$dst, (Ty (splat_lane:$lane DPR:$src, undef)))]>; + +// vector_shuffle requires that the source and destination types match, so +// VDUP to a 128-bit result uses a target-specific VDUPLANEQ node. 
+class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, + ValueType ResTy, ValueType OpTy> + : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0, + (outs QPR:$dst), (ins DPR:$src, i32imm:$lane), + !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "", + [(set QPR:$dst, (ResTy (NEONvduplaneq (OpTy DPR:$src), imm:$lane)))]>; + +def VDUPLN8d : VDUPLND<0b00, 0b01, "vdup.8", v8i8>; +def VDUPLN16d : VDUPLND<0b00, 0b10, "vdup.16", v4i16>; +def VDUPLN32d : VDUPLND<0b01, 0b00, "vdup.32", v2i32>; +def VDUPLNfd : VDUPLND<0b01, 0b00, "vdup.32", v2f32>; +def VDUPLN8q : VDUPLNQ<0b00, 0b01, "vdup.8", v16i8, v8i8>; +def VDUPLN16q : VDUPLNQ<0b00, 0b10, "vdup.16", v8i16, v4i16>; +def VDUPLN32q : VDUPLNQ<0b01, 0b00, "vdup.32", v4i32, v2i32>; +def VDUPLNfq : VDUPLNQ<0b01, 0b00, "vdup.32", v4f32, v2f32>; + +// VMOVN : Vector Narrowing Move +defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, "vmovn.i", + int_arm_neon_vmovn>; +// VQMOVN : Vector Saturating Narrowing Move +defm VQMOVNs : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, "vqmovn.s", + int_arm_neon_vqmovns>; +defm VQMOVNu : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, "vqmovn.u", + int_arm_neon_vqmovnu>; +defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, "vqmovun.s", + int_arm_neon_vqmovnsu>; +// VMOVL : Vector Lengthening Move +defm VMOVLs : N2VLInt_QHS<0,1,0b1010,0,0,1, "vmovl.s", int_arm_neon_vmovls>; +defm VMOVLu : N2VLInt_QHS<1,1,0b1010,0,0,1, "vmovl.u", int_arm_neon_vmovlu>; + +// Vector Conversions. + +// VCVT : Vector Convert Between Floating-Point and Integers +def VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt.s32.f32", + v2i32, v2f32, fp_to_sint>; +def VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt.u32.f32", + v2i32, v2f32, fp_to_uint>; +def VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt.f32.s32", + v2f32, v2i32, sint_to_fp>; +def VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32", + v2f32, v2i32, uint_to_fp>; + +def VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt.s32.f32", + v4i32, v4f32, fp_to_sint>; +def VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt.u32.f32", + v4i32, v4f32, fp_to_uint>; +def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt.f32.s32", + v4f32, v4i32, sint_to_fp>; +def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32", + v4f32, v4i32, uint_to_fp>; + +// VCVT : Vector Convert Between Floating-Point and Fixed-Point. +// Note: Some of the opcode bits in the following VCVT instructions need to +// be encoded based on the immed values. 
+def VCVTf2xsd : N2VCvtD<0, 1, 0b000000, 0b1111, 0, 1, "vcvt.s32.f32", + v2i32, v2f32, int_arm_neon_vcvtfp2fxs>; +def VCVTf2xud : N2VCvtD<1, 1, 0b000000, 0b1111, 0, 1, "vcvt.u32.f32", + v2i32, v2f32, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2fd : N2VCvtD<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32", + v2f32, v2i32, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2fd : N2VCvtD<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32", + v2f32, v2i32, int_arm_neon_vcvtfxu2fp>; + +def VCVTf2xsq : N2VCvtQ<0, 1, 0b000000, 0b1111, 0, 1, "vcvt.s32.f32", + v4i32, v4f32, int_arm_neon_vcvtfp2fxs>; +def VCVTf2xuq : N2VCvtQ<1, 1, 0b000000, 0b1111, 0, 1, "vcvt.u32.f32", + v4i32, v4f32, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32", + v4f32, v4i32, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32", + v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// bit_convert +def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; +def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; + +def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 
QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index e7e6cf577c..a057e5cabf 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -77,6 +77,34 @@ def D13 : ARMReg<13, "d13", [S26, S27]>;
 def D14 : ARMReg<14, "d14", [S28, S29]>;
 def D15 : ARMReg<15, "d15", [S30, S31]>;
+// VFP3 defines 16 additional double registers
+def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
+def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
+def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
+def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
+def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
+def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
+def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
+def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
+
+// Advanced SIMD (NEON) defines 16 quad-word aliases
+def Q0  : ARMReg< 0,  "q0", [D0,  D1]>;
+def Q1  : ARMReg< 1,  "q1", [D2,  D3]>;
+def Q2  : ARMReg< 2,  "q2", [D4,  D5]>;
+def Q3  : ARMReg< 3,  "q3", [D6,  D7]>;
+def Q4  : ARMReg< 4,  "q4", [D8,  D9]>;
+def Q5  : ARMReg< 5,  "q5", [D10, D11]>;
+def Q6  : ARMReg< 6,  "q6", [D12, D13]>;
+def Q7  : ARMReg< 7,  "q7", [D14, D15]>;
+def Q8  : ARMReg< 8,  "q8", [D16, D17]>;
+def Q9  : ARMReg< 9,  "q9", [D18, D19]>;
+def Q10 : ARMReg<10, "q10", [D20, D21]>;
+def Q11 : ARMReg<11, "q11", [D22, D23]>;
+def Q12 : ARMReg<12, "q12", [D24, D25]>;
+def Q13 : ARMReg<13, "q13", [D26, D27]>;
+def Q14 : ARMReg<14, "q14", [D28, D29]>;
+def Q15 : ARMReg<15, "q15", [D30, D31]>;
+
 // Current Program Status Register.
 def CPSR : ARMReg<0, "cpsr">;
@@ -207,14 +235,67 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
   }];
 }
+// Scalar single precision floating point register class.
 def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
                                            S9, S10, S11, S12, S13, S14, S15,
                                            S16, S17, S18, S19, S20, S21, S22,
                                            S23, S24, S25, S26, S27, S28, S29,
                                            S30, S31]>;
+// Scalar double precision floating point / generic 64-bit vector register
+// class.
 // ARM requires only word alignment for double. It's more performant if it
 // is double-word alignment though.
-def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8,
-                                           D9, D10, D11, D12, D13, D14, D15]>;
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                        [D0, D1, D2, D3, D4, D5, D6, D7,
+                         D8, D9, D10, D11, D12, D13, D14, D15]> {
+  let SubRegClassList = [SPR, SPR];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // VFP2
+    static const unsigned ARM_DPR_VFP2[] = {
+      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
+      ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
+    // VFP3
+    static const unsigned ARM_DPR_VFP3[] = {
+      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
+      ARM::D12, ARM::D13, ARM::D14, ARM::D15,
+      ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+      ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+      ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+      ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
+    DPRClass::iterator
+    DPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      if (Subtarget.hasVFP3())
+        return ARM_DPR_VFP3;
+      return ARM_DPR_VFP2;
+    }
+
+    DPRClass::iterator
+    DPRClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      if (Subtarget.hasVFP3())
+        return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned));
+      else
+        return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned));
+    }
+  }];
+}
+
+// Generic 128-bit vector register class.
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+                        [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
+                         Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
+  let SubRegClassList = [SPR, SPR, SPR, SPR, DPR, DPR];
+}
 // Condition code registers.
 def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
@@ -224,12 +305,40 @@ def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
 // sub registers for each register.
 //
-def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
-                    D8, D9, D10, D11, D12, D13, D14, D15],
-                   [S0, S2, S4, S6, S8, S10, S12, S14,
-                    S16, S18, S20, S22, S24, S26, S28, S30]>;
+def arm_ssubreg_0 : PatLeaf<(i32 1)>;
+def arm_ssubreg_1 : PatLeaf<(i32 2)>;
+def arm_ssubreg_2 : PatLeaf<(i32 3)>;
+def arm_ssubreg_3 : PatLeaf<(i32 4)>;
+def arm_dsubreg_0 : PatLeaf<(i32 5)>;
+def arm_dsubreg_1 : PatLeaf<(i32 6)>;
 
-def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
-                    D8, D9, D10, D11, D12, D13, D14, D15],
-                   [S1, S3, S5, S7, S9, S11, S13, S15,
+// S sub-registers of D registers.
+def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7, + D8, D9, D10, D11, D12, D13, D14, D15], + [S0, S2, S4, S6, S8, S10, S12, S14, + S16, S18, S20, S22, S24, S26, S28, S30]>; +def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7, + D8, D9, D10, D11, D12, D13, D14, D15], + [S1, S3, S5, S7, S9, S11, S13, S15, S17, S19, S21, S23, S25, S27, S29, S31]>; + +// S sub-registers of Q registers. +def : SubRegSet<1, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7], + [S0, S4, S8, S12, S16, S20, S24, S28]>; +def : SubRegSet<2, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7], + [S1, S5, S9, S13, S17, S21, S25, S29]>; +def : SubRegSet<3, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7], + [S2, S6, S10, S14, S18, S22, S26, S30]>; +def : SubRegSet<4, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7], + [S3, S7, S11, S15, S19, S23, S27, S31]>; + +// D sub-registers of Q registers. +def : SubRegSet<5, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, + Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15], + [D0, D2, D4, D6, D8, D10, D12, D14, + D16, D18, D20, D22, D24, D26, D28, D30]>; +def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, + Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15], + [D1, D3, D5, D7, D9, D11, D13, D15, + D17, D19, D21, D23, D25, D27, D29, D31]>; + diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index 948a10070d..58ba50e7ed 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -285,12 +285,22 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int opNum, const char *Modifier) { const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { - case MachineOperand::MO_Register: - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - O << TM.getRegisterInfo()->get(MO.getReg()).AsmName; - else + case MachineOperand::MO_Register: { + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Modifier && strcmp(Modifier, "dregpair") == 0) { + unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0 + unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1 + O << '{' + << TRI->getAsmName(DRegLo) << "-" << TRI->getAsmName(DRegHi) + << '}'; + } else { + O << TRI->getAsmName(Reg); + } + } else assert(0 && "not implemented"); break; + } case MachineOperand::MO_Immediate: { if (!Modifier || strcmp(Modifier, "no_hash") != 0) O << "#"; diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 068c441ed7..0252a4aef4 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -552,3 +552,23 @@ __Z11no_overflowjj: //===---------------------------------------------------------------------===// +Some of the NEON intrinsics may be appropriate for more general use, either +as target-independent intrinsics or perhaps elsewhere in the ARM backend. +Some of them may also be lowered to target-independent SDNodes, and perhaps +some new SDNodes could be added. + +For example, maximum, minimum, and absolute value operations are well-defined +and standard operations, both for vector and scalar types. + +The current NEON-specific intrinsics for count leading zeros and count one +bits could perhaps be replaced by the target-independent ctlz and ctpop +intrinsics. It may also make sense to add a target-independent "ctls" +intrinsic for "count leading sign bits". Likewise, the backend could use +the target-independent SDNodes for these operations. + +ARMv6 has scalar saturating and halving adds and subtracts. 
The same +intrinsics could possibly be used for both NEON's vector implementations of +those operations and the ARMv6 scalar versions. + +//===---------------------------------------------------------------------===// + diff --git a/test/CodeGen/ARM/2009-06-02-ISelCrash.ll b/test/CodeGen/ARM/2009-06-02-ISelCrash.ll new file mode 100644 index 0000000000..7cd35b9557 --- /dev/null +++ b/test/CodeGen/ARM/2009-06-02-ISelCrash.ll @@ -0,0 +1,62 @@ +; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6,+vfp2 + +@"\01LC" = external constant [15 x i8] ; <[15 x i8]*> [#uses=1] + +declare i32 @printf(i8* nocapture, ...) nounwind + +define i32 @main() nounwind { +entry: + br label %bb.i1.i + +bb.i1.i: ; preds = %Cos.exit.i.i, %entry + br label %bb.i.i.i + +bb.i.i.i: ; preds = %bb.i.i.i, %bb.i1.i + br i1 undef, label %Cos.exit.i.i, label %bb.i.i.i + +Cos.exit.i.i: ; preds = %bb.i.i.i + br i1 undef, label %bb2.i.i, label %bb.i1.i + +bb2.i.i: ; preds = %Cos.exit.i.i + br label %bb3.i.i + +bb3.i.i: ; preds = %bb5.i.i, %bb2.i.i + br label %bb4.i.i + +bb4.i.i: ; preds = %bb4.i.i, %bb3.i.i + br i1 undef, label %bb5.i.i, label %bb4.i.i + +bb5.i.i: ; preds = %bb4.i.i + br i1 undef, label %bb.i, label %bb3.i.i + +bb.i: ; preds = %bb.i, %bb5.i.i + br i1 undef, label %bb1.outer2.i.i.outer, label %bb.i + +bb1.outer2.i.i.outer: ; preds = %Fft.exit.i, %bb5.i12.i, %bb.i + br label %bb1.outer2.i.i + +bb1.outer2.i.i: ; preds = %bb2.i9.i, %bb1.outer2.i.i.outer + br label %bb1.i.i + +bb1.i.i: ; preds = %bb1.i.i, %bb1.outer2.i.i + br i1 undef, label %bb2.i9.i, label %bb1.i.i + +bb2.i9.i: ; preds = %bb1.i.i + br i1 undef, label %bb4.i11.i, label %bb1.outer2.i.i + +bb4.i11.i: ; preds = %bb4.i11.i, %bb2.i9.i + br i1 undef, label %bb5.i12.i, label %bb4.i11.i + +bb5.i12.i: ; preds = %bb4.i11.i + br i1 undef, label %bb7.i.i, label %bb1.outer2.i.i.outer + +bb7.i.i: ; preds = %bb7.i.i, %bb5.i12.i + br i1 undef, label %Fft.exit.i, label %bb7.i.i + +Fft.exit.i: ; preds = %bb7.i.i + br i1 undef, label %bb5.i, label %bb1.outer2.i.i.outer + +bb5.i: ; preds = %Fft.exit.i + %0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([15 x i8]* @"\01LC", i32 0, i32 0), double undef, double undef) nounwind ; <i32> [#uses=0] + unreachable +} diff --git a/test/CodeGen/ARM/neon_arith1.ll b/test/CodeGen/ARM/neon_arith1.ll new file mode 100644 index 0000000000..18b516fc1a --- /dev/null +++ b/test/CodeGen/ARM/neon_arith1.ll @@ -0,0 +1,7 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vadd + +define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind { +entry: + %0 = add <8 x i8> %a, %b + ret <8 x i8> %0 +} diff --git a/test/CodeGen/ARM/neon_ld1.ll b/test/CodeGen/ARM/neon_ld1.ll new file mode 100644 index 0000000000..8901ba177d --- /dev/null +++ b/test/CodeGen/ARM/neon_ld1.ll @@ -0,0 +1,22 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fldd | count 4 +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fstd +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd + +define void @t1(<2 x i32>* %r, <4 x i16>* %a, <4 x i16>* %b) nounwind { +entry: + %0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1] + %1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1] + %2 = add <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1] + %3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1] + store <2 x i32> %3, <2 x i32>* %r, align 8 + ret void +} + +define <2 x i32> @t2(<4 x i16>* %a, <4 x i16>* %b) nounwind readonly { +entry: + %0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1] + %1 = 
load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1] + %2 = sub <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1] + %3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1] + ret <2 x i32> %3 +} diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll new file mode 100644 index 0000000000..a26904afca --- /dev/null +++ b/test/CodeGen/ARM/neon_ld2.ll @@ -0,0 +1,23 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vldmia | count 4 +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vstmia | count 1 +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd | count 2 + +define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind { +entry: + %0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1] + %1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1] + %2 = add <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1] + %3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1] + store <4 x i32> %3, <4 x i32>* %r, align 16 + ret void +} + +define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly { +entry: + %0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1] + %1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1] + %2 = sub <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1] + %3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1] + ret <4 x i32> %3 +} + diff --git a/test/CodeGen/ARM/vaba.ll b/test/CodeGen/ARM/vaba.ll new file mode 100644 index 0000000000..98ee1e155b --- /dev/null +++ b/test/CodeGen/ARM/vaba.ll @@ -0,0 +1,119 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vaba\\.s8} %t | count 2 +; RUN: grep {vaba\\.s16} %t | count 2 +; RUN: grep {vaba\\.s32} %t | count 2 +; RUN: grep {vaba\\.u8} %t | count 2 +; RUN: grep {vaba\\.u16} %t | count 2 +; RUN: grep {vaba\\.u32} %t | count 2 + +define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i8> %tmp4 +} + +define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i16> %tmp4 +} + +define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i32> %tmp4 +} + +define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i8> %tmp4 +} + +define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i16> %tmp4 +} + +define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x 
i32> %tmp4 +} + +define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = load <16 x i8>* %C + %tmp4 = call <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3) + ret <16 x i8> %tmp4 +} + +define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = load <8 x i16>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = load <4 x i32>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3) + ret <4 x i32> %tmp4 +} + +define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = load <16 x i8>* %C + %tmp4 = call <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3) + ret <16 x i8> %tmp4 +} + +define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = load <8 x i16>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = load <4 x i32>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3) + ret <4 x i32> %tmp4 +} + +declare <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vabal.ll b/test/CodeGen/ARM/vabal.ll new file mode 100644 index 0000000000..ed9460345b --- /dev/null +++ b/test/CodeGen/ARM/vabal.ll @@ -0,0 +1,63 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vabal\\.s8} %t | count 1 +; RUN: grep {vabal\\.s16} %t | count 1 +; RUN: grep {vabal\\.s32} %t | count 1 +; RUN: grep {vabal\\.u8} %t | count 1 +; RUN: grep {vabal\\.u16} %t | count 1 +; RUN: grep {vabal\\.u32} %t | count 1 + +define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + %tmp1 = load <8 
x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vabd.ll b/test/CodeGen/ARM/vabd.ll new file mode 100644 index 0000000000..0fe5ddb94b --- /dev/null +++ b/test/CodeGen/ARM/vabd.ll @@ -0,0 +1,126 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vabd\\.s8} %t | count 2 +; RUN: grep {vabd\\.s16} %t | count 2 +; RUN: grep {vabd\\.s32} %t | count 2 +; RUN: grep {vabd\\.u8} %t | count 2 +; RUN: grep {vabd\\.u16} %t | count 2 +; RUN: grep {vabd\\.u32} %t | count 2 +; RUN: grep {vabd\\.f32} %t | count 2 + +define <8 x i8> @vabds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vabds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vabds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vabdu8(<8 x i8>* %A, <8 x i8>* %B) 
nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vabdu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vabdu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <2 x float> @vabdf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +define <16 x i8> @vabdQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vabdQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vabdQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vabdQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vabdQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vabdQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <4 x float> @vabdQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x float> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float>, <2 x float>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind 
readnone +declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vabdl.ll b/test/CodeGen/ARM/vabdl.ll new file mode 100644 index 0000000000..b351d0f8d3 --- /dev/null +++ b/test/CodeGen/ARM/vabdl.ll @@ -0,0 +1,57 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vabdl\\.s8} %t | count 1 +; RUN: grep {vabdl\\.s16} %t | count 1 +; RUN: grep {vabdl\\.s32} %t | count 1 +; RUN: grep {vabdl\\.u8} %t | count 1 +; RUN: grep {vabdl\\.u16} %t | count 1 +; RUN: grep {vabdl\\.u32} %t | count 1 + +define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vabs.ll b/test/CodeGen/ARM/vabs.ll new file mode 100644 index 0000000000..629baa762a --- /dev/null +++ b/test/CodeGen/ARM/vabs.ll @@ -0,0 +1,64 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vabs\\.s8} %t | count 2 +; RUN: grep {vabs\\.s16} %t | count 2 +; RUN: grep {vabs\\.s32} %t | count 2 +; RUN: grep {vabs\\.f32} %t | count 2 + +define <8 x i8> @vabss8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vabss16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vabss32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> 
@llvm.arm.neon.vabs.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <2 x float> @vabsf32(<2 x float>* %A) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = call <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float> %tmp1) + ret <2 x float> %tmp2 +} + +define <16 x i8> @vabsQs8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vabsQs16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vabsQs32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +define <4 x float> @vabsQf32(<4 x float>* %A) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float> %tmp1) + ret <4 x float> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float>) nounwind readnone + diff --git a/test/CodeGen/ARM/vacge.ll b/test/CodeGen/ARM/vacge.ll new file mode 100644 index 0000000000..5703dd15f3 --- /dev/null +++ b/test/CodeGen/ARM/vacge.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vacge\\.f32} %t | count 2 + +define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x i32> %tmp3 +} + +define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vacgt.ll b/test/CodeGen/ARM/vacgt.ll new file mode 100644 index 0000000000..ebf590e98a --- /dev/null +++ b/test/CodeGen/ARM/vacgt.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vacgt\\.f32} %t | count 2 + +define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x i32> %tmp3 +} + +define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vadd.ll b/test/CodeGen/ARM/vadd.ll new file mode 
100644 index 0000000000..b2b0e2397c --- /dev/null +++ b/test/CodeGen/ARM/vadd.ll @@ -0,0 +1,76 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vadd\\.i8} %t | count 2 +; RUN: grep {vadd\\.i16} %t | count 2 +; RUN: grep {vadd\\.i32} %t | count 2 +; RUN: grep {vadd\\.i64} %t | count 2 +; RUN: grep {vadd\\.f32} %t | count 2 + +define <8 x i8> @vaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = add <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = add <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = add <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vaddi64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = add <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <2 x float> @vaddf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = add <2 x float> %tmp1, %tmp2 + ret <2 x float> %tmp3 +} + +define <16 x i8> @vaddQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = add <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vaddQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = add <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = add <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = add <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + +define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = add <4 x float> %tmp1, %tmp2 + ret <4 x float> %tmp3 +} diff --git a/test/CodeGen/ARM/vaddhn.ll b/test/CodeGen/ARM/vaddhn.ll new file mode 100644 index 0000000000..b2d1b142f3 --- /dev/null +++ b/test/CodeGen/ARM/vaddhn.ll @@ -0,0 +1,29 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vaddhn\\.i16} %t | count 1 +; RUN: grep {vaddhn\\.i32} %t | count 1 +; RUN: grep {vaddhn\\.i64} %t | count 1 + +define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> 
@llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vaddl.ll b/test/CodeGen/ARM/vaddl.ll new file mode 100644 index 0000000000..26ab17bbc0 --- /dev/null +++ b/test/CodeGen/ARM/vaddl.ll @@ -0,0 +1,57 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vaddl\\.s8} %t | count 1 +; RUN: grep {vaddl\\.s16} %t | count 1 +; RUN: grep {vaddl\\.s32} %t | count 1 +; RUN: grep {vaddl\\.u8} %t | count 1 +; RUN: grep {vaddl\\.u16} %t | count 1 +; RUN: grep {vaddl\\.u32} %t | count 1 + +define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vaddw.ll b/test/CodeGen/ARM/vaddw.ll new file mode 100644 index 0000000000..e06f94a8df --- /dev/null +++ b/test/CodeGen/ARM/vaddw.ll @@ -0,0 +1,57 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vaddw\\.s8} %t | count 1 +; RUN: grep {vaddw\\.s16} %t | count 1 +; RUN: grep {vaddw\\.s32} %t | count 1 +; RUN: grep {vaddw\\.u8} %t | count 1 +; RUN: grep {vaddw\\.u16} %t | count 1 +; RUN: grep {vaddw\\.u32} %t | count 1 + +define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + 
%tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vand.ll b/test/CodeGen/ARM/vand.ll new file mode 100644 index 0000000000..b3eaf1c5dc --- /dev/null +++ b/test/CodeGen/ARM/vand.ll @@ -0,0 +1,59 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vand %t | count 8 +; Note: function names do not include "vand" to allow simple grep for opcodes + +define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = and <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = and <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = and <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = and <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = and <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = and <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = and <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = and <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} diff --git a/test/CodeGen/ARM/vbic.ll b/test/CodeGen/ARM/vbic.ll new file mode 100644 index 0000000000..dbc11ea58c --- /dev/null +++ b/test/CodeGen/ARM/vbic.ll @@ -0,0 +1,67 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vbic %t | count 8 +; Note: function names do not include 
"vbic" to allow simple grep for opcodes + +define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp4 = and <8 x i8> %tmp1, %tmp3 + ret <8 x i8> %tmp4 +} + +define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp4 = and <4 x i16> %tmp1, %tmp3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 > + %tmp4 = and <2 x i32> %tmp1, %tmp3 + ret <2 x i32> %tmp4 +} + +define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = xor <1 x i64> %tmp2, < i64 -1 > + %tmp4 = and <1 x i64> %tmp1, %tmp3 + ret <1 x i64> %tmp4 +} + +define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp4 = and <16 x i8> %tmp1, %tmp3 + ret <16 x i8> %tmp4 +} + +define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp4 = and <8 x i16> %tmp1, %tmp3 + ret <8 x i16> %tmp4 +} + +define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 > + %tmp4 = and <4 x i32> %tmp1, %tmp3 + ret <4 x i32> %tmp4 +} + +define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 > + %tmp4 = and <2 x i64> %tmp1, %tmp3 + ret <2 x i64> %tmp4 +} diff --git a/test/CodeGen/ARM/vbsl.ll b/test/CodeGen/ARM/vbsl.ll new file mode 100644 index 0000000000..37ddf4de6d --- /dev/null +++ b/test/CodeGen/ARM/vbsl.ll @@ -0,0 +1,91 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vbsl %t | count 8 +; Note: function names do not include "vbsl" to allow simple grep for opcodes + +define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = and <8 x i8> %tmp1, %tmp2 + %tmp5 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp6 = and <8 x i8> %tmp5, %tmp3 + %tmp7 = or <8 x i8> %tmp4, %tmp6 + ret <8 x i8> %tmp7 +} + +define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = and <4 x i16> %tmp1, %tmp2 + %tmp5 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp6 = and <4 x i16> %tmp5, %tmp3 + %tmp7 = or <4 x i16> %tmp4, %tmp6 + ret <4 x i16> %tmp7 +} + +define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = and <2 x i32> %tmp1, %tmp2 + %tmp5 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 > + %tmp6 = and <2 x 
i32> %tmp5, %tmp3 + %tmp7 = or <2 x i32> %tmp4, %tmp6 + ret <2 x i32> %tmp7 +} + +define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = load <1 x i64>* %C + %tmp4 = and <1 x i64> %tmp1, %tmp2 + %tmp5 = xor <1 x i64> %tmp1, < i64 -1 > + %tmp6 = and <1 x i64> %tmp5, %tmp3 + %tmp7 = or <1 x i64> %tmp4, %tmp6 + ret <1 x i64> %tmp7 +} + +define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = load <16 x i8>* %C + %tmp4 = and <16 x i8> %tmp1, %tmp2 + %tmp5 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp6 = and <16 x i8> %tmp5, %tmp3 + %tmp7 = or <16 x i8> %tmp4, %tmp6 + ret <16 x i8> %tmp7 +} + +define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = load <8 x i16>* %C + %tmp4 = and <8 x i16> %tmp1, %tmp2 + %tmp5 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp6 = and <8 x i16> %tmp5, %tmp3 + %tmp7 = or <8 x i16> %tmp4, %tmp6 + ret <8 x i16> %tmp7 +} + +define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = load <4 x i32>* %C + %tmp4 = and <4 x i32> %tmp1, %tmp2 + %tmp5 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 > + %tmp6 = and <4 x i32> %tmp5, %tmp3 + %tmp7 = or <4 x i32> %tmp4, %tmp6 + ret <4 x i32> %tmp7 +} + +define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = load <2 x i64>* %C + %tmp4 = and <2 x i64> %tmp1, %tmp2 + %tmp5 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 > + %tmp6 = and <2 x i64> %tmp5, %tmp3 + %tmp7 = or <2 x i64> %tmp4, %tmp6 + ret <2 x i64> %tmp7 +} diff --git a/test/CodeGen/ARM/vceq.ll b/test/CodeGen/ARM/vceq.ll new file mode 100644 index 0000000000..77f1890d08 --- /dev/null +++ b/test/CodeGen/ARM/vceq.ll @@ -0,0 +1,61 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vceq\\.i8} %t | count 2 +; RUN: grep {vceq\\.i16} %t | count 2 +; RUN: grep {vceq\\.i32} %t | count 2 +; RUN: grep {vceq\\.f32} %t | count 2 + +define <8 x i8> @vceqi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = vicmp eq <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vceqi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = vicmp eq <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vceqi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = vicmp eq <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <2 x i32> @vceqf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = vfcmp oeq <2 x float> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <16 x i8> @vceqQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = vicmp eq <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vceqQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = vicmp eq <8 x i16> %tmp1, 
%tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vceqQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = vicmp eq <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <4 x i32> @vceqQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = vfcmp oeq <4 x float> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} diff --git a/test/CodeGen/ARM/vcge.ll b/test/CodeGen/ARM/vcge.ll new file mode 100644 index 0000000000..14c623ea08 --- /dev/null +++ b/test/CodeGen/ARM/vcge.ll @@ -0,0 +1,106 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vcge\\.s8} %t | count 2 +; RUN: grep {vcge\\.s16} %t | count 2 +; RUN: grep {vcge\\.s32} %t | count 2 +; RUN: grep {vcge\\.u8} %t | count 2 +; RUN: grep {vcge\\.u16} %t | count 2 +; RUN: grep {vcge\\.u32} %t | count 2 +; RUN: grep {vcge\\.f32} %t | count 2 + +define <8 x i8> @vcges8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = vicmp sge <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vcges16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = vicmp sge <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vcges32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = vicmp sge <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vcgeu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = vicmp uge <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vcgeu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = vicmp uge <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vcgeu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = vicmp uge <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <2 x i32> @vcgef32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = vfcmp oge <2 x float> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <16 x i8> @vcgeQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = vicmp sge <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vcgeQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = vicmp sge <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vcgeQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = vicmp sge <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vcgeQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = vicmp uge <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vcgeQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = vicmp uge <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vcgeQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = vicmp uge <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <4 x i32> @vcgeQf32(<4 x float>* %A, <4 x float>* %B) 
nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = vfcmp oge <4 x float> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} diff --git a/test/CodeGen/ARM/vcgt.ll b/test/CodeGen/ARM/vcgt.ll new file mode 100644 index 0000000000..3f7e550787 --- /dev/null +++ b/test/CodeGen/ARM/vcgt.ll @@ -0,0 +1,106 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vcgt\\.s8} %t | count 2 +; RUN: grep {vcgt\\.s16} %t | count 2 +; RUN: grep {vcgt\\.s32} %t | count 2 +; RUN: grep {vcgt\\.u8} %t | count 2 +; RUN: grep {vcgt\\.u16} %t | count 2 +; RUN: grep {vcgt\\.u32} %t | count 2 +; RUN: grep {vcgt\\.f32} %t | count 2 + +define <8 x i8> @vcgts8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = vicmp sgt <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vcgts16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = vicmp sgt <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vcgts32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = vicmp sgt <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vcgtu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = vicmp ugt <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vcgtu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = vicmp ugt <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vcgtu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = vicmp ugt <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <2 x i32> @vcgtf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = vfcmp ogt <2 x float> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <16 x i8> @vcgtQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = vicmp sgt <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vcgtQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = vicmp sgt <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vcgtQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = vicmp sgt <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vcgtQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = vicmp ugt <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vcgtQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = vicmp ugt <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vcgtQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = vicmp ugt <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <4 x i32> @vcgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = vfcmp ogt <4 x float> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} diff --git a/test/CodeGen/ARM/vcls.ll b/test/CodeGen/ARM/vcls.ll new file mode 100644 index 0000000000..69e4bdbe16 --- /dev/null +++ 
b/test/CodeGen/ARM/vcls.ll @@ -0,0 +1,48 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vcls\\.s8} %t | count 2 +; RUN: grep {vcls\\.s16} %t | count 2 +; RUN: grep {vcls\\.s32} %t | count 2 + +define <8 x i8> @vclss8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vclss16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vclss32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vclz.ll b/test/CodeGen/ARM/vclz.ll new file mode 100644 index 0000000000..575ea7d2e8 --- /dev/null +++ b/test/CodeGen/ARM/vclz.ll @@ -0,0 +1,48 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vclz\\.i8} %t | count 2 +; RUN: grep {vclz\\.i16} %t | count 2 +; RUN: grep {vclz\\.i32} %t | count 2 + +define <8 x i8> @vclz8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vclz16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vclz32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind 
readnone
diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll
new file mode 100644
index 0000000000..9817168958
--- /dev/null
+++ b/test/CodeGen/ARM/vcnt.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcnt\\.8} %t | count 2
+
+define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM/vcvt.ll b/test/CodeGen/ARM/vcvt.ll
new file mode 100644
index 0000000000..1cb42bf155
--- /dev/null
+++ b/test/CodeGen/ARM/vcvt.ll
@@ -0,0 +1,53 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
+; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
+; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
+; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
+
+define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = fptosi <2 x float> %tmp1 to <2 x i32>
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = fptoui <2 x float> %tmp1 to <2 x i32>
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = sitofp <2 x i32> %tmp1 to <2 x float>
+ ret <2 x float> %tmp2
+}
+
+define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = uitofp <2 x i32> %tmp1 to <2 x float>
+ ret <2 x float> %tmp2
+}
+
+define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = fptosi <4 x float> %tmp1 to <4 x i32>
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = fptoui <4 x float> %tmp1 to <4 x i32>
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = sitofp <4 x i32> %tmp1 to <4 x float>
+ ret <4 x float> %tmp2
+}
+
+define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = uitofp <4 x i32> %tmp1 to <4 x float>
+ ret <4 x float> %tmp2
+}
diff --git a/test/CodeGen/ARM/vcvt_n.ll b/test/CodeGen/ARM/vcvt_n.ll
new file mode 100644
index 0000000000..ac86b73d8d
--- /dev/null
+++ b/test/CodeGen/ARM/vcvt_n.ll
@@ -0,0 +1,64 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
+; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
+; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
+; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
+
+define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
+ ret <2 x
float> %tmp2 +} + +define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1) + ret <2 x float> %tmp2 +} + +declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone +declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone +declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone + +define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1) + ret <4 x i32> %tmp2 +} + +define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1) + ret <4 x i32> %tmp2 +} + +define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1) + ret <4 x float> %tmp2 +} + +define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1) + ret <4 x float> %tmp2 +} + +declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone + diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll new file mode 100644 index 0000000000..1c0887a249 --- /dev/null +++ b/test/CodeGen/ARM/vdup.ll @@ -0,0 +1,134 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vdup.8 %t | count 4 +; RUN: grep vdup.16 %t | count 4 +; RUN: grep vdup.32 %t | count 8 + +define <8 x i8> @v_dup8(i8 %A) nounwind { + %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7 + ret <8 x i8> %tmp8 +} + +define <4 x i16> @v_dup16(i16 %A) nounwind { + %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @v_dup32(i32 %A) nounwind { + %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1 + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_dupfloat(float %A) nounwind { + %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1 + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_dupQ8(i8 %A) nounwind { + %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <16 x i8> 
%tmp2, i8 %A, i32 2 + %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7 + %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8 + %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9 + %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10 + %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11 + %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12 + %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13 + %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14 + %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15 + ret <16 x i8> %tmp16 +} + +define <8 x i16> @v_dupQ16(i16 %A) nounwind { + %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3 + %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4 + %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5 + %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6 + %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7 + ret <8 x i16> %tmp8 +} + +define <4 x i32> @v_dupQ32(i32 %A) nounwind { + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1 + %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2 + %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3 + ret <4 x i32> %tmp4 +} + +define <4 x float> @v_dupQfloat(float %A) nounwind { + %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1 + %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2 + %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3 + ret <4 x float> %tmp4 +} + +; Check to make sure it works with shuffles, too. 
+ +define <8 x i8> @v_shuffledup8(i8 %A) nounwind { + %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %tmp2 +} + +define <4 x i16> @v_shuffledup16(i16 %A) nounwind { + %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp2 +} + +define <2 x i32> @v_shuffledup32(i32 %A) nounwind { + %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_shuffledupfloat(float %A) nounwind { + %tmp1 = insertelement <2 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind { + %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %tmp2 +} + +define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind { + %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %tmp2 +} + +define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind { + %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %tmp2 +} + +define <4 x float> @v_shuffledupQfloat(float %A) nounwind { + %tmp1 = insertelement <4 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %tmp2 +} diff --git a/test/CodeGen/ARM/vdup_lane.ll b/test/CodeGen/ARM/vdup_lane.ll new file mode 100644 index 0000000000..adadc9f5d3 --- /dev/null +++ b/test/CodeGen/ARM/vdup_lane.ll @@ -0,0 +1,52 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vdup.8 %t | count 2 +; RUN: grep vdup.16 %t | count 2 +; RUN: grep vdup.32 %t | count 4 + +define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x i32> %tmp2 +} + +define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x float> %tmp2 +} + +define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vduplaneQ32(<2 x 
i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i32> %tmp2 +} + +define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x float> %tmp2 +} diff --git a/test/CodeGen/ARM/veor.ll b/test/CodeGen/ARM/veor.ll new file mode 100644 index 0000000000..47a5f3f9f7 --- /dev/null +++ b/test/CodeGen/ARM/veor.ll @@ -0,0 +1,59 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep veor %t | count 8 +; Note: function names do not include "veor" to allow simple grep for opcodes + +define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = xor <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = xor <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = xor <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = xor <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = xor <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = xor <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = xor <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = xor <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} diff --git a/test/CodeGen/ARM/vfcmp.ll b/test/CodeGen/ARM/vfcmp.ll new file mode 100644 index 0000000000..58c2068bc8 --- /dev/null +++ b/test/CodeGen/ARM/vfcmp.ll @@ -0,0 +1,96 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vceq\\.f32} %t | count 1 +; RUN: grep {vcgt\\.f32} %t | count 9 +; RUN: grep {vcge\\.f32} %t | count 5 +; RUN: grep vorr %t | count 4 +; RUN: grep vmvn %t | count 7 + +; This tests vfcmp operations that do not map directly to NEON instructions. 
+
+; une is implemented with VCEQ/VMVN
+define <2 x i32> @vcunef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp une <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; olt is implemented with VCGT
+define <2 x i32> @vcoltf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp olt <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ole is implemented with VCGE
+define <2 x i32> @vcolef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ole <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; uge is implemented with VCGT/VMVN
+define <2 x i32> @vcugef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp uge <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ule is implemented with VCGT/VMVN
+define <2 x i32> @vculef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ule <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ugt is implemented with VCGE/VMVN
+define <2 x i32> @vcugtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ugt <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ult is implemented with VCGE/VMVN
+define <2 x i32> @vcultf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ult <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ueq is implemented with VCGT/VCGT/VORR/VMVN
+define <2 x i32> @vcueqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ueq <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; one is implemented with VCGT/VCGT/VORR
+define <2 x i32> @vconef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp one <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; uno is implemented with VCGT/VCGE/VORR/VMVN
+define <2 x i32> @vcunof32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp uno <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+; ord is implemented with VCGT/VCGE/VORR
+define <2 x i32> @vcordf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = vfcmp ord <2 x float> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
new file mode 100644
index 0000000000..a361ba2ba9
--- /dev/null
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -0,0 +1,78 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vmov\\.s8} %t | count 2
+; RUN: grep {vmov\\.s16} %t | count 2
+; RUN: grep {vmov\\.u8} %t | count 2
+; RUN: grep {vmov\\.u16} %t | count 2
+; RUN: grep {vmov\\.32} %t | count 2
+
+define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = extractelement <8 x i8> %tmp1, i32 1
+ %tmp3 = sext i8 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = extractelement <4 x i16> %tmp1, i32 1
+ %tmp3 = sext i16 %tmp2 to i32
+ ret i32 %tmp3
+}
+
+define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A + %tmp2 = extractelement <8 x i8> %tmp1, i32 1 + %tmp3 = zext i8 %tmp2 to i32 + ret i32 %tmp3 +} + +define i32 @vget_laneu16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = extractelement <4 x i16> %tmp1, i32 1 + %tmp3 = zext i16 %tmp2 to i32 + ret i32 %tmp3 +} + +; Do a vector add to keep the extraction from being done directly from memory. +define i32 @vget_lanei32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = add <2 x i32> %tmp1, %tmp1 + %tmp3 = extractelement <2 x i32> %tmp2, i32 1 + ret i32 %tmp3 +} + +define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = extractelement <16 x i8> %tmp1, i32 1 + %tmp3 = sext i8 %tmp2 to i32 + ret i32 %tmp3 +} + +define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = extractelement <8 x i16> %tmp1, i32 1 + %tmp3 = sext i16 %tmp2 to i32 + ret i32 %tmp3 +} + +define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = extractelement <16 x i8> %tmp1, i32 1 + %tmp3 = zext i8 %tmp2 to i32 + ret i32 %tmp3 +} + +define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = extractelement <8 x i16> %tmp1, i32 1 + %tmp3 = zext i16 %tmp2 to i32 + ret i32 %tmp3 +} + +; Do a vector add to keep the extraction from being done directly from memory. +define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = add <4 x i32> %tmp1, %tmp1 + %tmp3 = extractelement <4 x i32> %tmp2, i32 1 + ret i32 %tmp3 +} diff --git a/test/CodeGen/ARM/vhadd.ll b/test/CodeGen/ARM/vhadd.ll new file mode 100644 index 0000000000..5e7503dc71 --- /dev/null +++ b/test/CodeGen/ARM/vhadd.ll @@ -0,0 +1,107 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vhadd\\.s8} %t | count 2 +; RUN: grep {vhadd\\.s16} %t | count 2 +; RUN: grep {vhadd\\.s32} %t | count 2 +; RUN: grep {vhadd\\.u8} %t | count 2 +; RUN: grep {vhadd\\.u16} %t | count 2 +; RUN: grep {vhadd\\.u32} %t | count 2 + +define <8 x i8> @vhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <16 x i8> @vhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load 
<16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vhsub.ll b/test/CodeGen/ARM/vhsub.ll new file mode 100644 index 0000000000..32a66e5479 --- /dev/null +++ b/test/CodeGen/ARM/vhsub.ll @@ -0,0 +1,107 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vhsub\\.s8} %t | count 2 +; RUN: grep {vhsub\\.s16} %t | count 2 +; RUN: grep {vhsub\\.s32} %t | count 2 +; RUN: grep {vhsub\\.u8} %t | count 2 +; RUN: grep {vhsub\\.u16} %t | count 2 +; RUN: grep {vhsub\\.u32} %t | count 2 + +define <8 x i8> @vhsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vhsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vhsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %tmp1, 
<2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vhsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vhsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vhsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <16 x i8> @vhsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vhsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vhsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vhsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vhsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vhsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vicmp.ll b/test/CodeGen/ARM/vicmp.ll new file mode 100644 index 0000000000..86858f9293 --- /dev/null +++ b/test/CodeGen/ARM/vicmp.ll @@ -0,0 +1,85 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vceq\\.i8} %t | count 2 +; RUN: grep {vceq\\.i16} %t | 
count 2 +; RUN: grep {vceq\\.i32} %t | count 2 +; RUN: grep vmvn %t | count 6 +; RUN: grep {vcgt\\.s8} %t | count 1 +; RUN: grep {vcge\\.s16} %t | count 1 +; RUN: grep {vcgt\\.u16} %t | count 1 +; RUN: grep {vcge\\.u32} %t | count 1 + +; This tests vicmp operations that do not map directly to NEON instructions. +; Not-equal (ne) operations are implemented by VCEQ/VMVN. Less-than (lt/ult) +; and less-than-or-equal (le/ule) are implemented by swapping the arguments +; to VCGT and VCGE. Test all the operand types for not-equal but only sample +; the other operations. + +define <8 x i8> @vcnei8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = vicmp ne <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vcnei16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = vicmp ne <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vcnei32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = vicmp ne <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <16 x i8> @vcneQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = vicmp ne <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vcneQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = vicmp ne <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vcneQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = vicmp ne <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vcltQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = vicmp slt <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <4 x i16> @vcles16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = vicmp sle <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <4 x i16> @vcltu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = vicmp ult <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <4 x i32> @vcleQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = vicmp ule <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} diff --git a/test/CodeGen/ARM/vmax.ll b/test/CodeGen/ARM/vmax.ll new file mode 100644 index 0000000000..60322f85d3 --- /dev/null +++ b/test/CodeGen/ARM/vmax.ll @@ -0,0 +1,126 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmax\\.s8} %t | count 2 +; RUN: grep {vmax\\.s16} %t | count 2 +; RUN: grep {vmax\\.s32} %t | count 2 +; RUN: grep {vmax\\.u8} %t | count 2 +; RUN: grep {vmax\\.u16} %t | count 2 +; RUN: grep {vmax\\.u32} %t | count 2 +; RUN: grep {vmax\\.f32} %t | count 2 + +define <8 x i8> @vmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vmaxs32(<2 x i32>* %A, <2 x i32>* %B) 
nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <2 x float> @vmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +define <16 x i8> @vmaxQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vmaxQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vmaxQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vmaxQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vmaxQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vmaxQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <4 x float> @vmaxQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x float> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> 
@llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vmin.ll b/test/CodeGen/ARM/vmin.ll new file mode 100644 index 0000000000..a6936937c7 --- /dev/null +++ b/test/CodeGen/ARM/vmin.ll @@ -0,0 +1,126 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmin\\.s8} %t | count 2 +; RUN: grep {vmin\\.s16} %t | count 2 +; RUN: grep {vmin\\.s32} %t | count 2 +; RUN: grep {vmin\\.u8} %t | count 2 +; RUN: grep {vmin\\.u16} %t | count 2 +; RUN: grep {vmin\\.u32} %t | count 2 +; RUN: grep {vmin\\.f32} %t | count 2 + +define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> 
@llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x float> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float>, <2 x float>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vmla.ll b/test/CodeGen/ARM/vmla.ll new file mode 100644 index 0000000000..ed77e11a7c --- /dev/null +++ b/test/CodeGen/ARM/vmla.ll @@ -0,0 +1,77 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmla\\.i8} %t | count 2 +; RUN: grep {vmla\\.i16} %t | count 2 +; RUN: grep {vmla\\.i32} %t | count 2 +; RUN: grep {vmla\\.f32} %t | count 2 + +define <8 x i8> @vmlai8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = mul <8 x i8> %tmp2, %tmp3 + %tmp5 = add <8 x i8> %tmp1, %tmp4 + ret <8 x i8> %tmp5 +} + +define <4 x i16> @vmlai16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = mul <4 x i16> %tmp2, %tmp3 + %tmp5 = add <4 x i16> %tmp1, %tmp4 + ret <4 x i16> %tmp5 +} + +define <2 x i32> @vmlai32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = mul <2 x i32> %tmp2, %tmp3 + %tmp5 = add <2 x i32> %tmp1, %tmp4 + ret <2 x i32> %tmp5 +} + +define <2 x float> @vmlaf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = load <2 x float>* %C + %tmp4 = mul <2 x float> %tmp2, %tmp3 + %tmp5 = add <2 x float> %tmp1, %tmp4 + ret <2 x float> %tmp5 +} + +define <16 x i8> 
@vmlaQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = load <16 x i8>* %C + %tmp4 = mul <16 x i8> %tmp2, %tmp3 + %tmp5 = add <16 x i8> %tmp1, %tmp4 + ret <16 x i8> %tmp5 +} + +define <8 x i16> @vmlaQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = load <8 x i16>* %C + %tmp4 = mul <8 x i16> %tmp2, %tmp3 + %tmp5 = add <8 x i16> %tmp1, %tmp4 + ret <8 x i16> %tmp5 +} + +define <4 x i32> @vmlaQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = load <4 x i32>* %C + %tmp4 = mul <4 x i32> %tmp2, %tmp3 + %tmp5 = add <4 x i32> %tmp1, %tmp4 + ret <4 x i32> %tmp5 +} + +define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = load <4 x float>* %C + %tmp4 = mul <4 x float> %tmp2, %tmp3 + %tmp5 = add <4 x float> %tmp1, %tmp4 + ret <4 x float> %tmp5 +} diff --git a/test/CodeGen/ARM/vmlal.ll b/test/CodeGen/ARM/vmlal.ll new file mode 100644 index 0000000000..7fd00bac5e --- /dev/null +++ b/test/CodeGen/ARM/vmlal.ll @@ -0,0 +1,63 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmlal\\.s8} %t | count 1 +; RUN: grep {vmlal\\.s16} %t | count 1 +; RUN: grep {vmlal\\.s32} %t | count 1 +; RUN: grep {vmlal\\.u8} %t | count 1 +; RUN: grep {vmlal\\.u16} %t | count 1 +; RUN: grep {vmlal\\.u32} %t | count 1 + +define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind 
readnone +declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmls.ll b/test/CodeGen/ARM/vmls.ll new file mode 100644 index 0000000000..d519b7e70e --- /dev/null +++ b/test/CodeGen/ARM/vmls.ll @@ -0,0 +1,77 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmls\\.i8} %t | count 2 +; RUN: grep {vmls\\.i16} %t | count 2 +; RUN: grep {vmls\\.i32} %t | count 2 +; RUN: grep {vmls\\.f32} %t | count 2 + +define <8 x i8> @vmlsi8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = mul <8 x i8> %tmp2, %tmp3 + %tmp5 = sub <8 x i8> %tmp1, %tmp4 + ret <8 x i8> %tmp5 +} + +define <4 x i16> @vmlsi16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = mul <4 x i16> %tmp2, %tmp3 + %tmp5 = sub <4 x i16> %tmp1, %tmp4 + ret <4 x i16> %tmp5 +} + +define <2 x i32> @vmlsi32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = mul <2 x i32> %tmp2, %tmp3 + %tmp5 = sub <2 x i32> %tmp1, %tmp4 + ret <2 x i32> %tmp5 +} + +define <2 x float> @vmlsf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = load <2 x float>* %C + %tmp4 = mul <2 x float> %tmp2, %tmp3 + %tmp5 = sub <2 x float> %tmp1, %tmp4 + ret <2 x float> %tmp5 +} + +define <16 x i8> @vmlsQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = load <16 x i8>* %C + %tmp4 = mul <16 x i8> %tmp2, %tmp3 + %tmp5 = sub <16 x i8> %tmp1, %tmp4 + ret <16 x i8> %tmp5 +} + +define <8 x i16> @vmlsQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = load <8 x i16>* %C + %tmp4 = mul <8 x i16> %tmp2, %tmp3 + %tmp5 = sub <8 x i16> %tmp1, %tmp4 + ret <8 x i16> %tmp5 +} + +define <4 x i32> @vmlsQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = load <4 x i32>* %C + %tmp4 = mul <4 x i32> %tmp2, %tmp3 + %tmp5 = sub <4 x i32> %tmp1, %tmp4 + ret <4 x i32> %tmp5 +} + +define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = load <4 x float>* %C + %tmp4 = mul <4 x float> %tmp2, %tmp3 + %tmp5 = sub <4 x float> %tmp1, %tmp4 + ret <4 x float> %tmp5 +} diff --git a/test/CodeGen/ARM/vmlsl.ll b/test/CodeGen/ARM/vmlsl.ll new file mode 100644 index 0000000000..94910dc77b --- /dev/null +++ b/test/CodeGen/ARM/vmlsl.ll @@ -0,0 +1,63 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmlsl\\.s8} %t | count 1 +; RUN: grep {vmlsl\\.s16} %t | count 1 +; RUN: grep {vmlsl\\.s32} %t | count 1 +; RUN: grep {vmlsl\\.u8} %t | count 1 +; RUN: grep {vmlsl\\.u16} %t | count 1 +; RUN: grep {vmlsl\\.u32} %t | count 1 + +define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + 
%tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmov.ll b/test/CodeGen/ARM/vmov.ll new file mode 100644 index 0000000000..af9c8e2598 --- /dev/null +++ b/test/CodeGen/ARM/vmov.ll @@ -0,0 +1,101 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vmov.i8 %t | count 2 +; RUN: grep vmov.i16 %t | count 4 +; RUN: grep vmov.i32 %t | count 12 +; RUN: grep vmov.i64 %t | count 2 +; Note: function names do not include "vmov" to allow simple grep for opcodes + +define <8 x i8> @v_movi8() nounwind { + ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > +} + +define <4 x i16> @v_movi16a() nounwind { + ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 > +} + +; 0x1000 = 4096 +define <4 x i16> @v_movi16b() nounwind { + ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 > +} + +define <2 x i32> @v_movi32a() nounwind { + ret <2 x i32> < i32 32, i32 32 > +} + +; 0x2000 = 8192 +define <2 x i32> @v_movi32b() nounwind { + ret <2 x i32> < i32 8192, i32 8192 > +} + +; 0x200000 = 2097152 +define <2 x i32> @v_movi32c() nounwind { + ret <2 x i32> < i32 2097152, i32 2097152 > +} + +; 0x20000000 = 536870912 +define <2 x i32> @v_movi32d() nounwind { + ret <2 x i32> < i32 536870912, i32 536870912 > +} + +; 0x20ff = 8447 +define <2 x i32> 
@v_movi32e() nounwind { + ret <2 x i32> < i32 8447, i32 8447 > +} + +; 0x20ffff = 2162687 +define <2 x i32> @v_movi32f() nounwind { + ret <2 x i32> < i32 2162687, i32 2162687 > +} + +; 0xff0000ff0000ffff = 18374687574888349695 +define <1 x i64> @v_movi64() nounwind { + ret <1 x i64> < i64 18374687574888349695 > +} + +define <16 x i8> @v_movQi8() nounwind { + ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > +} + +define <8 x i16> @v_movQi16a() nounwind { + ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 > +} + +; 0x1000 = 4096 +define <8 x i16> @v_movQi16b() nounwind { + ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 > +} + +define <4 x i32> @v_movQi32a() nounwind { + ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 > +} + +; 0x2000 = 8192 +define <4 x i32> @v_movQi32b() nounwind { + ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 > +} + +; 0x200000 = 2097152 +define <4 x i32> @v_movQi32c() nounwind { + ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 > +} + +; 0x20000000 = 536870912 +define <4 x i32> @v_movQi32d() nounwind { + ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 > +} + +; 0x20ff = 8447 +define <4 x i32> @v_movQi32e() nounwind { + ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 > +} + +; 0x20ffff = 2162687 +define <4 x i32> @v_movQi32f() nounwind { + ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 > +} + +; 0xff0000ff0000ffff = 18374687574888349695 +define <2 x i64> @v_movQi64() nounwind { + ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 > +} + diff --git a/test/CodeGen/ARM/vmovl.ll b/test/CodeGen/ARM/vmovl.ll new file mode 100644 index 0000000000..09a77af0be --- /dev/null +++ b/test/CodeGen/ARM/vmovl.ll @@ -0,0 +1,51 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmovl\\.s8} %t | count 1 +; RUN: grep {vmovl\\.s16} %t | count 1 +; RUN: grep {vmovl\\.s32} %t | count 1 +; RUN: grep {vmovl\\.u8} %t | count 1 +; RUN: grep {vmovl\\.u16} %t | count 1 +; RUN: grep {vmovl\\.u32} %t | count 1 + +define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1) + ret <2 x i64> %tmp2 +} + +define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1) + ret <2 x i64> %tmp2 +} + +declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) 
nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmovn.ll b/test/CodeGen/ARM/vmovn.ll new file mode 100644 index 0000000000..d4d027c26c --- /dev/null +++ b/test/CodeGen/ARM/vmovn.ll @@ -0,0 +1,26 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmovn\\.i16} %t | count 1 +; RUN: grep {vmovn\\.i32} %t | count 1 +; RUN: grep {vmovn\\.i64} %t | count 1 + +define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll new file mode 100644 index 0000000000..eb9ae7b95c --- /dev/null +++ b/test/CodeGen/ARM/vmul.ll @@ -0,0 +1,79 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmul\\.i8} %t | count 2 +; RUN: grep {vmul\\.i16} %t | count 2 +; RUN: grep {vmul\\.i32} %t | count 2 +; RUN: grep {vmul\\.f32} %t | count 2 +; RUN: grep {vmul\\.p8} %t | count 2 + +define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = mul <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = mul <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = mul <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = mul <2 x float> %tmp1, %tmp2 + ret <2 x float> %tmp3 +} + +define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = mul <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = mul <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = mul <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = mul <4 x float> %tmp1, %tmp2 + ret <4 x float> %tmp3 +} + +define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + 
%tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone diff --git a/test/CodeGen/ARM/vmull.ll b/test/CodeGen/ARM/vmull.ll new file mode 100644 index 0000000000..2fe6f37411 --- /dev/null +++ b/test/CodeGen/ARM/vmull.ll @@ -0,0 +1,67 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmull\\.s8} %t | count 1 +; RUN: grep {vmull\\.s16} %t | count 1 +; RUN: grep {vmull\\.s32} %t | count 1 +; RUN: grep {vmull\\.u8} %t | count 1 +; RUN: grep {vmull\\.u16} %t | count 1 +; RUN: grep {vmull\\.u32} %t | count 1 +; RUN: grep {vmull\\.p8} %t | count 1 + +define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone diff --git a/test/CodeGen/ARM/vmvn.ll b/test/CodeGen/ARM/vmvn.ll new file mode 100644 index 0000000000..9bc135241e --- /dev/null +++ b/test/CodeGen/ARM/vmvn.ll @@ -0,0 +1,51 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vmvn %t | count 8 +; Note: function names do not include "vmvn" to allow simple grep for opcodes + +define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + 
%tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 > + ret <2 x i32> %tmp2 +} + +define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = xor <1 x i64> %tmp1, < i64 -1 > + ret <1 x i64> %tmp2 +} + +define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 > + ret <4 x i32> %tmp2 +} + +define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 > + ret <2 x i64> %tmp2 +} diff --git a/test/CodeGen/ARM/vneg.ll b/test/CodeGen/ARM/vneg.ll new file mode 100644 index 0000000000..9fa527f52f --- /dev/null +++ b/test/CodeGen/ARM/vneg.ll @@ -0,0 +1,53 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vneg\\.s8} %t | count 2 +; RUN: grep {vneg\\.s16} %t | count 2 +; RUN: grep {vneg\\.s32} %t | count 2 +; RUN: grep {vneg\\.f32} %t | count 2 + +define <8 x i8> @vnegs8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = sub <8 x i8> zeroinitializer, %tmp1 + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vnegs16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = sub <4 x i16> zeroinitializer, %tmp1 + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vnegs32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = sub <2 x i32> zeroinitializer, %tmp1 + ret <2 x i32> %tmp2 +} + +define <2 x float> @vnegf32(<2 x float>* %A) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = sub <2 x float> < float -0.000000e+00, float -0.000000e+00 >, %tmp1 + ret <2 x float> %tmp2 +} + +define <16 x i8> @vnegQs8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = sub <16 x i8> zeroinitializer, %tmp1 + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vnegQs16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = sub <8 x i16> zeroinitializer, %tmp1 + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vnegQs32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = sub <4 x i32> zeroinitializer, %tmp1 + ret <4 x i32> %tmp2 +} + +define <4 x float> @vnegQf32(<4 x float>* %A) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = sub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %tmp1 + ret <4 x float> %tmp2 +} diff --git a/test/CodeGen/ARM/vorn.ll b/test/CodeGen/ARM/vorn.ll new file mode 100644 index 0000000000..cee744ccb1 --- /dev/null +++ b/test/CodeGen/ARM/vorn.ll @@ -0,0 +1,67 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vorn %t | count 8 +; Note: function names do not include "vorn" to allow simple grep for opcodes + +define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) 
nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp4 = or <8 x i8> %tmp1, %tmp3 + ret <8 x i8> %tmp4 +} + +define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp4 = or <4 x i16> %tmp1, %tmp3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 > + %tmp4 = or <2 x i32> %tmp1, %tmp3 + ret <2 x i32> %tmp4 +} + +define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = xor <1 x i64> %tmp2, < i64 -1 > + %tmp4 = or <1 x i64> %tmp1, %tmp3 + ret <1 x i64> %tmp4 +} + +define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp4 = or <16 x i8> %tmp1, %tmp3 + ret <16 x i8> %tmp4 +} + +define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp4 = or <8 x i16> %tmp1, %tmp3 + ret <8 x i16> %tmp4 +} + +define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 > + %tmp4 = or <4 x i32> %tmp1, %tmp3 + ret <4 x i32> %tmp4 +} + +define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 > + %tmp4 = or <2 x i64> %tmp1, %tmp3 + ret <2 x i64> %tmp4 +} diff --git a/test/CodeGen/ARM/vorr.ll b/test/CodeGen/ARM/vorr.ll new file mode 100644 index 0000000000..39f4814c66 --- /dev/null +++ b/test/CodeGen/ARM/vorr.ll @@ -0,0 +1,59 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep vorr %t | count 8 +; Note: function names do not include "vorr" to allow simple grep for opcodes + +define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = or <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = or <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = or <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = or <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = or <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = or <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 
x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = or <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = or <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} diff --git a/test/CodeGen/ARM/vpadal.ll b/test/CodeGen/ARM/vpadal.ll new file mode 100644 index 0000000000..c41c532988 --- /dev/null +++ b/test/CodeGen/ARM/vpadal.ll @@ -0,0 +1,107 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vpadal\\.s8} %t | count 2 +; RUN: grep {vpadal\\.s16} %t | count 2 +; RUN: grep {vpadal\\.s32} %t | count 2 +; RUN: grep {vpadal\\.u8} %t | count 2 +; RUN: grep {vpadal\\.u16} %t | count 2 +; RUN: grep {vpadal\\.u32} %t | count 2 + +define <4 x i16> @vpadals8(<4 x i16>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vpadals16(<2 x i32>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vpadals32(<1 x i64>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2) + ret <1 x i64> %tmp3 +} + +define <4 x i16> @vpadalu8(<4 x i16>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vpadalu16(<2 x i32>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vpadalu32(<1 x i64>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i16> @vpadalQs8(<8 x i16>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vpadalQs16(<4 x i32>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vpadalQs32(<2 x i64>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vpadalQu8(<8 x i16>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vpadalQu16(<4 x i32>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> 
%tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vpadalQu32(<2 x i64>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) nounwind readnone + +declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll new file mode 100644 index 0000000000..baff49227e --- /dev/null +++ b/test/CodeGen/ARM/vpadd.ll @@ -0,0 +1,39 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vpadd\\.i8} %t | count 1 +; RUN: grep {vpadd\\.i16} %t | count 1 +; RUN: grep {vpadd\\.i32} %t | count 1 +; RUN: grep {vpadd\\.f32} %t | count 1 + +define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vpaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vpaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <2 x float> @vpaddf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float>, <2 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vpaddl.ll b/test/CodeGen/ARM/vpaddl.ll new file mode 100644 index 0000000000..babb49515a --- /dev/null +++ b/test/CodeGen/ARM/vpaddl.ll @@ -0,0 +1,95 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vpaddl\\.s8} %t | count 2 +; RUN: grep {vpaddl\\.s16} %t | count 2 +; RUN: grep {vpaddl\\.s32} %t | count 2 +; RUN: grep {vpaddl\\.u8} %t | count 2 +; RUN: grep 
{vpaddl\\.u16} %t | count 2 +; RUN: grep {vpaddl\\.u32} %t | count 2 + +define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1) + ret <1 x i64> %tmp2 +} + +define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1) + ret <1 x i64> %tmp2 +} + +define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1) + ret <2 x i64> %tmp2 +} + +define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1) + ret <2 x i64> %tmp2 +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone + +declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vpmax.ll b/test/CodeGen/ARM/vpmax.ll new file mode 100644 index 0000000000..9878ca8c7b --- /dev/null +++ b/test/CodeGen/ARM/vpmax.ll @@ -0,0 +1,67 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t 
+; RUN: grep {vpmax\\.s8} %t | count 1 +; RUN: grep {vpmax\\.s16} %t | count 1 +; RUN: grep {vpmax\\.s32} %t | count 1 +; RUN: grep {vpmax\\.u8} %t | count 1 +; RUN: grep {vpmax\\.u16} %t | count 1 +; RUN: grep {vpmax\\.u32} %t | count 1 +; RUN: grep {vpmax\\.f32} %t | count 1 + +define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vpmin.ll b/test/CodeGen/ARM/vpmin.ll new file mode 100644 index 0000000000..7b5348baa5 --- /dev/null +++ b/test/CodeGen/ARM/vpmin.ll @@ -0,0 +1,67 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vpmin\\.s8} %t | count 1 +; RUN: grep {vpmin\\.s16} %t | count 1 +; RUN: grep {vpmin\\.s32} %t | count 1 +; RUN: grep {vpmin\\.u8} %t | count 1 +; RUN: grep {vpmin\\.u16} %t | count 1 +; RUN: grep {vpmin\\.u32} %t | count 1 +; RUN: grep {vpmin\\.f32} %t | count 1 + +define <8 x i8> @vpmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vpmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> 
%tmp3 +} + +define <2 x i32> @vpmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vpminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vpminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vpminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <2 x float> @vpminf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float>, <2 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vqabs.ll b/test/CodeGen/ARM/vqabs.ll new file mode 100644 index 0000000000..04b8ad995b --- /dev/null +++ b/test/CodeGen/ARM/vqabs.ll @@ -0,0 +1,48 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqabs\\.s8} %t | count 2 +; RUN: grep {vqabs\\.s16} %t | count 2 +; RUN: grep {vqabs\\.s32} %t | count 2 + +define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind 
readnone +declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqadd.ll b/test/CodeGen/ARM/vqadd.ll new file mode 100644 index 0000000000..c9e2359953 --- /dev/null +++ b/test/CodeGen/ARM/vqadd.ll @@ -0,0 +1,141 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqadd\\.s8} %t | count 2 +; RUN: grep {vqadd\\.s16} %t | count 2 +; RUN: grep {vqadd\\.s32} %t | count 2 +; RUN: grep {vqadd\\.s64} %t | count 2 +; RUN: grep {vqadd\\.u8} %t | count 2 +; RUN: grep {vqadd\\.u16} %t | count 2 +; RUN: grep {vqadd\\.u32} %t | count 2 +; RUN: grep {vqadd\\.u64} %t | count 2 + +define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> 
%tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmlal.ll b/test/CodeGen/ARM/vqdmlal.ll new file mode 100644 index 0000000000..f05bf530c4 --- /dev/null +++ b/test/CodeGen/ARM/vqdmlal.ll @@ -0,0 +1,22 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqdmlal\\.s16} %t | count 1 +; RUN: grep {vqdmlal\\.s32} %t | count 1 + +define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git 
a/test/CodeGen/ARM/vqdmlsl.ll b/test/CodeGen/ARM/vqdmlsl.ll new file mode 100644 index 0000000000..2b71ed5563 --- /dev/null +++ b/test/CodeGen/ARM/vqdmlsl.ll @@ -0,0 +1,22 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqdmlsl\\.s16} %t | count 1 +; RUN: grep {vqdmlsl\\.s32} %t | count 1 + +define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmulh.ll b/test/CodeGen/ARM/vqdmulh.ll new file mode 100644 index 0000000000..dd686ddd93 --- /dev/null +++ b/test/CodeGen/ARM/vqdmulh.ll @@ -0,0 +1,73 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqdmulh\\.s16} %t | count 2 +; RUN: grep {vqdmulh\\.s32} %t | count 2 +; RUN: grep {vqrdmulh\\.s16} %t | count 2 +; RUN: grep {vqrdmulh\\.s32} %t | count 2 + +define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 
x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmull.ll b/test/CodeGen/ARM/vqdmull.ll new file mode 100644 index 0000000000..3ff90f795d --- /dev/null +++ b/test/CodeGen/ARM/vqdmull.ll @@ -0,0 +1,20 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqdmull\\.s16} %t | count 1 +; RUN: grep {vqdmull\\.s32} %t | count 1 + +define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqmovn.ll b/test/CodeGen/ARM/vqmovn.ll new file mode 100644 index 0000000000..78e6d23416 --- /dev/null +++ b/test/CodeGen/ARM/vqmovn.ll @@ -0,0 +1,76 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqmovn\\.s16} %t | count 1 +; RUN: grep {vqmovn\\.s32} %t | count 1 +; RUN: grep {vqmovn\\.s64} %t | count 1 +; RUN: grep {vqmovn\\.u16} %t | count 1 +; RUN: grep {vqmovn\\.u32} %t | count 1 +; RUN: grep {vqmovn\\.u64} %t | count 1 +; RUN: grep {vqmovun\\.s16} %t | count 1 +; RUN: grep {vqmovun\\.s32} %t | count 1 +; RUN: grep {vqmovun\\.s64} %t | count 1 + +define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> 
@vqmovuns32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqneg.ll b/test/CodeGen/ARM/vqneg.ll new file mode 100644 index 0000000000..72035f024f --- /dev/null +++ b/test/CodeGen/ARM/vqneg.ll @@ -0,0 +1,48 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqneg\\.s8} %t | count 2 +; RUN: grep {vqneg\\.s16} %t | count 2 +; RUN: grep {vqneg\\.s32} %t | count 2 + +define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqrshl.ll b/test/CodeGen/ARM/vqrshl.ll new file mode 100644 index 0000000000..63e03c1245 --- /dev/null +++ b/test/CodeGen/ARM/vqrshl.ll @@ -0,0 +1,141 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqrshl\\.s8} %t | count 2 +; RUN: grep {vqrshl\\.s16} %t | count 2 +; RUN: grep {vqrshl\\.s32} %t | count 2 +; RUN: grep {vqrshl\\.s64} %t | count 2 +; RUN: grep {vqrshl\\.u8} %t | count 2 +; RUN: grep {vqrshl\\.u16} %t | count 2 +; RUN: grep {vqrshl\\.u32} %t | count 2 +; RUN: grep {vqrshl\\.u64} %t | count 2 + +define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + 
%tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x 
i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqrshrn.ll b/test/CodeGen/ARM/vqrshrn.ll new file mode 100644 index 0000000000..38f98f2a2c --- /dev/null +++ b/test/CodeGen/ARM/vqrshrn.ll @@ -0,0 +1,76 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqrshrn\\.s16} %t | count 1 +; RUN: grep {vqrshrn\\.s32} %t | count 1 +; RUN: grep {vqrshrn\\.s64} %t | count 1 +; RUN: grep {vqrshrn\\.u16} %t | count 1 +; RUN: grep {vqrshrn\\.u32} %t | count 1 +; RUN: grep {vqrshrn\\.u64} %t | count 1 +; RUN: grep {vqrshrun\\.s16} %t | count 1 +; RUN: grep {vqrshrun\\.s32} %t | count 1 +; RUN: grep {vqrshrun\\.s64} %t | count 1 + +define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} 
+ +define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqshl.ll b/test/CodeGen/ARM/vqshl.ll new file mode 100644 index 0000000000..60b04bd583 --- /dev/null +++ b/test/CodeGen/ARM/vqshl.ll @@ -0,0 +1,307 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqshl\\.s8} %t | count 4 +; RUN: grep {vqshl\\.s16} %t | count 4 +; RUN: grep {vqshl\\.s32} %t | count 4 +; RUN: grep {vqshl\\.s64} %t | count 4 +; RUN: grep {vqshl\\.u8} %t | count 4 +; RUN: grep {vqshl\\.u16} %t | count 4 +; RUN: grep {vqshl\\.u32} %t | count 4 +; RUN: grep {vqshl\\.u64} %t | count 4 +; RUN: grep {vqshl\\.s8.*#7} %t | count 2 +; RUN: grep {vqshl\\.s16.*#15} %t | count 2 +; RUN: grep {vqshl\\.s32.*#31} %t | count 2 +; RUN: grep {vqshl\\.s64.*#63} %t | count 2 +; RUN: grep {vqshl\\.u8.*#7} %t | count 2 +; RUN: grep {vqshl\\.u16.*#15} %t | count 2 +; RUN: grep {vqshl\\.u32.*#31} %t | count 2 +; RUN: grep {vqshl\\.u64.*#63} %t | count 2 +; RUN: grep {vqshlu\\.s8} %t | count 2 +; RUN: grep {vqshlu\\.s16} %t | count 2 +; RUN: grep {vqshlu\\.s32} %t | count 2 +; RUN: grep {vqshlu\\.s64} %t | count 2 + +define <8 x i8> @vqshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + 
+define <1 x i64> @vqshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i8> @vqshls_n8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqshls_n16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x 
i16> < i16 15, i16 15, i16 15, i16 15 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqshls_n32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vqshls_n64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >) + ret <1 x i64> %tmp2 +} + +define <8 x i8> @vqshlu_n8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqshlu_n16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqshlu_n32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vqshlu_n64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >) + ret <1 x i64> %tmp2 +} + +define <8 x i8> @vqshlsu_n8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqshlsu_n16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqshlsu_n32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vqshlsu_n64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >) + ret <1 x i64> %tmp2 +} + +define <16 x i8> @vqshlQs_n8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vqshlQs_n16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vqshlQs_n32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vqshlQs_n64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >) + ret <2 x i64> %tmp2 +} + +define <16 x i8> @vqshlQu_n8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <16 x 
i8> %tmp2 +} + +define <8 x i16> @vqshlQu_n16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vqshlQu_n32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vqshlQu_n64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >) + ret <2 x i64> %tmp2 +} + +define <16 x i8> @vqshlQsu_n8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vqshlQsu_n16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vqshlQsu_n32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vqshlQsu_n64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >) + ret <2 x i64> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> 
@llvm.arm.neon.vqshiftsu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqshrn.ll b/test/CodeGen/ARM/vqshrn.ll new file mode 100644 index 0000000000..6bd607abb4 --- /dev/null +++ b/test/CodeGen/ARM/vqshrn.ll @@ -0,0 +1,76 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqshrn\\.s16} %t | count 1 +; RUN: grep {vqshrn\\.s32} %t | count 1 +; RUN: grep {vqshrn\\.s64} %t | count 1 +; RUN: grep {vqshrn\\.u16} %t | count 1 +; RUN: grep {vqshrn\\.u32} %t | count 1 +; RUN: grep {vqshrn\\.u64} %t | count 1 +; RUN: grep {vqshrun\\.s16} %t | count 1 +; RUN: grep {vqshrun\\.s32} %t | count 1 +; RUN: grep {vqshrun\\.s64} %t | count 1 + +define <8 x i8> @vqshrns8(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqshrns16(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqshrns32(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqshrnu8(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqshrnu16(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqshrnu32(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqshruns8(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqshruns16(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqshruns32(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> 
@llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqsub.ll b/test/CodeGen/ARM/vqsub.ll new file mode 100644 index 0000000000..07052f78db --- /dev/null +++ b/test/CodeGen/ARM/vqsub.ll @@ -0,0 +1,141 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vqsub\\.s8} %t | count 2 +; RUN: grep {vqsub\\.s16} %t | count 2 +; RUN: grep {vqsub\\.s32} %t | count 2 +; RUN: grep {vqsub\\.s64} %t | count 2 +; RUN: grep {vqsub\\.u8} %t | count 2 +; RUN: grep {vqsub\\.u16} %t | count 2 +; RUN: grep {vqsub\\.u32} %t | count 2 +; RUN: grep {vqsub\\.u64} %t | count 2 + +define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> 
@vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vraddhn.ll b/test/CodeGen/ARM/vraddhn.ll new file mode 100644 index 0000000000..d69e54554c --- /dev/null +++ b/test/CodeGen/ARM/vraddhn.ll @@ -0,0 +1,29 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vraddhn\\.i16} %t | count 1 +; RUN: grep {vraddhn\\.i32} %t | count 1 +; RUN: grep {vraddhn\\.i64} %t | count 1 + +define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x 
i64>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vrecpe.ll b/test/CodeGen/ARM/vrecpe.ll new file mode 100644 index 0000000000..79cb595bc8 --- /dev/null +++ b/test/CodeGen/ARM/vrecpe.ll @@ -0,0 +1,33 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vrecpe\\.u32} %t | count 2 +; RUN: grep {vrecpe\\.f32} %t | count 2 + +define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x float> @vrecpef32(<2 x float>* %A) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = call <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float> %tmp1) + ret <2 x float> %tmp2 +} + +define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float> %tmp1) + ret <4 x float> %tmp2 +} + +declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrecps.ll b/test/CodeGen/ARM/vrecps.ll new file mode 100644 index 0000000000..7ded4157f0 --- /dev/null +++ b/test/CodeGen/ARM/vrecps.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vrecps\\.f32} %t | count 2 + +define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x float> %tmp3 +} + +declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrhadd.ll b/test/CodeGen/ARM/vrhadd.ll new file mode 100644 index 0000000000..4f6437caea --- /dev/null +++ b/test/CodeGen/ARM/vrhadd.ll @@ -0,0 +1,107 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vrhadd\\.s8} %t | count 2 +; RUN: grep {vrhadd\\.s16} %t | count 2 +; RUN: grep {vrhadd\\.s32} %t | count 2 +; RUN: grep {vrhadd\\.u8} %t | count 2 +; RUN: grep {vrhadd\\.u16} %t | count 2 +; RUN: grep {vrhadd\\.u32} %t | count 2 + +define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { 
+ %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind 
readnone +declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vrshl.ll b/test/CodeGen/ARM/vrshl.ll new file mode 100644 index 0000000000..fbbf5489c7 --- /dev/null +++ b/test/CodeGen/ARM/vrshl.ll @@ -0,0 +1,245 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vrshl\\.s8} %t | count 2 +; RUN: grep {vrshl\\.s16} %t | count 2 +; RUN: grep {vrshl\\.s32} %t | count 2 +; RUN: grep {vrshl\\.s64} %t | count 2 +; RUN: grep {vrshl\\.u8} %t | count 2 +; RUN: grep {vrshl\\.u16} %t | count 2 +; RUN: grep {vrshl\\.u32} %t | count 2 +; RUN: grep {vrshl\\.u64} %t | count 2 +; RUN: grep {vrshr\\.s8} %t | count 2 +; RUN: grep {vrshr\\.s16} %t | count 2 +; RUN: grep {vrshr\\.s32} %t | count 2 +; RUN: grep {vrshr\\.s64} %t | count 2 +; RUN: grep {vrshr\\.u8} %t | count 2 +; RUN: grep {vrshr\\.u16} %t | count 2 +; RUN: grep {vrshr\\.u32} %t | count 2 +; RUN: grep {vrshr\\.u64} %t | count 2 + +define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> 
@llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >) + ret <1 x i64> %tmp2 +} + +define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >) + ret <1 x i64> %tmp2 +} + +define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> 
%tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >) + ret <2 x i64> %tmp2 +} + +define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >) + ret <2 x i64> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vrshrn.ll b/test/CodeGen/ARM/vrshrn.ll new file mode 100644 index 0000000000..8af24fd4d6 --- /dev/null +++ b/test/CodeGen/ARM/vrshrn.ll @@ -0,0 +1,26 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vrshrn\\.i16} %t | count 1 +; RUN: grep {vrshrn\\.i32} %t | count 1 +; RUN: grep {vrshrn\\.i64} %t | count 1 + +define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 
-8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vrsqrte.ll b/test/CodeGen/ARM/vrsqrte.ll new file mode 100644 index 0000000000..10529f61b5 --- /dev/null +++ b/test/CodeGen/ARM/vrsqrte.ll @@ -0,0 +1,33 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vrsqrte\\.u32} %t | count 2 +; RUN: grep {vrsqrte\\.f32} %t | count 2 + +define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float> %tmp1) + ret <2 x float> %tmp2 +} + +define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float> %tmp1) + ret <4 x float> %tmp2 +} + +declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrsqrts.ll b/test/CodeGen/ARM/vrsqrts.ll new file mode 100644 index 0000000000..3fd0567842 --- /dev/null +++ b/test/CodeGen/ARM/vrsqrts.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vrsqrts\\.f32} %t | count 2 + +define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind { + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind { + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x float> %tmp3 +} + +declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrsubhn.ll b/test/CodeGen/ARM/vrsubhn.ll new file mode 100644 index 0000000000..587092397c --- /dev/null +++ b/test/CodeGen/ARM/vrsubhn.ll @@ -0,0 +1,29 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vrsubhn\\.i16} %t | count 1 +; RUN: grep {vrsubhn\\.i32} %t | count 1 +; RUN: grep {vrsubhn\\.i64} 
%t | count 1 + +define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vset_lane.ll b/test/CodeGen/ARM/vset_lane.ll new file mode 100644 index 0000000000..279c628aed --- /dev/null +++ b/test/CodeGen/ARM/vset_lane.ll @@ -0,0 +1,40 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vmov\\.8} %t | count 2 +; RUN: grep {vmov\\.16} %t | count 2 +; RUN: grep {vmov\\.32} %t | count 2 + +define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1 + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1 + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1 + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1 + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1 + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1 + ret <4 x i32> %tmp2 +} diff --git a/test/CodeGen/ARM/vshift.ll b/test/CodeGen/ARM/vshift.ll new file mode 100644 index 0000000000..8c5c4aad18 --- /dev/null +++ b/test/CodeGen/ARM/vshift.ll @@ -0,0 +1,337 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vshl\\.s8} %t | count 2 +; RUN: grep {vshl\\.s16} %t | count 2 +; RUN: grep {vshl\\.s32} %t | count 2 +; RUN: grep {vshl\\.s64} %t | count 2 +; RUN: grep {vshl\\.u8} %t | count 4 +; RUN: grep {vshl\\.u16} %t | count 4 +; RUN: grep {vshl\\.u32} %t | count 4 +; RUN: grep {vshl\\.u64} %t | count 4 +; RUN: grep {vshl\\.i8} %t | count 2 +; RUN: grep {vshl\\.i16} %t | count 2 +; RUN: grep {vshl\\.i32} %t | count 2 +; RUN: grep {vshl\\.i64} %t | count 2 +; RUN: grep {vshr\\.u8} %t | count 2 +; RUN: grep {vshr\\.u16} %t | count 2 +; RUN: grep {vshr\\.u32} %t | count 2 +; RUN: grep {vshr\\.u64} %t | count 2 +; RUN: grep {vshr\\.s8} %t | count 2 +; RUN: grep {vshr\\.s16} %t | count 2 +; RUN: grep {vshr\\.s32} %t | count 2 +; RUN: grep {vshr\\.s64} %t | count 2 +; RUN: grep {vneg\\.s8} %t | count 4 +; RUN: grep {vneg\\.s16} %t | count 4 +; RUN: grep {vneg\\.s32} %t | count 4 +; RUN: 
grep {vsub\\.i64} %t | count 4 + +define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shl <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = shl <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = shl <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = shl <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vshli8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = shl <8 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vshli16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = shl <4 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vshli32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = shl <2 x i32> %tmp1, < i32 31, i32 31 > + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vshli64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = shl <1 x i64> %tmp1, < i64 63 > + ret <1 x i64> %tmp2 +} + +define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shl <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = shl <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = shl <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = shl <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = shl <16 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = shl <8 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = shl <4 x i32> %tmp1, < i32 31, i32 31, i32 31, i32 31 > + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = shl <2 x i64> %tmp1, < i64 63, i64 63 > + ret <2 x i64> %tmp2 +} + +define <8 x i8> @vlshru8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = lshr <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vlshru16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = lshr <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vlshru32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = lshr <2 x i32> %tmp1, %tmp2 + ret <2 x 
i32> %tmp3 +} + +define <1 x i64> @vlshru64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = lshr <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vlshri8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = lshr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vlshri16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = lshr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vlshri32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = lshr <2 x i32> %tmp1, < i32 32, i32 32 > + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vlshri64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = lshr <1 x i64> %tmp1, < i64 64 > + ret <1 x i64> %tmp2 +} + +define <16 x i8> @vlshrQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = lshr <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vlshrQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = lshr <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vlshrQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = lshr <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vlshrQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = lshr <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vlshrQi8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = lshr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vlshrQi16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = lshr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vlshrQi32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = lshr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 > + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vlshrQi64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = lshr <2 x i64> %tmp1, < i64 64, i64 64 > + ret <2 x i64> %tmp2 +} + +define <8 x i8> @vashrs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = ashr <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vashrs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = ashr <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vashrs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = ashr <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vashrs64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = ashr <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vashri8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = ashr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vashri16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = ashr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 > + ret <4 
x i16> %tmp2 +} + +define <2 x i32> @vashri32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = ashr <2 x i32> %tmp1, < i32 32, i32 32 > + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vashri64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = ashr <1 x i64> %tmp1, < i64 64 > + ret <1 x i64> %tmp2 +} + +define <16 x i8> @vashrQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = ashr <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vashrQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = ashr <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vashrQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = ashr <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vashrQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = ashr <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vashrQi8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = ashr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vashrQi16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = ashr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vashrQi32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = ashr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 > + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vashrQi64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = ashr <2 x i64> %tmp1, < i64 64, i64 64 > + ret <2 x i64> %tmp2 +} diff --git a/test/CodeGen/ARM/vshiftins.ll b/test/CodeGen/ARM/vshiftins.ll new file mode 100644 index 0000000000..cb7cbb89ec --- /dev/null +++ b/test/CodeGen/ARM/vshiftins.ll @@ -0,0 +1,131 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vsli\\.8} %t | count 2 +; RUN: grep {vsli\\.16} %t | count 2 +; RUN: grep {vsli\\.32} %t | count 2 +; RUN: grep {vsli\\.64} %t | count 2 +; RUN: grep {vsri\\.8} %t | count 2 +; RUN: grep {vsri\\.16} %t | count 2 +; RUN: grep {vsri\\.32} %t | count 2 +; RUN: grep {vsri\\.64} %t | count 2 + +define <8 x i8> @vsli8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vsli16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vsli32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 31, i32 31 >) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vsli64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 63 >) + ret <1 x i64> 
%tmp3 +} + +define <16 x i8> @vsliQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vsliQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vsliQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vsliQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 63, i64 63 >) + ret <2 x i64> %tmp3 +} + +define <8 x i8> @vsri8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vsri16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vsri32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vsri64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 -64 >) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vsriQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vsriQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vsriQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vsriQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x 
i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >) + ret <2 x i64> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vshl.ll b/test/CodeGen/ARM/vshl.ll new file mode 100644 index 0000000000..993126ea57 --- /dev/null +++ b/test/CodeGen/ARM/vshl.ll @@ -0,0 +1,302 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vshl\\.s8} %t | count 2 +; RUN: grep {vshl\\.s16} %t | count 2 +; RUN: grep {vshl\\.s32} %t | count 2 +; RUN: grep {vshl\\.s64} %t | count 2 +; RUN: grep {vshl\\.u8} %t | count 2 +; RUN: grep {vshl\\.u16} %t | count 2 +; RUN: grep {vshl\\.u32} %t | count 2 +; RUN: grep {vshl\\.u64} %t | count 2 +; RUN: grep {vshl\\.i8} %t | count 2 +; RUN: grep {vshl\\.i16} %t | count 2 +; RUN: grep {vshl\\.i32} %t | count 2 +; RUN: grep {vshl\\.i64} %t | count 2 +; RUN: grep {vshr\\.s8} %t | count 2 +; RUN: grep {vshr\\.s16} %t | count 2 +; RUN: grep {vshr\\.s32} %t | count 2 +; RUN: grep {vshr\\.s64} %t | count 2 +; RUN: grep {vshr\\.u8} %t | count 2 +; RUN: grep {vshr\\.u16} %t | count 2 +; RUN: grep {vshr\\.u32} %t | count 2 +; RUN: grep {vshr\\.u64} %t | count 2 + +define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x 
i32> %tmp3 +} + +define <1 x i64> @vshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +; For left shifts by immediates, the signedness is irrelevant. +; Test a mix of both signed and unsigned intrinsics. 
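; Note (inferred from the RUN lines above, not stated explicitly by the tests):
; the shift amounts below are constant vectors. A splat in [0, element bits - 1]
; is expected to select the immediate form (vshl.i8 and friends) from either the
; signed or the unsigned intrinsic, while the negative splats used in the
; right-shift tests further down encode a right shift by the corresponding
; amount, e.g. a splat of -8 on <8 x i8> selects vshr.s8 or vshr.u8 with #8.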
+ +define <8 x i8> @vshli8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vshli16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vshli32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vshli64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >) + ret <1 x i64> %tmp2 +} + +define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >) + ret <2 x i64> %tmp2 +} + +; Right shift by immediate: + +define <8 x i8> @vshrs8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vshrs16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vshrs32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vshrs64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >) + ret <1 x i64> %tmp2 +} + +define <8 x i8> @vshru8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vshru16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vshru32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vshru64(<1 x i64>* %A) nounwind { + %tmp1 = load <1 x i64>* %A + %tmp2 = 
call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >) + ret <1 x i64> %tmp2 +} + +define <16 x i8> @vshrQs8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vshrQs16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vshrQs32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vshrQs64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >) + ret <2 x i64> %tmp2 +} + +define <16 x i8> @vshrQu8(<16 x i8>* %A) nounwind { + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vshrQu16(<8 x i16>* %A) nounwind { + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vshrQu32(<4 x i32>* %A) nounwind { + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vshrQu64(<2 x i64>* %A) nounwind { + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >) + ret <2 x i64> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone 
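For orientation, a minimal C-level sketch of the arm_neon.h operations that the vshl.ll tests above exercise at the IR level. The function names are illustrative, and the exact lowering from these C intrinsics to the llvm.arm.neon.vshifts/vshiftu calls is assumed here rather than shown by this patch:

#include <arm_neon.h>

/* Variable shifts: the sign of the element type picks vshl.s8 vs. vshl.u8. */
int8x8_t  shl_var_s8(int8x8_t a, int8x8_t b)  { return vshl_s8(a, b); }
uint8x8_t shl_var_u8(uint8x8_t a, int8x8_t b) { return vshl_u8(a, b); }

/* Left shift by immediate: signedness is irrelevant, both map to vshl.i8. */
int8x8_t  shl_imm_s8(int8x8_t a)  { return vshl_n_s8(a, 7); }
uint8x8_t shl_imm_u8(uint8x8_t a) { return vshl_n_u8(a, 7); }

/* Right shift by immediate: vshr.s8 / vshr.u8, shift count in [1, 8]. */
int8x8_t  shr_imm_s8(int8x8_t a)  { return vshr_n_s8(a, 8); }
uint8x8_t shr_imm_u8(uint8x8_t a) { return vshr_n_u8(a, 8); }

Built for a NEON-capable ARM target, each of these would be expected to select the corresponding VSHL/VSHR form counted by the grep lines above.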
diff --git a/test/CodeGen/ARM/vshll.ll b/test/CodeGen/ARM/vshll.ll new file mode 100644 index 0000000000..f81c09a7b9 --- /dev/null +++ b/test/CodeGen/ARM/vshll.ll @@ -0,0 +1,74 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t +; RUN: grep {vshll\\.s8} %t | count 1 +; RUN: grep {vshll\\.s16} %t | count 1 +; RUN: grep {vshll\\.s32} %t | count 1 +; RUN: grep {vshll\\.u8} %t | count 1 +; RUN: grep {vshll\\.u16} %t | count 1 +; RUN: grep {vshll\\.u32} %t | count 1 +; RUN: grep {vshll\\.i8} %t | count 1 +; RUN: grep {vshll\\.i16} %t | count 1 +; RUN: grep {vshll\\.i32} %t | count 1 + +define <8 x i16> @vshlls8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vshlls16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vshlls32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >) + ret <2 x i64> %tmp2 +} + +define <8 x i16> @vshllu8(<8 x i8>* %A) nounwind { + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vshllu16(<4 x i16>* %A) nounwind { + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vshllu32(<2 x i32>* %A) nounwind { + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >) + ret <2 x i64> %tmp2 +} + +; The following tests use the maximum shift count, so the signedness is +; irrelevant. Test both signed and unsigned versions. 
+define <8 x i16> @vshlli8(<8 x i8>* %A) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vshlli16(<4 x i16>* %A) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 16, i16 16, i16 16, i16 16 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vshlli32(<2 x i32>* %A) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 32, i32 32 >)
+ ret <2 x i64> %tmp2
+}
+
+declare <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vshrn.ll b/test/CodeGen/ARM/vshrn.ll
new file mode 100644
index 0000000000..bc640cbbca
--- /dev/null
+++ b/test/CodeGen/ARM/vshrn.ll
@@ -0,0 +1,26 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vshrn\\.i16} %t | count 1
+; RUN: grep {vshrn\\.i32} %t | count 1
+; RUN: grep {vshrn\\.i64} %t | count 1
+
+define <8 x i8> @vshrns8(<8 x i16>* %A) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vshrns16(<4 x i32>* %A) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsra.ll b/test/CodeGen/ARM/vsra.ll
new file mode 100644
index 0000000000..e2829dcdda
--- /dev/null
+++ b/test/CodeGen/ARM/vsra.ll
@@ -0,0 +1,293 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsra\\.s8} %t | count 2
+; RUN: grep {vsra\\.s16} %t | count 2
+; RUN: grep {vsra\\.s32} %t | count 2
+; RUN: grep {vsra\\.s64} %t | count 2
+; RUN: grep {vsra\\.u8} %t | count 2
+; RUN: grep {vsra\\.u16} %t | count 2
+; RUN: grep {vsra\\.u32} %t | count 2
+; RUN: grep {vsra\\.u64} %t | count 2
+; RUN: grep {vrsra\\.s8} %t | count 2
+; RUN: grep {vrsra\\.s16} %t | count 2
+; RUN: grep {vrsra\\.s32} %t | count 2
+; RUN: grep {vrsra\\.s64} %t | count 2
+; RUN: grep {vrsra\\.u8} %t | count 2
+; RUN: grep {vrsra\\.u16} %t | count 2
+; RUN: grep {vrsra\\.u32} %t | count 2
+; RUN: grep {vrsra\\.u64} %t | count 2
+
+define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = ashr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = ashr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = ashr <2 x i32> %tmp2, < i32 32, i32 32 >
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = ashr <1 x i64> %tmp2, < i64 64 >
+ %tmp4 = add <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = ashr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = ashr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = ashr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = ashr <2 x i64> %tmp2, < i64 64, i64 64 >
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = lshr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = lshr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = lshr <2 x i32> %tmp2, < i32 32, i32 32 >
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = lshr <1 x i64> %tmp2, < i64 64 >
+ %tmp4 = add <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = lshr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = lshr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = lshr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = lshr <2 x i64> %tmp2, < i64 64, i64 64 >
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @vrsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vrsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vrsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vrsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
+ %tmp4 = add <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <8 x i8> @vrsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vrsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vrsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @vrsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
+ %tmp4 = add <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vrsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vrsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vrsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vrsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <16 x i8> @vrsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vrsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vrsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vrsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsub.ll b/test/CodeGen/ARM/vsub.ll
new file mode 100644
index 0000000000..85dea41835
--- /dev/null
+++ b/test/CodeGen/ARM/vsub.ll
@@ -0,0 +1,76 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsub\\.i8} %t | count 2
+; RUN: grep {vsub\\.i16} %t | count 2
+; RUN: grep {vsub\\.i32} %t | count 2
+; RUN: grep {vsub\\.i64} %t | count 2
+; RUN: grep {vsub\\.f32} %t | count 2
+
+define <8 x i8> @vsubi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sub <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vsubi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sub <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vsubi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sub <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vsubi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = sub <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <2 x float> @vsubf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = sub <2 x float> %tmp1, %tmp2
+ ret <2 x float> %tmp3
+}
+
+define <16 x i8> @vsubQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = sub <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vsubQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = sub <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = sub <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = sub <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = sub <4 x float> %tmp1, %tmp2
+ ret <4 x float> %tmp3
+}
diff --git a/test/CodeGen/ARM/vsubhn.ll b/test/CodeGen/ARM/vsubhn.ll
new file mode 100644
index 0000000000..5adc9ed97d
--- /dev/null
+++ b/test/CodeGen/ARM/vsubhn.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsubhn\\.i16} %t | count 1
+; RUN: grep {vsubhn\\.i32} %t | count 1
+; RUN: grep {vsubhn\\.i64} %t | count 1
+
+define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsubl.ll b/test/CodeGen/ARM/vsubl.ll
new file mode 100644
index 0000000000..d2af455843
--- /dev/null
+++ b/test/CodeGen/ARM/vsubl.ll
@@ -0,0 +1,57 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsubl\\.s8} %t | count 1
+; RUN: grep {vsubl\\.s16} %t | count 1
+; RUN: grep {vsubl\\.s32} %t | count 1
+; RUN: grep {vsubl\\.u8} %t | count 1
+; RUN: grep {vsubl\\.u16} %t | count 1
+; RUN: grep {vsubl\\.u32} %t | count 1
+
+define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsubw.ll b/test/CodeGen/ARM/vsubw.ll
new file mode 100644
index 0000000000..30bec5532a
--- /dev/null
+++ b/test/CodeGen/ARM/vsubw.ll
@@ -0,0 +1,57 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vsubw\\.s8} %t | count 1
+; RUN: grep {vsubw\\.s16} %t | count 1
+; RUN: grep {vsubw\\.s32} %t | count 1
+; RUN: grep {vsubw\\.u8} %t | count 1
+; RUN: grep {vsubw\\.u16} %t | count 1
+; RUN: grep {vsubw\\.u32} %t | count 1
+
+define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vtst.ll b/test/CodeGen/ARM/vtst.ll
new file mode 100644
index 0000000000..d4f7d2aef1
--- /dev/null
+++ b/test/CodeGen/ARM/vtst.ll
@@ -0,0 +1,52 @@
+; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
+; RUN: grep {vtst\\.i8} %t | count 2
+; RUN: grep {vtst\\.i16} %t | count 2
+; RUN: grep {vtst\\.i32} %t | count 2
+
+define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = and <8 x i8> %tmp1, %tmp2
+ %tmp4 = vicmp ne <8 x i8> %tmp3, zeroinitializer
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = and <4 x i16> %tmp1, %tmp2
+ %tmp4 = vicmp ne <4 x i16> %tmp3, zeroinitializer
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = and <2 x i32> %tmp1, %tmp2
+ %tmp4 = vicmp ne <2 x i32> %tmp3, zeroinitializer
+ ret <2 x i32> %tmp4
+}
+
+define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = and <16 x i8> %tmp1, %tmp2
+ %tmp4 = vicmp ne <16 x i8> %tmp3, zeroinitializer
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = and <8 x i16> %tmp1, %tmp2
+ %tmp4 = vicmp ne <8 x i16> %tmp3, zeroinitializer
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = and <4 x i32> %tmp1, %tmp2
+ %tmp4 = vicmp ne <4 x i32> %tmp3, zeroinitializer
+ ret <4 x i32> %tmp4
+}