Diffstat (limited to 'lib')
-rw-r--r-- | lib/Headers/Makefile    |  14
-rw-r--r-- | lib/Headers/arm_neon.h  | 537
-rw-r--r-- | lib/Headers/arm_neon.td | 286
3 files changed, 299 insertions, 538 deletions
diff --git a/lib/Headers/Makefile b/lib/Headers/Makefile index cb36e84319..fea59d6f4b 100644 --- a/lib/Headers/Makefile +++ b/lib/Headers/Makefile @@ -8,6 +8,10 @@ ##===----------------------------------------------------------------------===## LEVEL = ../../../.. + +BUILT_SOURCES = arm_neon.h.inc +TABLEGEN_INC_FILES_COMMON = 1 + include $(LEVEL)/Makefile.common CLANG_VERSION := $(shell cat $(PROJ_SRC_DIR)/../../VER) @@ -19,10 +23,14 @@ HEADERS := $(notdir $(wildcard $(PROJ_SRC_DIR)/*.h)) OBJHEADERS := $(addprefix $(HeaderDir)/, $(HEADERS)) -$(OBJHEADERS): $(HeaderDir)/%.h: $(PROJ_SRC_DIR)/%.h $(HeaderDir)/.dir +$(OBJHEADERS): $(HeaderDir)/%.h: $(PROJ_SRC_DIR)/%.h $(HeaderDir)/.dir $(HeaderDir)/arm_neon.h $(Verb) cp $< $@ $(Echo) Copying $(notdir $<) to build dir +$(HeaderDir)/arm_neon.h: $(BUILT_SOURCES) + $(Verb) cp $< $@ + $(Echo) Copying $(notdir $<) to build dir + # Hook into the standard Makefile rules. all-local:: $(OBJHEADERS) @@ -38,3 +46,7 @@ $(INSTHEADERS): $(PROJ_headers)/%.h: $(HeaderDir)/%.h | $(PROJ_headers) $(Echo) Installing compiler include file: $(notdir $<) install-local:: $(INSTHEADERS) + +$(ObjDir)/arm_neon.h.inc.tmp : arm_neon.td $(TBLGEN) $(ObjDir)/.dir + $(Echo) "Building Clang arm_neon.h.inc with tblgen" + $(Verb) $(TableGen) -gen-arm-neon-header -o $(call SYSPATH, $@) $< diff --git a/lib/Headers/arm_neon.h b/lib/Headers/arm_neon.h deleted file mode 100644 index 4508a27f36..0000000000 --- a/lib/Headers/arm_neon.h +++ /dev/null @@ -1,537 +0,0 @@ -/*===---- arm_neon.h - NEON intrinsics --------------------------------------=== - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __ARM_NEON_H -#define __ARM_NEON_H - -#ifndef __ARM_NEON__ -#error "NEON support not enabled" -#endif - -// NEON document appears to be specified in terms of stdint types. -#include <stdint.h> - -// Define some NEON-specific scalar types for floats and polynomials. -typedef float float32_t; -typedef uint8_t poly8_t; - -// FIXME: probably need a 'poly' attribute or something for correct codegen to -// disambiguate from uint16_t. 
-typedef uint16_t poly16_t; - -typedef __attribute__(( __vector_size__(8) )) int8_t __neon_int8x8_t; -typedef __attribute__(( __vector_size__(16) )) int8_t __neon_int8x16_t; -typedef __attribute__(( __vector_size__(8) )) int16_t __neon_int16x4_t; -typedef __attribute__(( __vector_size__(16) )) int16_t __neon_int16x8_t; -typedef __attribute__(( __vector_size__(8) )) int32_t __neon_int32x2_t; -typedef __attribute__(( __vector_size__(16) )) int32_t __neon_int32x4_t; -typedef __attribute__(( __vector_size__(8) )) int64_t __neon_int64x1_t; -typedef __attribute__(( __vector_size__(16) )) int64_t __neon_int64x2_t; -typedef __attribute__(( __vector_size__(8) )) uint8_t __neon_uint8x8_t; -typedef __attribute__(( __vector_size__(16) )) uint8_t __neon_uint8x16_t; -typedef __attribute__(( __vector_size__(8) )) uint16_t __neon_uint16x4_t; -typedef __attribute__(( __vector_size__(16) )) uint16_t __neon_uint16x8_t; -typedef __attribute__(( __vector_size__(8) )) uint32_t __neon_uint32x2_t; -typedef __attribute__(( __vector_size__(16) )) uint32_t __neon_uint32x4_t; -typedef __attribute__(( __vector_size__(8) )) uint64_t __neon_uint64x1_t; -typedef __attribute__(( __vector_size__(16) )) uint64_t __neon_uint64x2_t; -typedef __attribute__(( __vector_size__(8) )) uint16_t __neon_float16x4_t; -typedef __attribute__(( __vector_size__(16) )) uint16_t __neon_float16x8_t; -typedef __attribute__(( __vector_size__(8) )) float32_t __neon_float32x2_t; -typedef __attribute__(( __vector_size__(16) )) float32_t __neon_float32x4_t; -typedef __attribute__(( __vector_size__(8) )) poly8_t __neon_poly8x8_t; -typedef __attribute__(( __vector_size__(16) )) poly8_t __neon_poly8x16_t; -typedef __attribute__(( __vector_size__(8) )) poly16_t __neon_poly16x4_t; -typedef __attribute__(( __vector_size__(16) )) poly16_t __neon_poly16x8_t; - -typedef struct __int8x8_t { - __neon_int8x8_t val; -} int8x8_t; - -typedef struct __int8x16_t { - __neon_int8x16_t val; -} int8x16_t; - -typedef struct __int16x4_t { - __neon_int16x4_t val; -} int16x4_t; - -typedef struct __int16x8_t { - __neon_int16x8_t val; -} int16x8_t; - -typedef struct __int32x2_t { - __neon_int32x2_t val; -} int32x2_t; - -typedef struct __int32x4_t { - __neon_int32x4_t val; -} int32x4_t; - -typedef struct __int64x1_t { - __neon_int64x1_t val; -} int64x1_t; - -typedef struct __int64x2_t { - __neon_int64x2_t val; -} int64x2_t; - -typedef struct __uint8x8_t { - __neon_uint8x8_t val; -} uint8x8_t; - -typedef struct __uint8x16_t { - __neon_uint8x16_t val; -} uint8x16_t; - -typedef struct __uint16x4_t { - __neon_uint16x4_t val; -} uint16x4_t; - -typedef struct __uint16x8_t { - __neon_uint16x8_t val; -} uint16x8_t; - -typedef struct __uint32x2_t { - __neon_uint32x2_t val; -} uint32x2_t; - -typedef struct __uint32x4_t { - __neon_uint32x4_t val; -} uint32x4_t; - -typedef struct __uint64x1_t { - __neon_uint64x1_t val; -} uint64x1_t; - -typedef struct __uint64x2_t { - __neon_uint64x2_t val; -} uint64x2_t; - -typedef struct __float16x4_t { - __neon_float16x4_t val; -} float16x4_t; - -typedef struct __float16x8_t { - __neon_float16x8_t val; -} float16x8_t; - -typedef struct __float32x2_t { - __neon_float32x2_t val; -} float32x2_t; - -typedef struct __float32x4_t { - __neon_float32x4_t val; -} float32x4_t; - -typedef struct __poly8x8_t { - __neon_poly8x8_t val; -} poly8x8_t; - -typedef struct __poly8x16_t { - __neon_poly8x16_t val; -} poly8x16_t; - -typedef struct __poly16x4_t { - __neon_poly16x4_t val; -} poly16x4_t; - -typedef struct __poly16x8_t { - __neon_poly16x8_t val; -} poly16x8_t; 
- -// FIXME: write tool to stamp out the structure-of-array types, possibly gen this whole file. - -// Intrinsics, per ARM document DUI0348B -#define __ai static __attribute__((__always_inline__)) - -#define INTTYPES_WIDE(op, builtin) \ - __ai int16x8_t op##_s8(int16x8_t a, int8x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##_s16(int32x4_t a, int16x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##_s32(int64x2_t a, int32x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##_u8(uint16x8_t a, uint8x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##_u16(uint32x4_t a, uint16x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##_u32(uint64x2_t a, uint32x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_WIDENING(op, builtin) \ - __ai int16x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_WIDENING_MUL(op, builtin) \ - __ai int16x8_t op##_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return (int16x8_t){ builtin(a.val, b.val, c.val) }; } \ - __ai int32x4_t op##_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return (int32x4_t){ builtin(a.val, b.val, c.val) }; } \ - __ai int64x2_t op##_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return (int64x2_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint16x8_t op##_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return (uint16x8_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint32x4_t op##_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return (uint32x4_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint64x2_t op##_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return (uint64x2_t){ builtin(a.val, b.val, c.val) }; } - -#define INTTYPES_NARROWING(op, builtin) \ - __ai int8x8_t op##_s16(int16x8_t a, int16x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s32(int32x4_t a, int32x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s64(int64x2_t a, int64x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u16(uint16x8_t a, uint16x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u32(uint32x4_t a, uint32x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai uint32x2_t op##_u64(uint64x2_t a, uint64x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_ADD_32(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai 
uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai int8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){ builtin(a.val, b.val) }; } \ - __ai int16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_ADD_64(op, builtin) \ - __ai int64x1_t op##_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){ builtin(a.val, b.val) }; } \ - __ai uint64x1_t op##_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##q_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##q_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define FLOATTYPES_CMP(op, builtin) \ - __ai uint32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } - -#define INT_FLOAT_CMP_OP(op, cc) \ - __ai uint8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (uint8x8_t){(__neon_uint8x8_t)(a.val cc b.val)}; } \ - __ai uint16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (uint16x4_t){(__neon_uint16x4_t)(a.val cc b.val)}; } \ - __ai uint32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (uint32x2_t){(__neon_uint32x2_t)(a.val cc b.val)}; } \ - __ai uint32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (uint32x2_t){(__neon_uint32x2_t)(a.val cc b.val)}; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){a.val cc b.val}; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){a.val cc b.val}; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){a.val cc b.val}; } \ - __ai uint8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (uint8x16_t){(__neon_uint8x16_t)(a.val cc b.val)}; } \ - __ai uint16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (uint16x8_t){(__neon_uint16x8_t)(a.val cc b.val)}; } \ - __ai uint32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (uint32x4_t){(__neon_uint32x4_t)(a.val cc b.val)}; } \ - __ai uint32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (uint32x4_t){(__neon_uint32x4_t)(a.val cc b.val)}; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){a.val cc b.val}; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){a.val cc b.val}; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){a.val cc b.val}; } - -#define INT_UNARY(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a) { return (int8x8_t){ builtin(a.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a) { return (int16x4_t){ builtin(a.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a) { return (int32x2_t){ builtin(a.val) }; } \ - __ai int8x16_t op##q_s8(int8x16_t a) { return (int8x16_t){ builtin(a.val) }; } \ - __ai int16x8_t op##q_s16(int16x8_t a) { return (int16x8_t){ builtin(a.val) }; } \ - __ai int32x4_t op##q_s32(int32x4_t a) { return (int32x4_t){ builtin(a.val) }; } - -#define 
FP_UNARY(op, builtin) \ - __ai float32x2_t op##_f32(float32x2_t a) { return (float32x2_t){ builtin(a.val) }; } \ - __ai float32x4_t op##q_f32(float32x4_t a) { return (float32x4_t){ builtin(a.val) }; } - -#define FP_BINARY(op, builtin) \ - __ai float32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){ builtin(a.val, b.val) }; } \ - __ai float32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (float32x4_t){ builtin(a.val, b.val) }; } - -#define INT_FP_PAIRWISE_ADD(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai float32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){ builtin(a.val, b.val) }; } - -#define INT_LOGICAL_OP(op, lop) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ a.val lop b.val }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ a.val lop b.val }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ a.val lop b.val }; } \ - __ai int64x1_t op##_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){ a.val lop b.val }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ a.val lop b.val }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ a.val lop b.val }; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ a.val lop b.val }; } \ - __ai uint64x1_t op##_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){ a.val lop b.val }; } \ - __ai int8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){ a.val lop b.val }; } \ - __ai int16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){ a.val lop b.val }; } \ - __ai int32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){ a.val lop b.val }; } \ - __ai int64x2_t op##q_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){ a.val lop b.val }; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){ a.val lop b.val }; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){ a.val lop b.val }; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){ a.val lop b.val }; } \ - __ai uint64x2_t op##q_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){ a.val lop b.val }; } - -// vector add -__ai int8x8_t vadd_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){a.val + b.val}; } -__ai int16x4_t vadd_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){a.val + b.val}; } -__ai int32x2_t vadd_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){a.val + b.val}; } -__ai int64x1_t vadd_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){a.val + b.val}; } -__ai float32x2_t vadd_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){a.val + b.val}; } -__ai uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){a.val + b.val}; } -__ai uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){a.val + b.val}; } -__ai uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b) 
{ return (uint32x2_t){a.val + b.val}; } -__ai uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){a.val + b.val}; } -__ai int8x16_t vaddq_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){a.val + b.val}; } -__ai int16x8_t vaddq_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){a.val + b.val}; } -__ai int32x4_t vaddq_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){a.val + b.val}; } -__ai int64x2_t vaddq_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){a.val + b.val}; } -__ai float32x4_t vaddq_f32(float32x4_t a, float32x4_t b) { return (float32x4_t){a.val + b.val}; } -__ai uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){a.val + b.val}; } -__ai uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){a.val + b.val}; } -__ai uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){a.val + b.val}; } -__ai uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){a.val + b.val}; } - -// vector long add -INTTYPES_WIDENING(vaddl, __builtin_neon_vaddl) - -// vector wide add -INTTYPES_WIDE(vaddw, __builtin_neon_vaddw) - -// halving add -// rounding halving add -INTTYPES_ADD_32(vhadd, __builtin_neon_vhadd) -INTTYPES_ADD_32(vrhadd, __builtin_neon_vrhadd) - -// saturating add -INTTYPES_ADD_32(vqadd, __builtin_neon_vqadd) -INTTYPES_ADD_64(vqadd, __builtin_neon_vqadd) - -// add high half -// rounding add high half -INTTYPES_NARROWING(vaddhn, __builtin_neon_vaddhn) -INTTYPES_NARROWING(vraddhn, __builtin_neon_vraddhn) - -// multiply -// mul-poly - -// multiple accumulate -// multiple subtract - -// multiple accumulate long -// multiple subtract long -INTTYPES_WIDENING_MUL(vmlal, __builtin_neon_vmlal) -INTTYPES_WIDENING_MUL(vmlsl, __builtin_neon_vmlsl) - -// saturating doubling multiply high -// saturating rounding doubling multiply high - -// saturating doubling multiply accumulate long -// saturating doubling multiply subtract long - -// long multiply -// long multiply-poly -INTTYPES_WIDENING(vmull, __builtin_neon_vmull) -__ai poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b) { return (poly16x8_t){ __builtin_neon_vmull(a.val, b.val) }; } - -// saturating doubling long multiply - -// subtract - -// long subtract -INTTYPES_WIDENING(vsubl, __builtin_neon_vsubl) - -// wide subtract -INTTYPES_WIDE(vsubw, __builtin_neon_vsubw) - -// saturating subtract -INTTYPES_ADD_32(vqsub, __builtin_neon_vqsub) -INTTYPES_ADD_64(vqsub, __builtin_neon_vqsub) - -// halving subtract -INTTYPES_ADD_32(vhsub, __builtin_neon_vhsub) - -// subtract high half -// rounding subtract high half -INTTYPES_NARROWING(vsubhn, __builtin_neon_vsubhn) -INTTYPES_NARROWING(vrsubhn, __builtin_neon_vrsubhn) - -// compare eq -// compare ge -// compare le -// compare gt -// compare lt -INT_FLOAT_CMP_OP(vceq, ==) -INT_FLOAT_CMP_OP(vcge, >=) -INT_FLOAT_CMP_OP(vcle, <=) -INT_FLOAT_CMP_OP(vcgt, >) -INT_FLOAT_CMP_OP(vclt, <) - -// compare eq-poly - -// compare abs ge -// compare abs le -// compare abs gt -// compare abs lt -FLOATTYPES_CMP(vcage, __builtin_neon_vcage) -FLOATTYPES_CMP(vcale, __builtin_neon_vcale) -FLOATTYPES_CMP(vcagt, __builtin_neon_vcagt) -FLOATTYPES_CMP(vcalt, __builtin_neon_vcalt) - -// test bits - -// abs diff -INTTYPES_ADD_32(vabd, __builtin_neon_vabd) -FP_BINARY(vabd, __builtin_neon_vabd) - -// abs diff long -INTTYPES_WIDENING(vabdl, __builtin_neon_vabdl) - -// abs diff accumulate -// abs diff accumulate long - -// max -// min -INTTYPES_ADD_32(vmax, __builtin_neon_vmax) -FP_BINARY(vmax, __builtin_neon_vmax) -INTTYPES_ADD_32(vmin, __builtin_neon_vmin) 
-FP_BINARY(vmin, __builtin_neon_vmin) - -// pairwise add -// pairwise max -// pairwise min -INT_FP_PAIRWISE_ADD(vpadd, __builtin_neon_vpadd) -INT_FP_PAIRWISE_ADD(vpmax, __builtin_neon_vpmax) -INT_FP_PAIRWISE_ADD(vpmin, __builtin_neon_vpmin) - -// long pairwise add -// long pairwise add accumulate - -// recip -// recip sqrt -FP_BINARY(vrecps, __builtin_neon_vrecps) -FP_BINARY(vrsqrts, __builtin_neon_vrsqrts) - -// shl by vec -// saturating shl by vec -// rounding shl by vec -// saturating rounding shl by vec - -// shr by constant -// shl by constant -// rounding shr by constant -// shr by constant and accumulate -// rounding shr by constant and accumulate -// saturating shl by constant -// s->u saturating shl by constant -// narrowing saturating shr by constant -// s->u narrowing saturating shr by constant -// s->u rounding narrowing saturating shr by constant -// narrowing saturating shr by constant -// rounding narrowing shr by constant -// rounding narrowing saturating shr by constant -// widening shl by constant - -// shr and insert -// shl and insert - -// loads and stores, single vector -// loads and stores, lane -// loads, dupe - -// loads and stores, arrays - -// vget,vgetq lane -// vset, vsetq lane - -// vcreate -// vdup, vdupq -// vmov, vmovq -// vdup_lane, vdupq_lane -// vcombine -// vget_high, vget_low - -// vcvt {u,s} <-> f, f <-> f16 -// narrow -// long move (unpack) -// saturating narrow -// saturating narrow s->u - -// table lookup -// extended table lookup - -// mla with scalar -// widening mla with scalar -// widening saturating doubling mla with scalar -// mls with scalar -// widening mls with scalar -// widening saturating doubling mls with scalar -// mul by scalar -// long mul with scalar -// long mul by scalar -// saturating doubling long mul with scalar -// saturating doubling long mul by scalar -// saturating doubling mul high with scalar -// saturating doubling mul high by scalar -// saturating rounding doubling mul high with scalar -// saturating rounding doubling mul high by scalar -// mla with scalar -// widening mla with sclar -// widening saturating doubling mla with scalar -// mls with scalar -// widening mls with scalar -// widening saturating doubling mls with scalar - -// extract - -// endian swap (vrev) - -// negate - -// abs -// saturating abs -// saturating negate -// count leading signs -INT_UNARY(vabs, __builtin_neon_vabs) -FP_UNARY(vabs, __builtin_neon_vabs) -INT_UNARY(vqabs, __builtin_neon_vqabs) -INT_UNARY(vqneg, __builtin_neon_vqneg) -INT_UNARY(vcls, __builtin_neon_vcls) - -// count leading zeroes -// popcount - -// recip_est -// recip_sqrt_est - -// not-poly -// not - -// and -// or -// xor -// andn -// orn -INT_LOGICAL_OP(vand, &) -INT_LOGICAL_OP(vorr, |) -INT_LOGICAL_OP(veor, ^) -INT_LOGICAL_OP(vbic, &~) -INT_LOGICAL_OP(vorn, |~) - -// bitselect - -// transpose elts -// interleave elts -// deinterleave elts - -// vreinterpret - -#endif /* __ARM_NEON_H */ diff --git a/lib/Headers/arm_neon.td b/lib/Headers/arm_neon.td new file mode 100644 index 0000000000..c227acbdc9 --- /dev/null +++ b/lib/Headers/arm_neon.td @@ -0,0 +1,286 @@ +//===--- arm_neon.td - ARM NEON compiler interface ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the TableGen definitions from which the ARM NEON header +// file will be generated. See ARM document DUI0348B. +// +//===----------------------------------------------------------------------===// + +class Op; + +def OP_NONE : Op; +def OP_ADD : Op; +def OP_SUB : Op; +def OP_MUL : Op; +def OP_MLA : Op; +def OP_MLS : Op; +def OP_EQ : Op; +def OP_GE : Op; +def OP_LE : Op; +def OP_GT : Op; +def OP_LT : Op; +def OP_NEG : Op; +def OP_NOT : Op; +def OP_AND : Op; +def OP_OR : Op; +def OP_XOR : Op; +def OP_ANDN : Op; +def OP_ORN : Op; + +class Inst <string p, string t, Op o = OP_NONE> { + string Prototype = p; + string Types = t; + Op Operand = o; +} + +// prototype: return (arg, arg, ...) +// v: void +// t: best-fit integer (int/poly args) +// x: unsigned integer (int/float args) +// f: float (int args) +// d: default +// w: double width elements, same num elts +// n: double width elements, half num elts +// i: constant int +// l: constant uint64 +// s: scalar of element type +// c: default elt width, double num elts + +// sizes: +// c: char +// s: short +// i: int +// l: long +// f: float +// h: half-float + +// size modifiers: +// U: unsigned +// Q: 128b +// P: polynomial + +//////////////////////////////////////////////////////////////////////////////// +// E.3.1 Addition +def VADD : Inst<"ddd", "csilfUcUsUiUlQcQsQiQlQfQUcQUsQUiQUl", OP_ADD>; +def VADDL : Inst<"wdd", "csiUcUsUi">; +def VADDw : Inst<"wwd", "csiUcUsUi">; +def VHADD : Inst<"ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VRHADD : Inst<"ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VQADD : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VADDHN : Inst<"dww", "csiUcUsUi">; +def VRADDHN : Inst<"dww", "csiUcUsUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.2 Multiplication +def VMUL : Inst<"ddd", "csifUcUsUiPcQcQsQiQfQUcQUsQUiQPc", OP_MUL>; +def VMLA : Inst<"dddd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MLA>; +def VMLAL : Inst<"wwdd", "csiUcUsUi">; +def VMLS : Inst<"dddd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MLS>; +def VMLSL : Inst<"wwdd", "csiUcUsUi">; +def VQDMULH : Inst<"ddd", "siQsQi">; +def VQRDMULH : Inst<"ddd", "siQsQi">; +def VQDMLAL : Inst<"wwdd", "si">; +def VQDMLSL : Inst<"wwdd", "si">; +def VMULL : Inst<"wdd", "csiUcUsUiPc">; +def VQDMULL : Inst<"wdd", "si">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.3 Subtraction +def VSUB : Inst<"ddd", "csilfUcUsUiUlQcQsQiQlQfQUcQUsQUiQUl", OP_SUB>; +def VSUBL : Inst<"wdd", "csiUcUsUi">; +def VSUBw : Inst<"wwd", "csiUcUsUi">; +def VQSUB : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VHSUB : Inst<"ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VSUBHN : Inst<"dww", "csiUcUsUi">; +def VRSUBHN : Inst<"dww", "csiUcUsUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.4 Comparison +def VCEQ : Inst<"xdd", "csifUcUsUiPcQcQsQiQfQUcQUsQUiQPc", OP_EQ>; +def VCGE : Inst<"xdd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_GE>; +def VCLE : Inst<"xdd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_LE>; +def VCGT : Inst<"xdd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_GT>; +def VCLT : Inst<"xdd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_LT>; +def VCAGE : Inst<"xdd", "fQf">; +def VCALE : Inst<"xdd", "fQf">; +def VCAGT : Inst<"xdd", "fQf">; +def VCALT : Inst<"xdd", "fQf">; +def VTST : Inst<"xdd", "csiUcUsUiPcQcQsQiQUcQUsQUiQPc">; + 
+//////////////////////////////////////////////////////////////////////////////// +// E.3.5 Absolute Difference +def VABD : Inst<"ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; +def VABDL : Inst<"wdd", "csiUcUsUi">; +def VABA : Inst<"dddd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VABAL : Inst<"wwdd", "csiUcUsUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.6 Max/Min +def VMAX : Inst<"ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; +def VMIN : Inst<"ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.7 Pairdise Addition +def VPADD : Inst<"ddd", "csiUcUsUif">; +def VPADDL : Inst<"nd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VPADAL : Inst<"nnd", "csiUcUsUiQcQsQiQUcQUsQUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.8-9 Folding Max/Min +def VPMAX : Inst<"ddd", "csiUcUsUif">; +def VPMIN : Inst<"ddd", "csiUcUsUif">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.10 Reciprocal/Sqrt +def VRECPS : Inst<"ddd", "fQf">; +def VRSQRTS : Inst<"ddd", "fQf">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.11 Shifts by signed variable +def VSHL : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHL : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSHL : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQRSHL : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.12 Shifts by constant +def VSHR_N : Inst<"ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VSHL_N : Inst<"ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSHR_N : Inst<"ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VSRA_N : Inst<"dddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSRA_N : Inst<"dddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHL_N : Inst<"ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHLU_N : Inst<"xdi", "csilQcQsQiQl">; +def VSHRN_N : Inst<"dwi", "csiUcUsUi">; +def VQSHRUN_N : Inst<"xwi", "csi">; +def VQRSHRUN_N : Inst<"xwi", "csi">; +def VQSHRN_N : Inst<"dwi", "csiUcUsUi">; +def VRSHRN_N : Inst<"dwi", "csiUcUsUi">; +def VQRSHRN_N : Inst<"dwi", "csiUcUsUi">; +def VSHLL_N : Inst<"wdi", "csiUcUsUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.13 Shifts with insert +def VSRI_N : Inst<"dddi", "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">; +def VSLI_N : Inst<"dddi", "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.14 Loads and stores of a single vector + +//////////////////////////////////////////////////////////////////////////////// +// E.3.15 Loads and stores of an N-element structure + +//////////////////////////////////////////////////////////////////////////////// +// E.3.16 Extract lanes from a vector +def VGET_LANE : Inst<"ddi", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.17 Set lanes within a vector +def VSET_LANE : Inst<"dddi", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.18 Initialize a vector from bit pattern +def VCREATE: Inst<"dl", "csihfUcUsUiUlPcPsl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.19 
Set all lanes to same value +def VDUP_N : Inst<"ds", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; +def VMOV_N : Inst<"ds", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.20 Combining vectors +def VCOMBINE : Inst<"cdd", "csilhfUcUsUiUlPcPs">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.21 Splitting vectors +def VGET_HIGH : Inst<"dc", "csilhfUcUsUiUlPcPs">; +def VGET_LOW : Inst<"dc", "csilhfUcUsUiUlPcPs">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.22 Converting vectors +def VCVT_S32 : Inst<"df", "iQi">; +def VCVT_U32 : Inst<"df", "UiQUi">; +def VCVT_F16 : Inst<"df", "h">; +def VCVT_N_S32 : Inst<"dfi", "iQi">; +def VCVT_N_U32 : Inst<"dfi", "UiQUi">; +def VCVT_F32 : Inst<"fd", "iUiQiQUih">; +def VCVT_N_F32 : Inst<"fdi", "iUiQiQUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.23-24 Table lookup, Extended table lookup +def VTBL1 : Inst<"ddt", "UccPc">; +def VTBL2 : Inst<"d2dt", "UccPc">; +def VTBL3 : Inst<"d3dt", "UccPc">; +def VTBL4 : Inst<"d4dt", "UccPc">; +def VTBX1 : Inst<"dddt", "UccPc">; +def VTBX2 : Inst<"dd2dt", "UccPc">; +def VTBX3 : Inst<"dd3dt", "UccPc">; +def VTBX4 : Inst<"dd4dt", "UccPc">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.25 Operations with a scalar value +def VMLA_LANE : Inst<"ddddi", "siUsUifQsQiQUsQUiQf">; +def VMLAL_LANE : Inst<"wwddi", "siUsUi">; +def VQDMLAL_LANE : Inst<"wwddi", "si">; +def VMLS_LANE : Inst<"ddddi", "siUsUifQsQiQUsQUiQf">; +def VMLSL_LANE : Inst<"wwddi", "siUsUi">; +def VQDMLSL_LANE : Inst<"wwddi", "si">; +def VMUL_N : Inst<"dds", "sifUsUiQsQiQfQUsQUi">; +def VMULL_N : Inst<"wds", "siUsUi">; +def VMULL_LANE : Inst<"wddi", "siUsUi">; +def VQDMULL_N : Inst<"wds", "si">; +def VQDMULL_LANE : Inst<"wddi", "si">; +def VQDMULH_N : Inst<"dds", "siQsQi">; +def VQDMULH_LANE : Inst<"dddi", "siQsQi">; +def VQRDMULH_N : Inst<"dds", "siQsQi">; +def VQRDMULH_LANE : Inst<"dddi", "siQsQi">; +def VMLA_N : Inst<"ddds", "siUsUifQsQiQUsQUiQf">; +def VQDMLAL_N : Inst<"wwds", "si">; +def VMLS_N : Inst<"ddds", "siUsUifQsQiQUsQUiQf">; +def VMLSL_N : Inst<"wwds", "siUsUi">; +def VQDMLSL_N : Inst<"wwds", "si">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.26 Vector Extract +def VEXT : Inst<"dddi", "cUcPcsUsPsiUilUlQcQUcQPcQsQUsQPsQiQUiQlQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.27 Reverse vector elements (sdap endianness) +def VREV64 : Inst<"dd", "csiUcUsUiPcPsfQcQsQiQUcQUsQUiQPcQPsQf">; +def VREV32 : Inst<"dd", "csUcUsPcQcQsQUcQUsQPc">; +def VREV16 : Inst<"dd", "cUcPcQcQUcQPc">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.28 Other single operand arithmetic +def VABS : Inst<"dd", "csifQcQsQiQf">; +def VQABS : Inst<"dd", "csiQcQsQi">; +def VNEG : Inst<"dd", "csifQcQsQiQf", OP_NEG>; +def VQNEG : Inst<"dd", "csiQcQsQi">; +def VCLS : Inst<"dd", "csiQcQsQi">; +def VCLZ : Inst<"dd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VCNT : Inst<"dd", "UccPcQUcQcQPc">; +def VRECPE : Inst<"dd", "fUiQfQUi">; +def VRSQRTE : Inst<"dd", "fUiQfQUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.29 Logical operations +def VMVN : Inst<"dd", "csiUcUsUiPcQcQsQiQUcQUsQUiQPc", OP_NOT>; +def VAND : Inst<"ddd", 
"csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_AND>; +def VORR : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_OR>; +def VEOR : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_XOR>; +def VBIC : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ANDN>; +def VORN : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ORN>; +def VBSL : Inst<"dxdd", "csilUcUsUiUlfPcPsQcQsQiQlQUcQUsQUiQUlQfQPcQPs">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.30 Transposition operations +def VTRN: Inst<"", "">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.31 Vector reinterpret cast operations |