Diffstat (limited to 'lib')
-rw-r--r-- | lib/Headers/Makefile    |  14
-rw-r--r-- | lib/Headers/arm_neon.h  | 537
-rw-r--r-- | lib/Headers/arm_neon.td | 286
3 files changed, 299 insertions, 538 deletions
diff --git a/lib/Headers/Makefile b/lib/Headers/Makefile index cb36e84319..fea59d6f4b 100644 --- a/lib/Headers/Makefile +++ b/lib/Headers/Makefile @@ -8,6 +8,10 @@ ##===----------------------------------------------------------------------===## LEVEL = ../../../.. + +BUILT_SOURCES = arm_neon.h.inc +TABLEGEN_INC_FILES_COMMON = 1 + include $(LEVEL)/Makefile.common CLANG_VERSION := $(shell cat $(PROJ_SRC_DIR)/../../VER) @@ -19,10 +23,14 @@ HEADERS := $(notdir $(wildcard $(PROJ_SRC_DIR)/*.h)) OBJHEADERS := $(addprefix $(HeaderDir)/, $(HEADERS)) -$(OBJHEADERS): $(HeaderDir)/%.h: $(PROJ_SRC_DIR)/%.h $(HeaderDir)/.dir +$(OBJHEADERS): $(HeaderDir)/%.h: $(PROJ_SRC_DIR)/%.h $(HeaderDir)/.dir $(HeaderDir)/arm_neon.h $(Verb) cp $< $@ $(Echo) Copying $(notdir $<) to build dir +$(HeaderDir)/arm_neon.h: $(BUILT_SOURCES) + $(Verb) cp $< $@ + $(Echo) Copying $(notdir $<) to build dir + # Hook into the standard Makefile rules. all-local:: $(OBJHEADERS) @@ -38,3 +46,7 @@ $(INSTHEADERS): $(PROJ_headers)/%.h: $(HeaderDir)/%.h | $(PROJ_headers) $(Echo) Installing compiler include file: $(notdir $<) install-local:: $(INSTHEADERS) + +$(ObjDir)/arm_neon.h.inc.tmp : arm_neon.td $(TBLGEN) $(ObjDir)/.dir + $(Echo) "Building Clang arm_neon.h.inc with tblgen" + $(Verb) $(TableGen) -gen-arm-neon-header -o $(call SYSPATH, $@) $< diff --git a/lib/Headers/arm_neon.h b/lib/Headers/arm_neon.h deleted file mode 100644 index 4508a27f36..0000000000 --- a/lib/Headers/arm_neon.h +++ /dev/null @@ -1,537 +0,0 @@ -/*===---- arm_neon.h - NEON intrinsics --------------------------------------=== - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __ARM_NEON_H -#define __ARM_NEON_H - -#ifndef __ARM_NEON__ -#error "NEON support not enabled" -#endif - -// NEON document appears to be specified in terms of stdint types. -#include <stdint.h> - -// Define some NEON-specific scalar types for floats and polynomials. -typedef float float32_t; -typedef uint8_t poly8_t; - -// FIXME: probably need a 'poly' attribute or something for correct codegen to -// disambiguate from uint16_t. 
-typedef uint16_t poly16_t; - -typedef __attribute__(( __vector_size__(8) )) int8_t __neon_int8x8_t; -typedef __attribute__(( __vector_size__(16) )) int8_t __neon_int8x16_t; -typedef __attribute__(( __vector_size__(8) )) int16_t __neon_int16x4_t; -typedef __attribute__(( __vector_size__(16) )) int16_t __neon_int16x8_t; -typedef __attribute__(( __vector_size__(8) )) int32_t __neon_int32x2_t; -typedef __attribute__(( __vector_size__(16) )) int32_t __neon_int32x4_t; -typedef __attribute__(( __vector_size__(8) )) int64_t __neon_int64x1_t; -typedef __attribute__(( __vector_size__(16) )) int64_t __neon_int64x2_t; -typedef __attribute__(( __vector_size__(8) )) uint8_t __neon_uint8x8_t; -typedef __attribute__(( __vector_size__(16) )) uint8_t __neon_uint8x16_t; -typedef __attribute__(( __vector_size__(8) )) uint16_t __neon_uint16x4_t; -typedef __attribute__(( __vector_size__(16) )) uint16_t __neon_uint16x8_t; -typedef __attribute__(( __vector_size__(8) )) uint32_t __neon_uint32x2_t; -typedef __attribute__(( __vector_size__(16) )) uint32_t __neon_uint32x4_t; -typedef __attribute__(( __vector_size__(8) )) uint64_t __neon_uint64x1_t; -typedef __attribute__(( __vector_size__(16) )) uint64_t __neon_uint64x2_t; -typedef __attribute__(( __vector_size__(8) )) uint16_t __neon_float16x4_t; -typedef __attribute__(( __vector_size__(16) )) uint16_t __neon_float16x8_t; -typedef __attribute__(( __vector_size__(8) )) float32_t __neon_float32x2_t; -typedef __attribute__(( __vector_size__(16) )) float32_t __neon_float32x4_t; -typedef __attribute__(( __vector_size__(8) )) poly8_t __neon_poly8x8_t; -typedef __attribute__(( __vector_size__(16) )) poly8_t __neon_poly8x16_t; -typedef __attribute__(( __vector_size__(8) )) poly16_t __neon_poly16x4_t; -typedef __attribute__(( __vector_size__(16) )) poly16_t __neon_poly16x8_t; - -typedef struct __int8x8_t { - __neon_int8x8_t val; -} int8x8_t; - -typedef struct __int8x16_t { - __neon_int8x16_t val; -} int8x16_t; - -typedef struct __int16x4_t { - __neon_int16x4_t val; -} int16x4_t; - -typedef struct __int16x8_t { - __neon_int16x8_t val; -} int16x8_t; - -typedef struct __int32x2_t { - __neon_int32x2_t val; -} int32x2_t; - -typedef struct __int32x4_t { - __neon_int32x4_t val; -} int32x4_t; - -typedef struct __int64x1_t { - __neon_int64x1_t val; -} int64x1_t; - -typedef struct __int64x2_t { - __neon_int64x2_t val; -} int64x2_t; - -typedef struct __uint8x8_t { - __neon_uint8x8_t val; -} uint8x8_t; - -typedef struct __uint8x16_t { - __neon_uint8x16_t val; -} uint8x16_t; - -typedef struct __uint16x4_t { - __neon_uint16x4_t val; -} uint16x4_t; - -typedef struct __uint16x8_t { - __neon_uint16x8_t val; -} uint16x8_t; - -typedef struct __uint32x2_t { - __neon_uint32x2_t val; -} uint32x2_t; - -typedef struct __uint32x4_t { - __neon_uint32x4_t val; -} uint32x4_t; - -typedef struct __uint64x1_t { - __neon_uint64x1_t val; -} uint64x1_t; - -typedef struct __uint64x2_t { - __neon_uint64x2_t val; -} uint64x2_t; - -typedef struct __float16x4_t { - __neon_float16x4_t val; -} float16x4_t; - -typedef struct __float16x8_t { - __neon_float16x8_t val; -} float16x8_t; - -typedef struct __float32x2_t { - __neon_float32x2_t val; -} float32x2_t; - -typedef struct __float32x4_t { - __neon_float32x4_t val; -} float32x4_t; - -typedef struct __poly8x8_t { - __neon_poly8x8_t val; -} poly8x8_t; - -typedef struct __poly8x16_t { - __neon_poly8x16_t val; -} poly8x16_t; - -typedef struct __poly16x4_t { - __neon_poly16x4_t val; -} poly16x4_t; - -typedef struct __poly16x8_t { - __neon_poly16x8_t val; -} poly16x8_t; 
- -// FIXME: write tool to stamp out the structure-of-array types, possibly gen this whole file. - -// Intrinsics, per ARM document DUI0348B -#define __ai static __attribute__((__always_inline__)) - -#define INTTYPES_WIDE(op, builtin) \ - __ai int16x8_t op##_s8(int16x8_t a, int8x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##_s16(int32x4_t a, int16x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##_s32(int64x2_t a, int32x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##_u8(uint16x8_t a, uint8x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##_u16(uint32x4_t a, uint16x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##_u32(uint64x2_t a, uint32x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_WIDENING(op, builtin) \ - __ai int16x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_WIDENING_MUL(op, builtin) \ - __ai int16x8_t op##_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return (int16x8_t){ builtin(a.val, b.val, c.val) }; } \ - __ai int32x4_t op##_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return (int32x4_t){ builtin(a.val, b.val, c.val) }; } \ - __ai int64x2_t op##_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return (int64x2_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint16x8_t op##_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return (uint16x8_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint32x4_t op##_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return (uint32x4_t){ builtin(a.val, b.val, c.val) }; } \ - __ai uint64x2_t op##_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return (uint64x2_t){ builtin(a.val, b.val, c.val) }; } - -#define INTTYPES_NARROWING(op, builtin) \ - __ai int8x8_t op##_s16(int16x8_t a, int16x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s32(int32x4_t a, int32x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s64(int64x2_t a, int64x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u16(uint16x8_t a, uint16x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u32(uint32x4_t a, uint32x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai uint32x2_t op##_u64(uint64x2_t a, uint64x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_ADD_32(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai 
uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai int8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){ builtin(a.val, b.val) }; } \ - __ai int16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){ builtin(a.val, b.val) }; } \ - __ai int32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){ builtin(a.val, b.val) }; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){ builtin(a.val, b.val) }; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } - -#define INTTYPES_ADD_64(op, builtin) \ - __ai int64x1_t op##_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){ builtin(a.val, b.val) }; } \ - __ai uint64x1_t op##_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){ builtin(a.val, b.val) }; } \ - __ai int64x2_t op##q_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){ builtin(a.val, b.val) }; } \ - __ai uint64x2_t op##q_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){ builtin(a.val, b.val) }; } - -#define FLOATTYPES_CMP(op, builtin) \ - __ai uint32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (uint32x4_t){ builtin(a.val, b.val) }; } - -#define INT_FLOAT_CMP_OP(op, cc) \ - __ai uint8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (uint8x8_t){(__neon_uint8x8_t)(a.val cc b.val)}; } \ - __ai uint16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (uint16x4_t){(__neon_uint16x4_t)(a.val cc b.val)}; } \ - __ai uint32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (uint32x2_t){(__neon_uint32x2_t)(a.val cc b.val)}; } \ - __ai uint32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (uint32x2_t){(__neon_uint32x2_t)(a.val cc b.val)}; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){a.val cc b.val}; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){a.val cc b.val}; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){a.val cc b.val}; } \ - __ai uint8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (uint8x16_t){(__neon_uint8x16_t)(a.val cc b.val)}; } \ - __ai uint16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (uint16x8_t){(__neon_uint16x8_t)(a.val cc b.val)}; } \ - __ai uint32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (uint32x4_t){(__neon_uint32x4_t)(a.val cc b.val)}; } \ - __ai uint32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (uint32x4_t){(__neon_uint32x4_t)(a.val cc b.val)}; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){a.val cc b.val}; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){a.val cc b.val}; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){a.val cc b.val}; } - -#define INT_UNARY(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a) { return (int8x8_t){ builtin(a.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a) { return (int16x4_t){ builtin(a.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a) { return (int32x2_t){ builtin(a.val) }; } \ - __ai int8x16_t op##q_s8(int8x16_t a) { return (int8x16_t){ builtin(a.val) }; } \ - __ai int16x8_t op##q_s16(int16x8_t a) { return (int16x8_t){ builtin(a.val) }; } \ - __ai int32x4_t op##q_s32(int32x4_t a) { return (int32x4_t){ builtin(a.val) }; } - -#define 
FP_UNARY(op, builtin) \ - __ai float32x2_t op##_f32(float32x2_t a) { return (float32x2_t){ builtin(a.val) }; } \ - __ai float32x4_t op##q_f32(float32x4_t a) { return (float32x4_t){ builtin(a.val) }; } - -#define FP_BINARY(op, builtin) \ - __ai float32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){ builtin(a.val, b.val) }; } \ - __ai float32x4_t op##q_f32(float32x4_t a, float32x4_t b) { return (float32x4_t){ builtin(a.val, b.val) }; } - -#define INT_FP_PAIRWISE_ADD(op, builtin) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ builtin(a.val, b.val) }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ builtin(a.val, b.val) }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ builtin(a.val, b.val) }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ builtin(a.val, b.val) }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ builtin(a.val, b.val) }; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ builtin(a.val, b.val) }; } \ - __ai float32x2_t op##_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){ builtin(a.val, b.val) }; } - -#define INT_LOGICAL_OP(op, lop) \ - __ai int8x8_t op##_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){ a.val lop b.val }; } \ - __ai int16x4_t op##_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){ a.val lop b.val }; } \ - __ai int32x2_t op##_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){ a.val lop b.val }; } \ - __ai int64x1_t op##_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){ a.val lop b.val }; } \ - __ai uint8x8_t op##_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){ a.val lop b.val }; } \ - __ai uint16x4_t op##_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){ a.val lop b.val }; } \ - __ai uint32x2_t op##_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){ a.val lop b.val }; } \ - __ai uint64x1_t op##_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){ a.val lop b.val }; } \ - __ai int8x16_t op##q_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){ a.val lop b.val }; } \ - __ai int16x8_t op##q_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){ a.val lop b.val }; } \ - __ai int32x4_t op##q_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){ a.val lop b.val }; } \ - __ai int64x2_t op##q_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){ a.val lop b.val }; } \ - __ai uint8x16_t op##q_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){ a.val lop b.val }; } \ - __ai uint16x8_t op##q_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){ a.val lop b.val }; } \ - __ai uint32x4_t op##q_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){ a.val lop b.val }; } \ - __ai uint64x2_t op##q_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){ a.val lop b.val }; } - -// vector add -__ai int8x8_t vadd_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){a.val + b.val}; } -__ai int16x4_t vadd_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){a.val + b.val}; } -__ai int32x2_t vadd_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){a.val + b.val}; } -__ai int64x1_t vadd_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){a.val + b.val}; } -__ai float32x2_t vadd_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){a.val + b.val}; } -__ai uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){a.val + b.val}; } -__ai uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){a.val + b.val}; } -__ai uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b) 
{ return (uint32x2_t){a.val + b.val}; } -__ai uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){a.val + b.val}; } -__ai int8x16_t vaddq_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){a.val + b.val}; } -__ai int16x8_t vaddq_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){a.val + b.val}; } -__ai int32x4_t vaddq_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){a.val + b.val}; } -__ai int64x2_t vaddq_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){a.val + b.val}; } -__ai float32x4_t vaddq_f32(float32x4_t a, float32x4_t b) { return (float32x4_t){a.val + b.val}; } -__ai uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){a.val + b.val}; } -__ai uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){a.val + b.val}; } -__ai uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){a.val + b.val}; } -__ai uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){a.val + b.val}; } - -// vector long add -INTTYPES_WIDENING(vaddl, __builtin_neon_vaddl) - -// vector wide add -INTTYPES_WIDE(vaddw, __builtin_neon_vaddw) - -// halving add -// rounding halving add -INTTYPES_ADD_32(vhadd, __builtin_neon_vhadd) -INTTYPES_ADD_32(vrhadd, __builtin_neon_vrhadd) - -// saturating add -INTTYPES_ADD_32(vqadd, __builtin_neon_vqadd) -INTTYPES_ADD_64(vqadd, __builtin_neon_vqadd) - -// add high half -// rounding add high half -INTTYPES_NARROWING(vaddhn, __builtin_neon_vaddhn) -INTTYPES_NARROWING(vraddhn, __builtin_neon_vraddhn) - -// multiply -// mul-poly - -// multiple accumulate -// multiple subtract - -// multiple accumulate long -// multiple subtract long -INTTYPES_WIDENING_MUL(vmlal, __builtin_neon_vmlal) -INTTYPES_WIDENING_MUL(vmlsl, __builtin_neon_vmlsl) - -// saturating doubling multiply high -// saturating rounding doubling multiply high - -// saturating doubling multiply accumulate long -// saturating doubling multiply subtract long - -// long multiply -// long multiply-poly -INTTYPES_WIDENING(vmull, __builtin_neon_vmull) -__ai poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b) { return (poly16x8_t){ __builtin_neon_vmull(a.val, b.val) }; } - -// saturating doubling long multiply - -// subtract - -// long subtract -INTTYPES_WIDENING(vsubl, __builtin_neon_vsubl) - -// wide subtract -INTTYPES_WIDE(vsubw, __builtin_neon_vsubw) - -// saturating subtract -INTTYPES_ADD_32(vqsub, __builtin_neon_vqsub) -INTTYPES_ADD_64(vqsub, __builtin_neon_vqsub) - -// halving subtract -INTTYPES_ADD_32(vhsub, __builtin_neon_vhsub) - -// subtract high half -// rounding subtract high half -INTTYPES_NARROWING(vsubhn, __builtin_neon_vsubhn) -INTTYPES_NARROWING(vrsubhn, __builtin_neon_vrsubhn) - -// compare eq -// compare ge -// compare le -// compare gt -// compare lt -INT_FLOAT_CMP_OP(vceq, ==) -INT_FLOAT_CMP_OP(vcge, >=) -INT_FLOAT_CMP_OP(vcle, <=) -INT_FLOAT_CMP_OP(vcgt, >) -INT_FLOAT_CMP_OP(vclt, <) - -// compare eq-poly - -// compare abs ge -// compare abs le -// compare abs gt -// compare abs lt -FLOATTYPES_CMP(vcage, __builtin_neon_vcage) -FLOATTYPES_CMP(vcale, __builtin_neon_vcale) -FLOATTYPES_CMP(vcagt, __builtin_neon_vcagt) -FLOATTYPES_CMP(vcalt, __builtin_neon_vcalt) - -// test bits - -// abs diff -INTTYPES_ADD_32(vabd, __builtin_neon_vabd) -FP_BINARY(vabd, __builtin_neon_vabd) - -// abs diff long -INTTYPES_WIDENING(vabdl, __builtin_neon_vabdl) - -// abs diff accumulate -// abs diff accumulate long - -// max -// min -INTTYPES_ADD_32(vmax, __builtin_neon_vmax) -FP_BINARY(vmax, __builtin_neon_vmax) -INTTYPES_ADD_32(vmin, __builtin_neon_vmin) 
-FP_BINARY(vmin, __builtin_neon_vmin) - -// pairwise add -// pairwise max -// pairwise min -INT_FP_PAIRWISE_ADD(vpadd, __builtin_neon_vpadd) -INT_FP_PAIRWISE_ADD(vpmax, __builtin_neon_vpmax) -INT_FP_PAIRWISE_ADD(vpmin, __builtin_neon_vpmin) - -// long pairwise add -// long pairwise add accumulate - -// recip -// recip sqrt -FP_BINARY(vrecps, __builtin_neon_vrecps) -FP_BINARY(vrsqrts, __builtin_neon_vrsqrts) - -// shl by vec -// saturating shl by vec -// rounding shl by vec -// saturating rounding shl by vec - -// shr by constant -// shl by constant -// rounding shr by constant -// shr by constant and accumulate -// rounding shr by constant and accumulate -// saturating shl by constant -// s->u saturating shl by constant -// narrowing saturating shr by constant -// s->u narrowing saturating shr by constant -// s->u rounding narrowing saturating shr by constant -// narrowing saturating shr by constant -// rounding narrowing shr by constant -// rounding narrowing saturating shr by constant -// widening shl by constant - -// shr and insert -// shl and insert - -// loads and stores, single vector -// loads and stores, lane -// loads, dupe - -// loads and stores, arrays - -// vget,vgetq lane -// vset, vsetq lane - -// vcreate -// vdup, vdupq -// vmov, vmovq -// vdup_lane, vdupq_lane -// vcombine -// vget_high, vget_low - -// vcvt {u,s} <-> f, f <-> f16 -// narrow -// long move (unpack) -// saturating narrow -// saturating narrow s->u - -// table lookup -// extended table lookup - -// mla with scalar -// widening mla with scalar -// widening saturating doubling mla with scalar -// mls with scalar -// widening mls with scalar -// widening saturating doubling mls with scalar -// mul by scalar -// long mul with scalar -// long mul by scalar -// saturating doubling long mul with scalar -// saturating doubling long mul by scalar -// saturating doubling mul high with scalar -// saturating doubling mul high by scalar -// saturating rounding doubling mul high with scalar -// saturating rounding doubling mul high by scalar -// mla with scalar -// widening mla with sclar -// widening saturating doubling mla with scalar -// mls with scalar -// widening mls with scalar -// widening saturating doubling mls with scalar - -// extract - -// endian swap (vrev) - -// negate - -// abs -// saturating abs -// saturating negate -// count leading signs -INT_UNARY(vabs, __builtin_neon_vabs) -FP_UNARY(vabs, __builtin_neon_vabs) -INT_UNARY(vqabs, __builtin_neon_vqabs) -INT_UNARY(vqneg, __builtin_neon_vqneg) -INT_UNARY(vcls, __builtin_neon_vcls) - -// count leading zeroes -// popcount - -// recip_est -// recip_sqrt_est - -// not-poly -// not - -// and -// or -// xor -// andn -// orn -INT_LOGICAL_OP(vand, &) -INT_LOGICAL_OP(vorr, |) -INT_LOGICAL_OP(veor, ^) -INT_LOGICAL_OP(vbic, &~) -INT_LOGICAL_OP(vorn, |~) - -// bitselect - -// transpose elts -// interleave elts -// deinterleave elts - -// vreinterpret - -#endif /* __ARM_NEON_H */ diff --git a/lib/Headers/arm_neon.td b/lib/Headers/arm_neon.td new file mode 100644 index 0000000000..c227acbdc9 --- /dev/null +++ b/lib/Headers/arm_neon.td @@ -0,0 +1,286 @@ +//===--- arm_neon.td - ARM NEON compiler interface ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the TableGen definitions from which the ARM NEON header +// file will be generated. See ARM document DUI0348B. +// +//===----------------------------------------------------------------------===// + +class Op; + +def OP_NONE : Op; +def OP_ADD : Op; +def OP_SUB : Op; +def OP_MUL : Op; +def OP_MLA : Op; +def OP_MLS : Op; +def OP_EQ : Op; +def OP_GE : Op; +def OP_LE : Op; +def OP_GT : Op; +def OP_LT : Op; +def OP_NEG : Op; +def OP_NOT : Op; +def OP_AND : Op; +def OP_OR : Op; +def OP_XOR : Op; +def OP_ANDN : Op; +def OP_ORN : Op; + +class Inst <string p, string t, Op o = OP_NONE> { + string Prototype = p; + string Types = t; + Op Operand = o; +} + +// prototype: return (arg, arg, ...) +// v: void +// t: best-fit integer (int/poly args) +// x: unsigned integer (int/float args) +// f: float (int args) +// d: default +// w: double width elements, same num elts +// n: double width elements, half num elts +// i: constant int +// l: constant uint64 +// s: scalar of element type +// c: default elt width, double num elts + +// sizes: +// c: char +// s: short +// i: int +// l: long +// f: float +// h: half-float + +// size modifiers: +// U: unsigned +// Q: 128b +// P: polynomial + +//////////////////////////////////////////////////////////////////////////////// +// E.3.1 Addition +def VADD : Inst<"ddd", "csilfUcUsUiUlQcQsQiQlQfQUcQUsQUiQUl", OP_ADD>; +def VADDL : Inst<"wdd", "csiUcUsUi">; +def VADDw : Inst<"wwd", "csiUcUsUi">; +def VHADD : Inst<"ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VRHADD : Inst<"ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VQADD : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VADDHN : Inst<"dww", "csiUcUsUi">; +def VRADDHN : Inst<"dww", "csiUcUsUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.2 Multiplication +def VMUL : Inst<"ddd", "csifUcUsUiPcQcQsQiQfQUcQUsQUiQPc", OP_MUL>; +def VMLA : Inst<"dddd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MLA>; +def VMLAL : Inst<"wwdd", "csiUcUsUi">; +def VMLS : Inst<"dddd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MLS>; +def VMLSL : Inst<"wwdd", "csiUcUsUi">; +def VQDMULH : Inst<"ddd", "siQsQi">; +def VQRDMULH : Inst<"ddd", "siQsQi">; +def VQDMLAL : Inst<"wwdd", "si">; +def VQDMLSL : Inst<"wwdd", "si">; +def VMULL : Inst<"wdd", "csiUcUsUiPc">; +def VQDMULL : Inst<"wdd", "si">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.3 Subtraction +def VSUB : Inst<"ddd", "csilfUcUsUiUlQcQsQiQlQfQUcQUsQUiQUl", OP_SUB>; +def VSUBL : Inst<"wdd", "csiUcUsUi">; +def VSUBw : Inst<"wwd", "csiUcUsUi">; +def VQSUB : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VHSUB : Inst<"ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VSUBHN : Inst<"dww", "csiUcUsUi">; +def VRSUBHN : Inst<"dww", "csiUcUsUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.4 Comparison +def VCEQ : Inst<"xdd", "csifUcUsUiPcQcQsQiQfQUcQUsQUiQPc", OP_EQ>; +def VCGE : Inst<"xdd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_GE>; +def VCLE : Inst<"xdd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_LE>; +def VCGT : Inst<"xdd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_GT>; +def VCLT : Inst<"xdd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_LT>; +def VCAGE : Inst<"xdd", "fQf">; +def VCALE : Inst<"xdd", "fQf">; +def VCAGT : Inst<"xdd", "fQf">; +def VCALT : Inst<"xdd", "fQf">; +def VTST : Inst<"xdd", "csiUcUsUiPcQcQsQiQUcQUsQUiQPc">; + 
+//////////////////////////////////////////////////////////////////////////////// +// E.3.5 Absolute Difference +def VABD : Inst<"ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; +def VABDL : Inst<"wdd", "csiUcUsUi">; +def VABA : Inst<"dddd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VABAL : Inst<"wwdd", "csiUcUsUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.6 Max/Min +def VMAX : Inst<"ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; +def VMIN : Inst<"ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.7 Pairdise Addition +def VPADD : Inst<"ddd", "csiUcUsUif">; +def VPADDL : Inst<"nd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VPADAL : Inst<"nnd", "csiUcUsUiQcQsQiQUcQUsQUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.8-9 Folding Max/Min +def VPMAX : Inst<"ddd", "csiUcUsUif">; +def VPMIN : Inst<"ddd", "csiUcUsUif">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.10 Reciprocal/Sqrt +def VRECPS : Inst<"ddd", "fQf">; +def VRSQRTS : Inst<"ddd", "fQf">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.11 Shifts by signed variable +def VSHL : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHL : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSHL : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQRSHL : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.12 Shifts by constant +def VSHR_N : Inst<"ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VSHL_N : Inst<"ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSHR_N : Inst<"ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VSRA_N : Inst<"dddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSRA_N : Inst<"dddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHL_N : Inst<"ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHLU_N : Inst<"xdi", "csilQcQsQiQl">; +def VSHRN_N : Inst<"dwi", "csiUcUsUi">; +def VQSHRUN_N : Inst<"xwi", "csi">; +def VQRSHRUN_N : Inst<"xwi", "csi">; +def VQSHRN_N : Inst<"dwi", "csiUcUsUi">; +def VRSHRN_N : Inst<"dwi", "csiUcUsUi">; +def VQRSHRN_N : Inst<"dwi", "csiUcUsUi">; +def VSHLL_N : Inst<"wdi", "csiUcUsUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.13 Shifts with insert +def VSRI_N : Inst<"dddi", "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">; +def VSLI_N : Inst<"dddi", "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.14 Loads and stores of a single vector + +//////////////////////////////////////////////////////////////////////////////// +// E.3.15 Loads and stores of an N-element structure + +//////////////////////////////////////////////////////////////////////////////// +// E.3.16 Extract lanes from a vector +def VGET_LANE : Inst<"ddi", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.17 Set lanes within a vector +def VSET_LANE : Inst<"dddi", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.18 Initialize a vector from bit pattern +def VCREATE: Inst<"dl", "csihfUcUsUiUlPcPsl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.19 
Set all lanes to same value +def VDUP_N : Inst<"ds", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; +def VMOV_N : Inst<"ds", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.20 Combining vectors +def VCOMBINE : Inst<"cdd", "csilhfUcUsUiUlPcPs">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.21 Splitting vectors +def VGET_HIGH : Inst<"dc", "csilhfUcUsUiUlPcPs">; +def VGET_LOW : Inst<"dc", "csilhfUcUsUiUlPcPs">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.22 Converting vectors +def VCVT_S32 : Inst<"df", "iQi">; +def VCVT_U32 : Inst<"df", "UiQUi">; +def VCVT_F16 : Inst<"df", "h">; +def VCVT_N_S32 : Inst<"dfi", "iQi">; +def VCVT_N_U32 : Inst<"dfi", "UiQUi">; +def VCVT_F32 : Inst<"fd", "iUiQiQUih">; +def VCVT_N_F32 : Inst<"fdi", "iUiQiQUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.23-24 Table lookup, Extended table lookup +def VTBL1 : Inst<"ddt", "UccPc">; +def VTBL2 : Inst<"d2dt", "UccPc">; +def VTBL3 : Inst<"d3dt", "UccPc">; +def VTBL4 : Inst<"d4dt", "UccPc">; +def VTBX1 : Inst<"dddt", "UccPc">; +def VTBX2 : Inst<"dd2dt", "UccPc">; +def VTBX3 : Inst<"dd3dt", "UccPc">; +def VTBX4 : Inst<"dd4dt", "UccPc">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.25 Operations with a scalar value +def VMLA_LANE : Inst<"ddddi", "siUsUifQsQiQUsQUiQf">; +def VMLAL_LANE : Inst<"wwddi", "siUsUi">; +def VQDMLAL_LANE : Inst<"wwddi", "si">; +def VMLS_LANE : Inst<"ddddi", "siUsUifQsQiQUsQUiQf">; +def VMLSL_LANE : Inst<"wwddi", "siUsUi">; +def VQDMLSL_LANE : Inst<"wwddi", "si">; +def VMUL_N : Inst<"dds", "sifUsUiQsQiQfQUsQUi">; +def VMULL_N : Inst<"wds", "siUsUi">; +def VMULL_LANE : Inst<"wddi", "siUsUi">; +def VQDMULL_N : Inst<"wds", "si">; +def VQDMULL_LANE : Inst<"wddi", "si">; +def VQDMULH_N : Inst<"dds", "siQsQi">; +def VQDMULH_LANE : Inst<"dddi", "siQsQi">; +def VQRDMULH_N : Inst<"dds", "siQsQi">; +def VQRDMULH_LANE : Inst<"dddi", "siQsQi">; +def VMLA_N : Inst<"ddds", "siUsUifQsQiQUsQUiQf">; +def VQDMLAL_N : Inst<"wwds", "si">; +def VMLS_N : Inst<"ddds", "siUsUifQsQiQUsQUiQf">; +def VMLSL_N : Inst<"wwds", "siUsUi">; +def VQDMLSL_N : Inst<"wwds", "si">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.26 Vector Extract +def VEXT : Inst<"dddi", "cUcPcsUsPsiUilUlQcQUcQPcQsQUsQPsQiQUiQlQUl">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.27 Reverse vector elements (sdap endianness) +def VREV64 : Inst<"dd", "csiUcUsUiPcPsfQcQsQiQUcQUsQUiQPcQPsQf">; +def VREV32 : Inst<"dd", "csUcUsPcQcQsQUcQUsQPc">; +def VREV16 : Inst<"dd", "cUcPcQcQUcQPc">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.28 Other single operand arithmetic +def VABS : Inst<"dd", "csifQcQsQiQf">; +def VQABS : Inst<"dd", "csiQcQsQi">; +def VNEG : Inst<"dd", "csifQcQsQiQf", OP_NEG>; +def VQNEG : Inst<"dd", "csiQcQsQi">; +def VCLS : Inst<"dd", "csiQcQsQi">; +def VCLZ : Inst<"dd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VCNT : Inst<"dd", "UccPcQUcQcQPc">; +def VRECPE : Inst<"dd", "fUiQfQUi">; +def VRSQRTE : Inst<"dd", "fUiQfQUi">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.29 Logical operations +def VMVN : Inst<"dd", "csiUcUsUiPcQcQsQiQUcQUsQUiQPc", OP_NOT>; +def VAND : Inst<"ddd", 
"csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_AND>; +def VORR : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_OR>; +def VEOR : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_XOR>; +def VBIC : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ANDN>; +def VORN : Inst<"ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ORN>; +def VBSL : Inst<"dxdd", "csilUcUsUiUlfPcPsQcQsQiQlQUcQUsQUiQUlQfQPcQPs">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.30 Transposition operations +def VTRN: Inst<"", "">; + +//////////////////////////////////////////////////////////////////////////////// +// E.3.31 Vector reinterpret cast operations |