diff options
-rw-r--r-- | lib/Headers/emmintrin.h | 47 | ||||
-rw-r--r-- | lib/Headers/mmintrin.h | 4 | ||||
-rw-r--r-- | lib/Headers/xmmintrin.h | 62 |
3 files changed, 67 insertions, 46 deletions
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h index c96000aa1b..1061bf3b79 100644 --- a/lib/Headers/emmintrin.h +++ b/lib/Headers/emmintrin.h @@ -40,7 +40,8 @@ typedef char __v16qi __attribute__((__vector_size__(16))); static inline __m128d __attribute__((__always_inline__, __nodebug__)) _mm_add_sd(__m128d a, __m128d b) { - return __builtin_ia32_addsd(a, b); + a[0] += b[0]; + return a; } static inline __m128d __attribute__((__always_inline__, __nodebug__)) @@ -52,7 +53,8 @@ _mm_add_pd(__m128d a, __m128d b) static inline __m128d __attribute__((__always_inline__, __nodebug__)) _mm_sub_sd(__m128d a, __m128d b) { - return __builtin_ia32_subsd(a, b); + a[0] -= b[0]; + return a; } static inline __m128d __attribute__((__always_inline__, __nodebug__)) @@ -64,7 +66,8 @@ _mm_sub_pd(__m128d a, __m128d b) static inline __m128d __attribute__((__always_inline__, __nodebug__)) _mm_mul_sd(__m128d a, __m128d b) { - return __builtin_ia32_mulsd(a, b); + a[0] *= b[0]; + return a; } static inline __m128d __attribute__((__always_inline__, __nodebug__)) @@ -76,7 +79,8 @@ _mm_mul_pd(__m128d a, __m128d b) static inline __m128d __attribute__((__always_inline__, __nodebug__)) _mm_div_sd(__m128d a, __m128d b) { - return __builtin_ia32_divsd(a, b); + a[0] /= b[0]; + return a; } static inline __m128d __attribute__((__always_inline__, __nodebug__)) @@ -125,25 +129,25 @@ _mm_max_pd(__m128d a, __m128d b) static inline __m128d __attribute__((__always_inline__, __nodebug__)) _mm_and_pd(__m128d a, __m128d b) { - return __builtin_ia32_andpd(a, b); + return (__m128d)((__v4si)a & (__v4si)b); } static inline __m128d __attribute__((__always_inline__, __nodebug__)) _mm_andnot_pd(__m128d a, __m128d b) { - return __builtin_ia32_andnpd(a, b); + return (__m128d)(~(__v4si)a & (__v4si)b); } static inline __m128d __attribute__((__always_inline__, __nodebug__)) _mm_or_pd(__m128d a, __m128d b) { - return __builtin_ia32_orpd(a, b); + return (__m128d)((__v4si)a | (__v4si)b); } static inline __m128d 
__attribute__((__always_inline__, __nodebug__)) _mm_xor_pd(__m128d a, __m128d b) { - return __builtin_ia32_xorpd(a, b); + return (__m128d)((__v4si)a ^ (__v4si)b); } static inline __m128d __attribute__((__always_inline__, __nodebug__)) @@ -383,7 +387,8 @@ _mm_cvtsd_si32(__m128d a) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_cvtsd_ss(__m128 a, __m128d b) { - return __builtin_ia32_cvtsd2ss(a, b); + a[0] = b[0]; + return a; } static inline __m128d __attribute__((__always_inline__, __nodebug__)) @@ -395,7 +400,8 @@ _mm_cvtsi32_sd(__m128d a, int b) static inline __m128d __attribute__((__always_inline__, __nodebug__)) _mm_cvtss_sd(__m128d a, __m128 b) { - return __builtin_ia32_cvtss2sd(a, b); + a[0] = b[0]; + return a; } static inline __m128i __attribute__((__always_inline__, __nodebug__)) @@ -407,7 +413,7 @@ _mm_cvttpd_epi32(__m128d a) static inline int __attribute__((__always_inline__, __nodebug__)) _mm_cvttsd_si32(__m128d a) { - return __builtin_ia32_cvttsd2si(a); + return a[0]; } static inline __m64 __attribute__((__always_inline__, __nodebug__)) @@ -747,25 +753,25 @@ _mm_subs_epu16(__m128i a, __m128i b) static inline __m128i __attribute__((__always_inline__, __nodebug__)) _mm_and_si128(__m128i a, __m128i b) { - return __builtin_ia32_pand128(a, b); + return a & b; } static inline __m128i __attribute__((__always_inline__, __nodebug__)) _mm_andnot_si128(__m128i a, __m128i b) { - return __builtin_ia32_pandn128(a, b); + return ~a & b; } static inline __m128i __attribute__((__always_inline__, __nodebug__)) _mm_or_si128(__m128i a, __m128i b) { - return __builtin_ia32_por128(a, b); + return a | b; } static inline __m128i __attribute__((__always_inline__, __nodebug__)) _mm_xor_si128(__m128i a, __m128i b) { - return __builtin_ia32_pxor128(a, b); + return a ^ b; } static inline __m128i __attribute__((__always_inline__, __nodebug__)) @@ -934,7 +940,8 @@ _mm_cmplt_epi32(__m128i a, __m128i b) static inline __m128d __attribute__((__always_inline__, 
__nodebug__)) _mm_cvtsi64_sd(__m128d a, long long b) { - return __builtin_ia32_cvtsi642sd(a, b); + a[0] = b; + return a; } static inline long long __attribute__((__always_inline__, __nodebug__)) @@ -946,7 +953,7 @@ _mm_cvtsd_si64(__m128d a) static inline long long __attribute__((__always_inline__, __nodebug__)) _mm_cvttsd_si64(__m128d a) { - return __builtin_ia32_cvttsd2si64(a); + return a[0]; } #endif @@ -1181,7 +1188,9 @@ _mm_extract_epi16(__m128i a, int imm) static inline __m128i __attribute__((__always_inline__, __nodebug__)) _mm_insert_epi16(__m128i a, int b, int imm) { - return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm); + __v8hi c = (__v8hi)a; + c[imm & 7] = b; + return c; } static inline int __attribute__((__always_inline__, __nodebug__)) @@ -1257,7 +1266,7 @@ _mm_movpi64_pi64(__m64 a) static inline __m128i __attribute__((__always_inline__, __nodebug__)) _mm_move_epi64(__m128i a) { - return (__m128i){ a[0], 0 }; + return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); } static inline __m128d __attribute__((__always_inline__, __nodebug__)) diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h index 339d21288b..8ea3c470ee 100644 --- a/lib/Headers/mmintrin.h +++ b/lib/Headers/mmintrin.h @@ -415,13 +415,13 @@ _mm_set1_pi32(int __i) static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_set1_pi16(short __s) { - return (__m64)(__v4hi){ __s }; + return (__m64)(__v4hi){ __s, __s, __s, __s }; } static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_set1_pi8(char __b) { - return (__m64)(__v8qi){ __b }; + return (__m64)(__v8qi){ __b, __b, __b, __b, __b, __b, __b, __b }; } static inline __m64 __attribute__((__always_inline__, __nodebug__)) diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h index c104f6301a..f896893173 100644 --- a/lib/Headers/xmmintrin.h +++ b/lib/Headers/xmmintrin.h @@ -38,7 +38,8 @@ typedef float __m128 __attribute__((__vector_size__(16))); static inline __m128 
__attribute__((__always_inline__, __nodebug__)) _mm_add_ss(__m128 a, __m128 b) { - return __builtin_ia32_addss(a, b); + a[0] += b[0]; + return a; } static inline __m128 __attribute__((__always_inline__, __nodebug__)) @@ -50,7 +51,8 @@ _mm_add_ps(__m128 a, __m128 b) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_sub_ss(__m128 a, __m128 b) { - return __builtin_ia32_subss(a, b); + a[0] -= b[0]; + return a; } static inline __m128 __attribute__((__always_inline__, __nodebug__)) @@ -62,7 +64,8 @@ _mm_sub_ps(__m128 a, __m128 b) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_mul_ss(__m128 a, __m128 b) { - return __builtin_ia32_mulss(a, b); + a[0] *= b[0]; + return a; } static inline __m128 __attribute__((__always_inline__, __nodebug__)) @@ -74,7 +77,8 @@ _mm_mul_ps(__m128 a, __m128 b) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_div_ss(__m128 a, __m128 b) { - return __builtin_ia32_divss(a, b); + a[0] /= b[0]; + return a; } static inline __m128 __attribute__((__always_inline__, __nodebug__)) @@ -146,25 +150,29 @@ _mm_max_ps(__m128 a, __m128 b) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_and_ps(__m128 a, __m128 b) { - return __builtin_ia32_andps(a, b); + typedef int __v4si __attribute__((__vector_size__(16))); + return (__m128)((__v4si)a & (__v4si)b); } static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_andnot_ps(__m128 a, __m128 b) { - return __builtin_ia32_andnps(a, b); + typedef int __v4si __attribute__((__vector_size__(16))); + return (__m128)(~(__v4si)a & (__v4si)b); } static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_or_ps(__m128 a, __m128 b) { - return __builtin_ia32_orps(a, b); + typedef int __v4si __attribute__((__vector_size__(16))); + return (__m128)((__v4si)a | (__v4si)b); } static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_xor_ps(__m128 a, __m128 b) { - return __builtin_ia32_xorps(a, b); 
+ typedef int __v4si __attribute__((__vector_size__(16))); + return (__m128)((__v4si)a ^ (__v4si)b); } static inline __m128 __attribute__((__always_inline__, __nodebug__)) @@ -389,12 +397,16 @@ _mm_cvtss_si32(__m128 a) return __builtin_ia32_cvtss2si(a); } +#ifdef __x86_64__ + static inline long long __attribute__((__always_inline__, __nodebug__)) _mm_cvtss_si64(__m128 a) { return __builtin_ia32_cvtss2si64(a); } +#endif + static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cvtps_pi32(__m128 a) { @@ -404,13 +416,13 @@ _mm_cvtps_pi32(__m128 a) static inline int __attribute__((__always_inline__, __nodebug__)) _mm_cvttss_si32(__m128 a) { - return __builtin_ia32_cvttss2si(a); + return a[0]; } static inline long long __attribute__((__always_inline__, __nodebug__)) _mm_cvttss_si64(__m128 a) { - return __builtin_ia32_cvttss2si64(a); + return a[0]; } static inline __m64 __attribute__((__always_inline__, __nodebug__)) @@ -422,7 +434,8 @@ _mm_cvttps_pi32(__m128 a) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_cvtsi32_ss(__m128 a, int b) { - return __builtin_ia32_cvtsi2ss(a, b); + a[0] = b; + return a; } #ifdef __x86_64__ @@ -430,7 +443,8 @@ _mm_cvtsi32_ss(__m128 a, int b) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_cvtsi64_ss(__m128 a, long long b) { - return __builtin_ia32_cvtsi642ss(a, b); + a[0] = b; + return a; } #endif @@ -456,6 +470,13 @@ _mm_loadh_pi(__m128 a, __m64 const *p) static inline __m128 __attribute__((__always_inline__, __nodebug__)) _mm_loadl_pi(__m128 a, __m64 const *p) { +#if 0 + // FIXME: This should work, but gives really crappy code at the moment + __m128 b; + b[0] = *(float*)p; + b[1] = *((float*)p+1); + return __builtin_shufflevector(a, b, 0, 1, 4, 5); +#endif return __builtin_ia32_loadlps(a, (__v2si *)p); } @@ -604,26 +625,17 @@ _mm_sfence(void) static inline int __attribute__((__always_inline__, __nodebug__)) _mm_extract_pi16(__m64 a, int n) { - /* FIXME: - * This should 
force n to be an immediate. - * This does not use the PEXTRW instruction. From looking at the LLVM source, the - instruction doesn't seem to be hooked up. - * The code could probably be made better :) - */ __v4hi b = (__v4hi)a; - return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))]; + return (unsigned short)b[n & 3]; } -/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to - the already existing __builtin_shufflevector. -*/ -/* static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_insert_pi16(__m64 a, int d, int n) { - return (__m64){ 0LL }; + __v4hi b = (__v4hi)a; + b[n & 3] = d; + return b; } -*/ static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_max_pi16(__m64 a, __m64 b) |