about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- lib/Headers/emmintrin.h 47
-rw-r--r-- lib/Headers/mmintrin.h 4
-rw-r--r-- lib/Headers/xmmintrin.h 62
3 files changed, 67 insertions, 46 deletions
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
index c96000aa1b..1061bf3b79 100644
--- a/lib/Headers/emmintrin.h
+++ b/lib/Headers/emmintrin.h
@@ -40,7 +40,8 @@ typedef char __v16qi __attribute__((__vector_size__(16)));
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d a, __m128d b)
{
- return __builtin_ia32_addsd(a, b);
+ a[0] += b[0]; /* add only element 0 of b into element 0 of the by-value copy of a */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
@@ -52,7 +53,8 @@ _mm_add_pd(__m128d a, __m128d b)
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d a, __m128d b)
{
- return __builtin_ia32_subsd(a, b);
+ a[0] -= b[0]; /* subtract element 0 of b from element 0 of the by-value copy of a */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
@@ -64,7 +66,8 @@ _mm_sub_pd(__m128d a, __m128d b)
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d a, __m128d b)
{
- return __builtin_ia32_mulsd(a, b);
+ a[0] *= b[0]; /* multiply element 0 of the by-value copy of a by element 0 of b */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
@@ -76,7 +79,8 @@ _mm_mul_pd(__m128d a, __m128d b)
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d a, __m128d b)
{
- return __builtin_ia32_divsd(a, b);
+ a[0] /= b[0]; /* divide element 0 of the by-value copy of a by element 0 of b */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
@@ -125,25 +129,25 @@ _mm_max_pd(__m128d a, __m128d b)
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d a, __m128d b)
{
- return __builtin_ia32_andpd(a, b);
+ return (__m128d)((__v4si)a & (__v4si)b); /* bitwise AND on the integer view of both operands; cast back to the declared __m128d return type (not __m128) */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d a, __m128d b)
{
- return __builtin_ia32_andnpd(a, b);
+ return (__m128d)(~(__v4si)a & (__v4si)b); /* (NOT a) AND b on the integer view; cast back to the declared __m128d return type (not __m128) */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d a, __m128d b)
{
- return __builtin_ia32_orpd(a, b);
+ return (__m128d)((__v4si)a | (__v4si)b); /* bitwise OR on the integer view of both operands; cast back to the declared __m128d return type (not __m128) */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d a, __m128d b)
{
- return __builtin_ia32_xorpd(a, b);
+ return (__m128d)((__v4si)a ^ (__v4si)b); /* bitwise XOR on the integer view of both operands; cast back to the declared __m128d return type (not __m128) */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
@@ -383,7 +387,8 @@ _mm_cvtsd_si32(__m128d a)
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 a, __m128d b)
{
- return __builtin_ia32_cvtsd2ss(a, b);
+ a[0] = b[0]; /* element 0 of b is converted to a's element type by plain assignment */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
@@ -395,7 +400,8 @@ _mm_cvtsi32_sd(__m128d a, int b)
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d a, __m128 b)
{
- return __builtin_ia32_cvtss2sd(a, b);
+ a[0] = b[0]; /* element 0 of b is converted to a's element type by plain assignment */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128i __attribute__((__always_inline__, __nodebug__))
@@ -407,7 +413,7 @@ _mm_cvttpd_epi32(__m128d a)
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d a)
{
- return __builtin_ia32_cvttsd2si(a);
+ return a[0]; /* element 0 converted to the int return type by ordinary C conversion. NOTE(review): results for values not representable in int may differ from the removed builtin - confirm */
}
static inline __m64 __attribute__((__always_inline__, __nodebug__))
@@ -747,25 +753,25 @@ _mm_subs_epu16(__m128i a, __m128i b)
static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i a, __m128i b)
{
- return __builtin_ia32_pand128(a, b);
+ return a & b; /* bitwise AND written with the vector operator instead of the builtin */
}
static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i a, __m128i b)
{
- return __builtin_ia32_pandn128(a, b);
+ return ~a & b; /* complement of a ANDed with b: the first operand is the one inverted */
}
static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i a, __m128i b)
{
- return __builtin_ia32_por128(a, b);
+ return a | b; /* bitwise OR written with the vector operator instead of the builtin */
}
static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i a, __m128i b)
{
- return __builtin_ia32_pxor128(a, b);
+ return a ^ b; /* bitwise XOR written with the vector operator instead of the builtin */
}
static inline __m128i __attribute__((__always_inline__, __nodebug__))
@@ -934,7 +940,8 @@ _mm_cmplt_epi32(__m128i a, __m128i b)
static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d a, long long b)
{
- return __builtin_ia32_cvtsi642sd(a, b);
+ a[0] = b; /* the long long is converted to a's element type by plain assignment into element 0 */
+ return a; /* all other elements of a are returned unchanged */
}
static inline long long __attribute__((__always_inline__, __nodebug__))
@@ -946,7 +953,7 @@ _mm_cvtsd_si64(__m128d a)
static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d a)
{
- return __builtin_ia32_cvttsd2si64(a);
+ return a[0]; /* element 0 converted to long long by ordinary C conversion. NOTE(review): results for values not representable in long long may differ from the removed builtin - confirm */
}
#endif
@@ -1181,7 +1188,9 @@ _mm_extract_epi16(__m128i a, int imm)
static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i a, int b, int imm)
{
- return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm);
+ __v8hi c = (__v8hi)a; /* view the 128-bit value as eight 16-bit elements */
+ c[imm & 7] = b; /* index masked to 0..7; b is narrowed to the element type on store */
+ return (__m128i)c; /* explicit cast back to the declared return type, as the removed line did */
}
static inline int __attribute__((__always_inline__, __nodebug__))
@@ -1257,7 +1266,7 @@ _mm_movpi64_pi64(__m64 a)
static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i a)
{
- return (__m128i){ a[0], 0 };
+ return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); /* index 0 picks element 0 of a; index 2 picks element 0 of the zero vector, so the upper element of the result is 0 */
}
static inline __m128d __attribute__((__always_inline__, __nodebug__))
diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h
index 339d21288b..8ea3c470ee 100644
--- a/lib/Headers/mmintrin.h
+++ b/lib/Headers/mmintrin.h
@@ -415,13 +415,13 @@ _mm_set1_pi32(int __i)
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_set1_pi16(short __s)
{
- return (__m64)(__v4hi){ __s };
+ return (__m64)(__v4hi){ __s, __s, __s, __s }; /* all four elements are set to __s; the removed initializer named only the first one */
}
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_set1_pi8(char __b)
{
- return (__m64)(__v8qi){ __b };
+ return (__m64)(__v8qi){ __b, __b, __b, __b, __b, __b, __b, __b }; /* all eight elements are set to __b; the removed initializer named only the first one */
}
static inline __m64 __attribute__((__always_inline__, __nodebug__))
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index c104f6301a..f896893173 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -38,7 +38,8 @@ typedef float __m128 __attribute__((__vector_size__(16)));
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
- return __builtin_ia32_addss(a, b);
+ a[0] += b[0]; /* add only element 0 of b into element 0 of the by-value copy of a */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128 __attribute__((__always_inline__, __nodebug__))
@@ -50,7 +51,8 @@ _mm_add_ps(__m128 a, __m128 b)
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
- return __builtin_ia32_subss(a, b);
+ a[0] -= b[0]; /* subtract element 0 of b from element 0 of the by-value copy of a */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128 __attribute__((__always_inline__, __nodebug__))
@@ -62,7 +64,8 @@ _mm_sub_ps(__m128 a, __m128 b)
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
- return __builtin_ia32_mulss(a, b);
+ a[0] *= b[0]; /* multiply element 0 of the by-value copy of a by element 0 of b */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128 __attribute__((__always_inline__, __nodebug__))
@@ -74,7 +77,8 @@ _mm_mul_ps(__m128 a, __m128 b)
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
- return __builtin_ia32_divss(a, b);
+ a[0] /= b[0]; /* divide element 0 of the by-value copy of a by element 0 of b */
+ return a; /* all other elements of a are returned unchanged */
}
static inline __m128 __attribute__((__always_inline__, __nodebug__))
@@ -146,25 +150,29 @@ _mm_max_ps(__m128 a, __m128 b)
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
- return __builtin_ia32_andps(a, b);
+ typedef int __v4si __attribute__((__vector_size__(16))); /* function-local integer-vector type used to apply a bitwise operator to the operands */
+ return (__m128)((__v4si)a & (__v4si)b); /* AND on the integer view, then cast back to __m128 */
}
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
- return __builtin_ia32_andnps(a, b);
+ typedef int __v4si __attribute__((__vector_size__(16))); /* function-local integer-vector type used to apply bitwise operators to the operands */
+ return (__m128)(~(__v4si)a & (__v4si)b); /* (NOT a) AND b on the integer view: only the first operand is inverted */
}
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
- return __builtin_ia32_orps(a, b);
+ typedef int __v4si __attribute__((__vector_size__(16))); /* function-local integer-vector type used to apply a bitwise operator to the operands */
+ return (__m128)((__v4si)a | (__v4si)b); /* OR on the integer view, then cast back to __m128 */
}
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
- return __builtin_ia32_xorps(a, b);
+ typedef int __v4si __attribute__((__vector_size__(16))); /* function-local integer-vector type used to apply a bitwise operator to the operands */
+ return (__m128)((__v4si)a ^ (__v4si)b); /* plain XOR of both operands on the integer view; neither operand is complemented */
}
static inline __m128 __attribute__((__always_inline__, __nodebug__))
@@ -389,12 +397,16 @@ _mm_cvtss_si32(__m128 a)
return __builtin_ia32_cvtss2si(a);
}
+#ifdef __x86_64__ /* the 64-bit-result conversion below is only compiled when __x86_64__ is defined */
+
static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
return __builtin_ia32_cvtss2si64(a);
}
+#endif /* __x86_64__ */
+
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
@@ -404,13 +416,13 @@ _mm_cvtps_pi32(__m128 a)
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
- return __builtin_ia32_cvttss2si(a);
+ return a[0]; /* float element 0 converted to the int return type by ordinary C conversion. NOTE(review): results for values not representable in int may differ from the removed builtin - confirm */
}
static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
- return __builtin_ia32_cvttss2si64(a);
+ return a[0]; /* float element 0 converted to long long by ordinary C conversion. NOTE(review): unlike _mm_cvtss_si64 and _mm_cvtsi64_ss this function is not inside an #ifdef __x86_64__ guard - confirm that is intended */
}
static inline __m64 __attribute__((__always_inline__, __nodebug__))
@@ -422,7 +434,8 @@ _mm_cvttps_pi32(__m128 a)
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
- return __builtin_ia32_cvtsi2ss(a, b);
+ a[0] = b; /* the int is converted to float by plain assignment into element 0 */
+ return a; /* all other elements of a are returned unchanged */
}
#ifdef __x86_64__
@@ -430,7 +443,8 @@ _mm_cvtsi32_ss(__m128 a, int b)
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
- return __builtin_ia32_cvtsi642ss(a, b);
+ a[0] = b; /* the long long is converted to float by plain assignment into element 0 */
+ return a; /* all other elements of a are returned unchanged */
}
#endif
@@ -456,6 +470,13 @@ _mm_loadh_pi(__m128 a, __m64 const *p)
static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, __m64 const *p)
{
+#if 0
+ // FIXME: This should work, but currently generates poor code, so it is compiled out and the builtin below is used instead.
+ __m128 b;
+ b[0] = *(float*)p;
+ b[1] = *((float*)p+1);
+ return __builtin_shufflevector(a, b, 0, 1, 4, 5); /* NOTE(review): indices 0,1,4,5 select a[0], a[1], b[0], b[1] - confirm this lane order matches the builtin before enabling */
+#endif /* 0 */
return __builtin_ia32_loadlps(a, (__v2si *)p);
}
@@ -604,26 +625,17 @@ _mm_sfence(void)
static inline int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
- /* FIXME:
- * This should force n to be an immediate.
- * This does not use the PEXTRW instruction. From looking at the LLVM source, the
- instruction doesn't seem to be hooked up.
- * The code could probably be made better :)
- */
__v4hi b = (__v4hi)a;
- return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))];
+ return (unsigned short)b[n & 3]; /* index masked to 0..3; the unsigned short cast makes the element zero-extend into the int result */
}
-/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to
- the already existing __builtin_shufflevector.
-*/
-/*
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
- return (__m64){ 0LL };
+ __v4hi b = (__v4hi)a; /* view the 64-bit value as four 16-bit elements */
+ b[n & 3] = d; /* index masked to 0..3; d is narrowed to the element type on store */
+ return (__m64)b; /* explicit cast back to the declared return type instead of relying on implicit vector conversion */
}
-*/
static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)