diff options
-rw-r--r-- | include/clang/Basic/BuiltinsX86.def | 4 | ||||
-rw-r--r-- | lib/Headers/avxintrin.h | 20 | ||||
-rw-r--r-- | test/CodeGen/avx-shuffle-builtins.c | 24 | ||||
-rw-r--r-- | test/CodeGen/builtins-x86.c | 4 |
4 files changed, 40 insertions, 12 deletions
diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
index dd0f3be920..7e592bc679 100644
--- a/include/clang/Basic/BuiltinsX86.def
+++ b/include/clang/Basic/BuiltinsX86.def
@@ -414,10 +414,6 @@ BUILTIN(__builtin_ia32_cvttps2dq256, "V8iV8f", "")
 BUILTIN(__builtin_ia32_vperm2f128_pd256, "V4dV4dV4dIc", "")
 BUILTIN(__builtin_ia32_vperm2f128_ps256, "V8fV8fV8fIc", "")
 BUILTIN(__builtin_ia32_vperm2f128_si256, "V8iV8iV8iIc", "")
-BUILTIN(__builtin_ia32_vpermilpd, "V2dV2dIc", "")
-BUILTIN(__builtin_ia32_vpermilps, "V4fV4fIc", "")
-BUILTIN(__builtin_ia32_vpermilpd256, "V4dV4dIc", "")
-BUILTIN(__builtin_ia32_vpermilps256, "V8fV8fIc", "")
 BUILTIN(__builtin_ia32_vinsertf128_pd256, "V4dV4dV2dIc", "")
 BUILTIN(__builtin_ia32_vinsertf128_ps256, "V8fV8fV4fIc", "")
 BUILTIN(__builtin_ia32_vinsertf128_si256, "V8iV8iV4iIc", "")
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index ce4b2264bf..0758a653bf 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -260,19 +260,31 @@ _mm256_permutevar_ps(__m256 a, __m256i c)
 
 #define _mm_permute_pd(A, C) __extension__ ({ \
   __m128d __A = (A); \
-  (__m128d)__builtin_ia32_vpermilpd((__v2df)__A, (C)); })
+  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1); })
 
 #define _mm256_permute_pd(A, C) __extension__ ({ \
   __m256d __A = (A); \
-  (__m256d)__builtin_ia32_vpermilpd256((__v4df)__A, (C)); })
+  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
+                                   (C) & 0x1, ((C) & 0x2) >> 1, \
+                                   2 + (((C) & 0x4) >> 2), \
+                                   2 + (((C) & 0x8) >> 3)); })
 
 #define _mm_permute_ps(A, C) __extension__ ({ \
   __m128 __A = (A); \
-  (__m128)__builtin_ia32_vpermilps((__v4sf)__A, (C)); })
+  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
+                                   (C) & 0x3, ((C) & 0xc) >> 2, \
+                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
 
 #define _mm256_permute_ps(A, C) __extension__ ({ \
   __m256 __A = (A); \
-  (__m256)__builtin_ia32_vpermilps256((__v8sf)__A, (C)); })
+  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
+                                   (C) & 0x3, ((C) & 0xc) >> 2, \
+                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
+                                   4 + (((C) & 0x03) >> 0), \
+                                   4 + (((C) & 0x0c) >> 2), \
+                                   4 + (((C) & 0x30) >> 4), \
+                                   4 + (((C) & 0xc0) >> 6)); })
 
 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
   __m256d __V1 = (V1); \
diff --git a/test/CodeGen/avx-shuffle-builtins.c b/test/CodeGen/avx-shuffle-builtins.c
index c11780a5e3..6b2b1b1b18 100644
--- a/test/CodeGen/avx-shuffle-builtins.c
+++ b/test/CodeGen/avx-shuffle-builtins.c
@@ -14,3 +14,27 @@ __m256 x(__m256 a, __m256 b) {
   // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
   return _mm256_shuffle_ps(a, b, 203);
 }
+
+__m128d test_mm_permute_pd(__m128d a) {
+  // Mask 1 swaps the two doubles: bit 0 picks element 1, bit 1 picks element 0
+  // CHECK: shufflevector{{.*}}<i32 1, i32 0>
+  return _mm_permute_pd(a, 1);
+}
+
+__m256d test_mm256_permute_pd(__m256d a) {
+  // Mask 5 swaps the doubles inside each 128-bit half (one selector bit per element)
+  // CHECK: shufflevector{{.*}}<i32 1, i32 0, i32 3, i32 2>
+  return _mm256_permute_pd(a, 5);
+}
+
+__m128 test_mm_permute_ps(__m128 a) {
+  // Mask 0x4e has a non-zero selector in bits 7:6, so every 2-bit field is exercised
+  // CHECK: shufflevector{{.*}}<i32 2, i32 3, i32 0, i32 1>
+  return _mm_permute_ps(a, 0x4e);
+}
+
+__m256 test_mm256_permute_ps(__m256 a) {
+  // Mask 0x1b reverses the four floats; the same selectors apply to each 128-bit half
+  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  return _mm256_permute_ps(a, 0x1b);
+}
diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c
index a8121526b6..acb5554db4 100644
--- a/test/CodeGen/builtins-x86.c
+++ b/test/CodeGen/builtins-x86.c
@@ -417,10 +417,6 @@ void f0() {
   tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7);
   tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7);
   tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7);
-  tmp_V2d = __builtin_ia32_vpermilpd(tmp_V2d, 0x7);
-  tmp_V4f = __builtin_ia32_vpermilps(tmp_V4f, 0x7);
-  tmp_V4d = __builtin_ia32_vpermilpd256(tmp_V4d, 0x7);
-  tmp_V8f = __builtin_ia32_vpermilps256(tmp_V8f, 0x7);
   tmp_V4d = __builtin_ia32_vinsertf128_pd256(tmp_V4d, tmp_V2d, 0x7);
   tmp_V8f = __builtin_ia32_vinsertf128_ps256(tmp_V8f, tmp_V4f, 0x7);
   tmp_V8i = __builtin_ia32_vinsertf128_si256(tmp_V8i, tmp_V4i, 0x7);