/*-
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

/*
 * This is the performance critical part of the scrypt key derivation
 * function [1], implemented using gcc vector extensions [2].  The parameters
 * are set to N = 1024, r = 1, p = 1, as used for the litecoin proof of
 * work [3].
 *
 * The drawback is that these extensions are only supported by gcc and a few
 * other compilers which try to be gcc-compatible (clang, path64, ...).
 *
 * The advantage is that this code works on any SIMD capable hardware
 * (x86 SSE2, PowerPC Altivec, Cell SPU, ARM NEON, ARM iWMMXt, ...) without
 * modifications when compiled with gcc 4.7.  Older compiler versions are
 * missing bits and pieces, but can still work for Altivec, SPU and SSE2
 * with a bit of intrinsic band-aid.
 *
 * 1. http://www.tarsnap.com/scrypt.html
 * 2. http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
 * 3. https://github.com/coblee/litecoin/wiki/Scrypt-proof-of-work
 */

#ifndef __SCRYPT_SIMD_HELPERS_H__
#define __SCRYPT_SIMD_HELPERS_H__

#include <stdint.h>
#include "sha256-helpers.h"

#if defined(__GNUC__) && \
	((defined(__SSE2__) || defined(__ALTIVEC__) || defined(__SPU__)) || \
	(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))

#define HAVE_SCRYPT_SIMD_HELPERS

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#ifdef __ALTIVEC__
#include <altivec.h>
#include <vec_types.h>
#endif

#ifdef __SPU__
#include <spu_intrinsics.h>
#endif

typedef uint32_t uint32x4 __attribute__ ((vector_size(16), aligned(16)));
typedef uint8_t uint8x16 __attribute__ ((vector_size(16), aligned(16)));
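/*
 * Illustrative sketch (not part of the original header): with gcc vector
 * extensions, ordinary C operators act element-wise on 'uint32x4' values,
 * which is what the salsa20/8 code below relies on.  The helper name
 * 'example_vector_ops' is hypothetical.
 */
static inline uint32x4
example_vector_ops(uint32x4 a, uint32x4 b)
{
	/* Four 32-bit additions and four XORs, each mapping to one SIMD op. */
	return (a + b) ^ (a - b);
}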
/*
 * Define two helper functions ('rol_32x4' and 'shuffle_32x4') to ensure
 * better support for old gcc versions and gcc-compatible compilers.
 */
static __attribute__((always_inline)) uint32x4
rol_32x4(uint32x4 a, uint32_t b)
{
#ifdef __ALTIVEC__
	return vec_rl(a, vec_splats(b));
#elif defined(__SPU__)
	return spu_rl(a, b);
#elif defined(__SSE2__)
	return (uint32x4)_mm_slli_epi32((__m128i)a, b) ^
	       (uint32x4)_mm_srli_epi32((__m128i)a, 32 - b);
#else
	return (a << b) ^ (a >> (32 - b));
#endif
}

#if defined(__clang__)
# define shuffle_32x4(a, p1, p2, p3, p4) \
	__builtin_shufflevector(a, a, p1, p2, p3, p4)
#elif defined(__SSE2__)
# define shuffle_32x4(a, p1, p2, p3, p4) \
	(uint32x4)_mm_shuffle_epi32((__m128i)a, _MM_SHUFFLE(p4, p3, p2, p1))
#else
static __attribute__((always_inline)) uint32x4
shuffle_32x4(uint32x4 a, const int p1, const int p2, const int p3, const int p4)
{
#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
	/* gcc 4.7 introduces '__builtin_shuffle' */
	const uint32x4 mask = { p1, p2, p3, p4 };
	return __builtin_shuffle(a, mask);
#elif defined(__SPU__)
	const uint8x16 mask = {
		p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
		p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
		p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
		p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
	};
	return spu_shuffle(a, a, mask);
#elif defined(__ALTIVEC__)
	const uint8x16 mask = {
		p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
		p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
		p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
		p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
	};
	return vec_perm(a, a, mask);
#else
# error Have no implementation for 'shuffle_32x4' inline function
#endif
}
#endif
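/*
 * Illustrative sketch (not part of the original header): 'rol_32x4' rotates
 * every 32-bit lane left by the same amount, and 'shuffle_32x4(a, 3, 0, 1, 2)'
 * yields { a[3], a[0], a[1], a[2] }.  The helper name 'example_rol_shuffle'
 * is hypothetical; it shows the pattern used in the salsa20/8 rounds below.
 */
static inline uint32x4
example_rol_shuffle(uint32x4 a, uint32x4 b)
{
	/* Rotate each 32-bit lane of (a + b) left by 7 bits... */
	uint32x4 t = rol_32x4(a + b, 7);
	/* ...then move lane 3 to position 0 and shift the other lanes up. */
	return shuffle_32x4(t, 3, 0, 1, 2);
}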
/*****************************************************************************/

static __attribute__((always_inline)) void
blkcpy128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
{
	D[0] = S[0]; D[1] = S[1]; D[2] = S[2]; D[3] = S[3];
	D[4] = S[4]; D[5] = S[5]; D[6] = S[6]; D[7] = S[7];
}

static __attribute__((always_inline)) void
blkxor128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
{
	D[0] ^= S[0]; D[1] ^= S[1]; D[2] ^= S[2]; D[3] ^= S[3];
	D[4] ^= S[4]; D[5] ^= S[5]; D[6] ^= S[6]; D[7] ^= S[7];
}

/**
 * salsa20_8(B):
 * Apply the salsa20/8 core to the provided block.
 */
static __attribute__((always_inline)) void
salsa20_8_xor(uint32x4 * __restrict B, const uint32x4 * __restrict Bx)
{
	uint32x4 X0, X1, X2, X3;
	int i;

	X0 = (B[0] ^= Bx[0]);
	X1 = (B[1] ^= Bx[1]);
	X2 = (B[2] ^= Bx[2]);
	X3 = (B[3] ^= Bx[3]);

	for (i = 0; i < 8; i += 2) {
		/* Operate on "columns". */
		X1 ^= rol_32x4(X0 + X3, 7);
		X2 ^= rol_32x4(X1 + X0, 9);
		X3 ^= rol_32x4(X2 + X1, 13);
		X0 ^= rol_32x4(X3 + X2, 18);

		/* Rearrange data. */
		X1 = shuffle_32x4(X1, 3, 0, 1, 2);
		X2 = shuffle_32x4(X2, 2, 3, 0, 1);
		X3 = shuffle_32x4(X3, 1, 2, 3, 0);

		/* Operate on "rows". */
		X3 ^= rol_32x4(X0 + X1, 7);
		X2 ^= rol_32x4(X3 + X0, 9);
		X1 ^= rol_32x4(X2 + X3, 13);
		X0 ^= rol_32x4(X1 + X2, 18);

		/* Rearrange data. */
		X1 = shuffle_32x4(X1, 1, 2, 3, 0);
		X2 = shuffle_32x4(X2, 2, 3, 0, 1);
		X3 = shuffle_32x4(X3, 3, 0, 1, 2);
	}

	B[0] += X0;
	B[1] += X1;
	B[2] += X2;
	B[3] += X3;
}

static __attribute__((always_inline)) void
salsa20_8_xor2(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
               uint32x4 * __restrict C, const uint32x4 * __restrict Cx)
{
	uint32x4 X0, X1, X2, X3;
	uint32x4 Y0, Y1, Y2, Y3;
	int i;

	X0 = (B[0] ^= Bx[0]);
	X1 = (B[1] ^= Bx[1]);
	X2 = (B[2] ^= Bx[2]);
	X3 = (B[3] ^= Bx[3]);
	Y0 = (C[0] ^= Cx[0]);
	Y1 = (C[1] ^= Cx[1]);
	Y2 = (C[2] ^= Cx[2]);
	Y3 = (C[3] ^= Cx[3]);

	for (i = 0; i < 8; i += 2) {
		/* Operate on "columns". */
		X1 ^= rol_32x4(X0 + X3, 7);
		Y1 ^= rol_32x4(Y0 + Y3, 7);
		X2 ^= rol_32x4(X1 + X0, 9);
		Y2 ^= rol_32x4(Y1 + Y0, 9);
		X3 ^= rol_32x4(X2 + X1, 13);
		Y3 ^= rol_32x4(Y2 + Y1, 13);
		X0 ^= rol_32x4(X3 + X2, 18);
		Y0 ^= rol_32x4(Y3 + Y2, 18);

		/* Rearrange data. */
		X1 = shuffle_32x4(X1, 3, 0, 1, 2);
		Y1 = shuffle_32x4(Y1, 3, 0, 1, 2);
		X2 = shuffle_32x4(X2, 2, 3, 0, 1);
		Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
		X3 = shuffle_32x4(X3, 1, 2, 3, 0);
		Y3 = shuffle_32x4(Y3, 1, 2, 3, 0);

		/* Operate on "rows". */
		X3 ^= rol_32x4(X0 + X1, 7);
		Y3 ^= rol_32x4(Y0 + Y1, 7);
		X2 ^= rol_32x4(X3 + X0, 9);
		Y2 ^= rol_32x4(Y3 + Y0, 9);
		X1 ^= rol_32x4(X2 + X3, 13);
		Y1 ^= rol_32x4(Y2 + Y3, 13);
		X0 ^= rol_32x4(X1 + X2, 18);
		Y0 ^= rol_32x4(Y1 + Y2, 18);

		/* Rearrange data. */
		X1 = shuffle_32x4(X1, 1, 2, 3, 0);
		Y1 = shuffle_32x4(Y1, 1, 2, 3, 0);
		X2 = shuffle_32x4(X2, 2, 3, 0, 1);
		Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
		X3 = shuffle_32x4(X3, 3, 0, 1, 2);
		Y3 = shuffle_32x4(Y3, 3, 0, 1, 2);
	}

	B[0] += X0;
	B[1] += X1;
	B[2] += X2;
	B[3] += X3;
	C[0] += Y0;
	C[1] += Y1;
	C[2] += Y2;
	C[3] += Y3;
}

/* Helps to prevent the violation of strict aliasing rules */
typedef union {
	uint32x4 q[8];
	uint32_t w[32];
} XY;

/**
 * The most performance critical part of scrypt (N = 1024, r = 1, p = 1).
 * Handles one hash at a time.  It is likely the best choice on systems
 * with small L1/L2 caches and slow memory.
 *
 * databuf - 128 bytes buffer for data input and output
 * scratch - temporary buffer, it must have a size of at
 *           least (128 + 128 * 1024) bytes
 *
 * All buffers must be aligned at a 64 byte boundary.
 */
static inline void
scrypt_simd_core1(uint32_t databuf[32], void * scratch)
{
	uint32_t * databufA = (uint32_t *)&databuf[0];
	XY * X = (XY *)((uintptr_t)scratch + 0);
	uint32x4 * V = (uint32x4 *)((uintptr_t)scratch + 128);
	int i, j;

	/* 1: X <-- B */
	for (i = 0; i < 16; i++) {
		X->w[i]      = databufA[i * 5 % 16];
		X->w[16 + i] = databufA[16 + (i * 5 % 16)];
	}

	/* 2: for i = 0 to N - 1 do */
	for (i = 0; i < 1024; i++) {
		blkcpy128(&V[i * 8], &X->q[0]);
		salsa20_8_xor(&X->q[0], &X->q[4]);
		salsa20_8_xor(&X->q[4], &X->q[0]);
	}

	/* 6: for i = 0 to N - 1 do */
	for (i = 0; i < 1024; i++) {
		j = X->w[16] & 1023; /* j <-- Integerify(X) mod N */
		blkxor128(X->q, &V[j * 8]);
		salsa20_8_xor(&X->q[0], &X->q[4]);
		salsa20_8_xor(&X->q[4], &X->q[0]);
	}

	/* 10: B' <-- X */
	for (i = 0; i < 16; i++) {
		databufA[i * 5 % 16]        = X->w[i];
		databufA[16 + (i * 5 % 16)] = X->w[16 + i];
	}
}
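/*
 * Usage sketch (illustrative, not part of the original interface; the buffer
 * names below are hypothetical).  databuf carries the 128-byte block B
 * produced by the first PBKDF2 pass of scrypt and receives the mixed block
 * back; scratch only needs to be large enough and 64-byte aligned:
 *
 *	static uint32_t databuf[32] __attribute__((aligned(64)));
 *	static uint8_t scratch[128 + 128 * 1024] __attribute__((aligned(64)));
 *
 *	... fill databuf with the 128-byte input block ...
 *	scrypt_simd_core1(databuf, scratch);
 *	... databuf now holds the 128-byte output block ...
 */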
/**
 * The most performance critical part of scrypt (N = 1024, r = 1, p = 1).
 * Handles two hashes at a time.  It is likely a better choice when the
 * instructions have high latencies, but it needs many registers and a
 * large L2 cache.
 *
 * databuf - two 128 bytes buffers for data input and output
 * scratch - temporary buffer, it must have a size of at
 *           least (2 * 128 + 2 * 128 * 1024) bytes
 *
 * All buffers must be aligned at a 64 byte boundary.
 */
static inline void
scrypt_simd_core2(uint32_t databuf[2 * 32], void * scratch)
{
	uint32_t * databufA = (uint32_t *)&databuf[0];
	uint32_t * databufB = (uint32_t *)&databuf[32];
	XY * XA = (XY *)((uintptr_t)scratch);
	XY * XB = (XY *)((uintptr_t)scratch + 128 + 128 * 1024);
	uint32x4 * VA = (uint32x4 *)((uintptr_t)XA + 128);
	uint32x4 * VB = (uint32x4 *)((uintptr_t)XB + 128);
	int i, jA, jB;

	/* 1: X <-- B */
	for (i = 0; i < 16; i++) {
		XA->w[i]      = databufA[i * 5 % 16];
		XA->w[16 + i] = databufA[16 + (i * 5 % 16)];
		XB->w[i]      = databufB[i * 5 % 16];
		XB->w[16 + i] = databufB[16 + (i * 5 % 16)];
	}

	/* 2: for i = 0 to N - 1 do */
	for (i = 0; i < 1024; i++) {
		blkcpy128(&VA[i * 8], &XA->q[0]);
		blkcpy128(&VB[i * 8], &XB->q[0]);
		salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
		salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
	}

	/* 6: for i = 0 to N - 1 do */
	for (i = 0; i < 1024; i++) {
		jA = XA->w[16] & 1023; /* j <-- Integerify(X) mod N */
		jB = XB->w[16] & 1023; /* j <-- Integerify(X) mod N */
		blkxor128(XA->q, &VA[jA * 8]);
		blkxor128(XB->q, &VB[jB * 8]);
		salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
		salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
	}

	/* 10: B' <-- X */
	for (i = 0; i < 16; i++) {
		databufA[i * 5 % 16]        = XA->w[i];
		databufA[16 + (i * 5 % 16)] = XA->w[16 + i];
		databufB[i * 5 % 16]        = XB->w[i];
		databufB[16 + (i * 5 % 16)] = XB->w[16 + i];
	}
}

#endif /* __GNUC__ && (__SSE2__ || __ALTIVEC__ || __SPU__ || gcc >= 4.7) */

#endif /* __SCRYPT_SIMD_HELPERS_H__ */
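/*
 * Usage sketch (illustrative, not part of the original interface; the buffer
 * names below are hypothetical).  scrypt_simd_core2 interleaves two
 * independent hashes, so it takes two 128-byte input/output blocks and a
 * scratch area twice as large as the single-hash variant:
 *
 *	static uint32_t databuf[2 * 32] __attribute__((aligned(64)));
 *	static uint8_t scratch[2 * 128 + 2 * 128 * 1024]
 *		__attribute__((aligned(64)));
 *
 *	... fill databuf[0..31] and databuf[32..63] with two input blocks ...
 *	scrypt_simd_core2(databuf, scratch);
 */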