author     Siarhei Siamashka <siarhei.siamashka@gmail.com>    2011-12-29 01:38:05 +0200
committer  Siarhei Siamashka <siarhei.siamashka@gmail.com>    2011-12-29 06:46:17 +0200
commit     7a9481d266b58bd02a7d8033f359ca344cfd65d6
tree       ecc2a56b02db52a0466be9958ab8faee1f0a2b3b
parent     a0139edc7e2c3aa8252405ceff8a23a065965716
Use gcc vector extensions for SIMD scrypt key derivation function
Starting with gcc 4.7, the code can be compiled for different architectures
from the same source (see the sketch below). The SSE2/Altivec/SPU targets
additionally have compatibility wrappers, which also allow the use of older
gcc versions.
Two hashes are processed at the same time, so a scratch buffer twice as
large is needed (~256 KiB instead of ~128 KiB).
Speedup on Cell PPU (32-bit), single thread, 3.2 GHz:
~0.58 khash/s -> ~1.79 khash/s (roughly a 3x improvement)
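For readers unfamiliar with gcc vector extensions, the minimal stand-alone sketch below illustrates the idea the new scrypt-simd-helpers.h header builds on: a 128-bit vector type declared with `__attribute__((vector_size(16)))` supports the ordinary C operators (+, ^, <<, >>) applied per lane, so one generic expression can compile to SSE2, AltiVec, SPU or NEON instructions depending on the target. The uint32x4 type and the rotate mirror the generic branch of the patch's rol_32x4 helper; the main function and the sample values are illustrative only, and the vector-by-scalar shifts need gcc 4.7 or a compatible compiler, as noted above.

```c
#include <stdint.h>
#include <stdio.h>

/* 4 x 32-bit lanes in one 128-bit register (gcc vector extension). */
typedef uint32_t uint32x4 __attribute__ ((vector_size(16), aligned(16)));

/* Per-lane rotate left, same as the generic branch of rol_32x4 in the
 * patch; the vector-by-scalar shifts require gcc >= 4.7 (or clang). */
static inline uint32x4 rol_32x4(uint32x4 a, uint32_t b)
{
    return (a << b) ^ (a >> (32 - b));
}

int main(void)
{
    uint32x4 x = { 1, 2, 3, 4 };   /* illustrative sample values  */
    uint32x4 y = rol_32x4(x, 7);   /* each lane rotated left by 7 */
    uint32x4 z = x + y;            /* element-wise addition       */

    printf("%u %u %u %u\n",
           (unsigned)z[0], (unsigned)z[1], (unsigned)z[2], (unsigned)z[3]);
    return 0;
}
```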
-rw-r--r--   Makefile.am            |   3
-rw-r--r--   cpu-miner.c            |   2
-rw-r--r--   scrypt-simd-helpers.h  | 366
-rw-r--r--   scrypt.c               | 113
4 files changed, 479 insertions, 5 deletions
diff --git a/Makefile.am b/Makefile.am
index 8065f2f..5623a59 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,7 +14,8 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES)
 bin_PROGRAMS = minerd
 
 minerd_SOURCES = elist.h miner.h compat.h \
-                 cpu-miner.c util.c scrypt.c sha256-helpers.h
+                 cpu-miner.c util.c scrypt.c sha256-helpers.h \
+                 scrypt-simd-helpers.h
 minerd_LDFLAGS = $(PTHREAD_FLAGS)
 minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@
 minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@
diff --git a/cpu-miner.c b/cpu-miner.c
index b3d1546..607c01d 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -531,7 +531,7 @@ static void *miner_thread(void *userdata)
 
         if (opt_algo == ALGO_SCRYPT) {
-            scratchbuf = malloc(131583);
+            scratchbuf = malloc(2 * 131583);
             max_nonce = 0xffff;
         }
 
diff --git a/scrypt-simd-helpers.h b/scrypt-simd-helpers.h
new file mode 100644
index 0000000..322d718
--- /dev/null
+++ b/scrypt-simd-helpers.h
@@ -0,0 +1,366 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+/*
+ * This is the performance critical part of scrypt key derivation function [1],
+ * implemented using gcc vector extensions [2]. The parameters are set
+ * to N = 1024, r = 1, p = 1 as used for litecoin proof of work [3].
+ *
+ * The drawback is that these extensions are only supported by gcc and a few
+ * other compilers, which are trying to be gcc-compatible (clang, path64, ...)
+ *
+ * The advantage is that this code works on any SIMD capable hardware
+ * (x86 SSE2, PowerPC Altivec, Cell SPU, ARM NEON, ARM iWMMXt, ...) without
+ * modifications when compiled with gcc 4.7. The older compiler versions are
+ * missing bits and pieces, but still can work for Altivec, SPU and SSE2 with
+ * a bit of intrinsic band aid.
+ *
+ * 1. http://www.tarsnap.com/scrypt.html
+ * 2. http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
+ * 3. https://github.com/coblee/litecoin/wiki/Scrypt-proof-of-work
+ */
+
+#ifndef __SCRYPT_SIMD_HELPERS_H__
+#define __SCRYPT_SIMD_HELPERS_H__
+
+#include <stdint.h>
+#include "sha256-helpers.h"
+
+#if defined(__GNUC__) && \
+    ((defined(__SSE2__) || defined(__ALTIVEC__) || defined(__SPU__)) || \
+    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+
+#define HAVE_SCRYPT_SIMD_HELPERS
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __ALTIVEC__
+#include <altivec.h>
+#include <vec_types.h>
+#endif
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+typedef uint32_t uint32x4 __attribute__ ((vector_size(16), aligned(16)));
+typedef uint8_t uint8x16 __attribute__ ((vector_size(16), aligned(16)));
+
+/*
+ * Define two helper functions ('rol_32x4' and 'shuffle_32x4') to ensure
+ * better support for old gcc versions and gcc-compatible compilers
+ */
+static __attribute__((always_inline)) uint32x4
+rol_32x4(uint32x4 a, uint32_t b)
+{
+#ifdef __ALTIVEC__
+    return vec_rl(a, vec_splats(b));
+#elif defined(__SPU__)
+    return spu_rl(a, b);
+#elif defined(__SSE2__)
+    return (uint32x4)_mm_slli_epi32((__m128i)a, b) ^
+           (uint32x4)_mm_srli_epi32((__m128i)a, 32 - b);
+#else
+    return (a << b) ^ (a >> (32 - b));
+#endif
+}
+
+#if defined(__clang__)
+# define shuffle_32x4(a, p1, p2, p3, p4) \
+    __builtin_shufflevector(a, a, p1, p2, p3, p4)
+#elif defined(__SSE2__)
+# define shuffle_32x4(a, p1, p2, p3, p4) \
+    (uint32x4)_mm_shuffle_epi32((__m128i)a, _MM_SHUFFLE(p4, p3, p2, p1))
+#else
+static __attribute__((always_inline)) uint32x4
+shuffle_32x4(uint32x4 a, const int p1, const int p2, const int p3, const int p4)
+{
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+    /* gcc 4.7 introduces '__builtin_shuffle' */
+    const uint32x4 mask = { p1, p2, p3, p4 };
+    return __builtin_shuffle(a, mask);
+#elif defined(__SPU__)
+    const uint8x16 mask = {
+        p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
+        p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
+        p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
+        p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
+    };
+    return spu_shuffle(a, a, mask);
+#elif defined(__ALTIVEC__)
+    const uint8x16 mask = {
+        p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
+        p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
+        p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
+        p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
+    };
+    return vec_perm(a, a, mask);
+#else
+# error Have no implementation for 'shuffle_32x4' inline function
+#endif
+}
+#endif
+
+/*****************************************************************************/
+
+static __attribute__((always_inline)) void
+blkcpy128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
+{
+    D[0] = S[0]; D[1] = S[1]; D[2] = S[2]; D[3] = S[3];
+    D[4] = S[4]; D[5] = S[5]; D[6] = S[6]; D[7] = S[7];
+}
+
+static __attribute__((always_inline)) void
+blkxor128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
+{
+    D[0] ^= S[0]; D[1] ^= S[1]; D[2] ^= S[2]; D[3] ^= S[3];
+    D[4] ^= S[4]; D[5] ^= S[5]; D[6] ^= S[6]; D[7] ^= S[7];
+}
+
+/**
+ * salsa20_8(B):
+ * Apply the salsa20/8 core to the provided block.
+ */
+static __attribute__((always_inline)) void
+salsa20_8_xor(uint32x4 * __restrict B, const uint32x4 * __restrict Bx)
+{
+    uint32x4 X0, X1, X2, X3;
+    int i;
+
+    X0 = (B[0] ^= Bx[0]);
+    X1 = (B[1] ^= Bx[1]);
+    X2 = (B[2] ^= Bx[2]);
+    X3 = (B[3] ^= Bx[3]);
+
+    for (i = 0; i < 8; i += 2) {
+        /* Operate on "columns". */
+        X1 ^= rol_32x4(X0 + X3, 7);
+        X2 ^= rol_32x4(X1 + X0, 9);
+        X3 ^= rol_32x4(X2 + X1, 13);
+        X0 ^= rol_32x4(X3 + X2, 18);
+
+        /* Rearrange data. */
+        X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+        X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+        X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+
+        /* Operate on "rows". */
+        X3 ^= rol_32x4(X0 + X1, 7);
+        X2 ^= rol_32x4(X3 + X0, 9);
+        X1 ^= rol_32x4(X2 + X3, 13);
+        X0 ^= rol_32x4(X1 + X2, 18);
+
+        /* Rearrange data. */
+        X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+        X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+        X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+    }
+
+    B[0] += X0;
+    B[1] += X1;
+    B[2] += X2;
+    B[3] += X3;
+}
+
+static __attribute__((always_inline)) void
+salsa20_8_xor2(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
+               uint32x4 * __restrict C, const uint32x4 * __restrict Cx)
+{
+    uint32x4 X0, X1, X2, X3;
+    uint32x4 Y0, Y1, Y2, Y3;
+    int i;
+
+    X0 = (B[0] ^= Bx[0]);
+    X1 = (B[1] ^= Bx[1]);
+    X2 = (B[2] ^= Bx[2]);
+    X3 = (B[3] ^= Bx[3]);
+    Y0 = (C[0] ^= Cx[0]);
+    Y1 = (C[1] ^= Cx[1]);
+    Y2 = (C[2] ^= Cx[2]);
+    Y3 = (C[3] ^= Cx[3]);
+
+    for (i = 0; i < 8; i += 2) {
+        /* Operate on "columns". */
+        X1 ^= rol_32x4(X0 + X3, 7);
+        Y1 ^= rol_32x4(Y0 + Y3, 7);
+        X2 ^= rol_32x4(X1 + X0, 9);
+        Y2 ^= rol_32x4(Y1 + Y0, 9);
+        X3 ^= rol_32x4(X2 + X1, 13);
+        Y3 ^= rol_32x4(Y2 + Y1, 13);
+        X0 ^= rol_32x4(X3 + X2, 18);
+        Y0 ^= rol_32x4(Y3 + Y2, 18);
+
+        /* Rearrange data. */
+        X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+        Y1 = shuffle_32x4(Y1, 3, 0, 1, 2);
+        X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+        Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+        X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+        Y3 = shuffle_32x4(Y3, 1, 2, 3, 0);
+
+        /* Operate on "rows". */
+        X3 ^= rol_32x4(X0 + X1, 7);
+        Y3 ^= rol_32x4(Y0 + Y1, 7);
+        X2 ^= rol_32x4(X3 + X0, 9);
+        Y2 ^= rol_32x4(Y3 + Y0, 9);
+        X1 ^= rol_32x4(X2 + X3, 13);
+        Y1 ^= rol_32x4(Y2 + Y3, 13);
+        X0 ^= rol_32x4(X1 + X2, 18);
+        Y0 ^= rol_32x4(Y1 + Y2, 18);
+
+        /* Rearrange data. */
+        X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+        Y1 = shuffle_32x4(Y1, 1, 2, 3, 0);
+        X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+        Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+        X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+        Y3 = shuffle_32x4(Y3, 3, 0, 1, 2);
+    }
+
+    B[0] += X0;
+    B[1] += X1;
+    B[2] += X2;
+    B[3] += X3;
+    C[0] += Y0;
+    C[1] += Y1;
+    C[2] += Y2;
+    C[3] += Y3;
+}
+
+/* Helps to prevent the violation of strict aliasing rules */
+typedef union { uint32x4 q[8]; uint32_t w[32]; } XY;
+
+/**
+ * The most performance critical part of scrypt (N = 1024, r = 1, p = 1).
+ * Handles one hash at a time. Is likely the best choice when having
+ * small L1/L2 caches and slow memory.
+ *
+ * databuf - 128 bytes buffer for data input and output
+ * scratch - temporary buffer, it must have size at
+ *           least (128 + 128 * 1024) bytes
+ *
+ * All buffers must be aligned at 64 byte boundary.
+ */
+static inline
+void scrypt_simd_core1(uint8_t databuf[128], void * scratch)
+{
+    XY * X = (XY *)((uintptr_t)scratch + 0);
+    uint32x4 * V = (uint32x4 *)((uintptr_t)scratch + 128);
+    int i, j;
+
+    /* 1: X <-- B */
+    for (i = 0; i < 16; i++) {
+        X->w[i]      = le32dec(&databuf[(i * 5 % 16) * 4]);
+        X->w[16 + i] = le32dec(&databuf[(16 + (i * 5 % 16)) * 4]);
+    }
+
+    /* 2: for i = 0 to N - 1 do */
+    for (i = 0; i < 1024; i++) {
+        blkcpy128(&V[i * 8], &X->q[0]);
+        salsa20_8_xor(&X->q[0], &X->q[4]);
+        salsa20_8_xor(&X->q[4], &X->q[0]);
+    }
+
+    /* 6: for i = 0 to N - 1 do */
+    for (i = 0; i < 1024; i++) {
+        j = X->w[16] & 1023; /* j <-- Integerify(X) mod N */
+        blkxor128(X->q, &V[j * 8]);
+        salsa20_8_xor(&X->q[0], &X->q[4]);
+        salsa20_8_xor(&X->q[4], &X->q[0]);
+    }
+
+    /* 10: B' <-- X */
+    for (i = 0; i < 16; i++) {
+        le32enc(&databuf[(i * 5 % 16) * 4], X->w[i]);
+        le32enc(&databuf[(16 + (i * 5 % 16)) * 4], X->w[16 + i]);
+    }
+}
+
+/**
+ * The most performance critical part of scrypt (N = 1024, r = 1, p = 1)
+ * Handle two hashes at a time. Is likely a better choice when the
+ * instructions have high latencies, but needs many registers and
+ * large L2 cache.
+ *
+ * databuf - two 128 bytes buffer for data input and output
+ * scratch - temporary buffer, it must have size at
+ *           least (2 * 128 + 2 * 128 * 1024) bytes
+ *
+ * All buffers must be aligned at 64 byte boundary.
+ */
+static inline
+void scrypt_simd_core2(uint8_t databuf[2 * 128], void * scratch)
+{
+    uint8_t * databufA = &databuf[0];
+    uint8_t * databufB = &databuf[128];
+    XY * XA = (XY *)((uintptr_t)scratch);
+    XY * XB = (XY *)((uintptr_t)scratch + 128 + 128 * 1024);
+    uint32x4 * VA = (uint32x4 *)((uintptr_t)XA + 128);
+    uint32x4 * VB = (uint32x4 *)((uintptr_t)XB + 128);
+    int i, jA, jB;
+
+    /* 1: X <-- B */
+    for (i = 0; i < 16; i++) {
+        XA->w[i]      = le32dec(&databufA[(i * 5 % 16) * 4]);
+        XA->w[16 + i] = le32dec(&databufA[(16 + (i * 5 % 16)) * 4]);
+        XB->w[i]      = le32dec(&databufB[(i * 5 % 16) * 4]);
+        XB->w[16 + i] = le32dec(&databufB[(16 + (i * 5 % 16)) * 4]);
+    }
+
+    /* 2: for i = 0 to N - 1 do */
+    for (i = 0; i < 1024; i++) {
+        blkcpy128(&VA[i * 8], &XA->q[0]);
+        blkcpy128(&VB[i * 8], &XB->q[0]);
+        salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
+        salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
+    }
+
+    /* 6: for i = 0 to N - 1 do */
+    for (i = 0; i < 1024; i++) {
+        jA = XA->w[16] & 1023; /* j <-- Integerify(X) mod N */
+        jB = XB->w[16] & 1023; /* j <-- Integerify(X) mod N */
+        blkxor128(XA->q, &VA[jA * 8]);
+        blkxor128(XB->q, &VB[jB * 8]);
+        salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
+        salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
+    }
+
+    /* 10: B' <-- X */
+    for (i = 0; i < 16; i++) {
+        le32enc(&databufA[(i * 5 % 16) * 4], XA->w[i]);
+        le32enc(&databufA[(16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
+        le32enc(&databufB[(i * 5 % 16) * 4], XB->w[i]);
+        le32enc(&databufB[(16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
+    }
+}
+
+#endif
+#endif
diff --git a/scrypt.c b/scrypt.c
--- a/scrypt.c
+++ b/scrypt.c
@@ -35,6 +35,7 @@
 #include <string.h>
 
 #include "sha256-helpers.h"
+#include "scrypt-simd-helpers.h"
 
 static void blkcpy(void *, void *, size_t);
 static void blkxor(void *, void *, size_t);
@@ -219,7 +220,7 @@ smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
 /* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
    scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
  */
-static void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratchpad)
+static void scrypt_1024_1_1_256_sp1(const char* input, char* output, char* scratchpad)
 {
     uint8_t * B;
     uint32_t * V;
@@ -237,17 +238,21 @@ static void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratc
     /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
     PBKDF2_SHA256((const uint8_t*)input, 80, (const uint8_t*)input, 80, 1, B, p * 128 * r);
 
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+    scrypt_simd_core1(B, XY);
+#else
     /* 2: for i = 0 to p - 1 do */
     for (i = 0; i < p; i++) {
         /* 3: B_i <-- MF(B_i, N) */
         smix(&B[i * 128 * r], r, N, V, XY);
     }
+#endif
 
     /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
     PBKDF2_SHA256((const uint8_t*)input, 80, B, p * 128 * r, 1, (uint8_t*)output, 32);
 }
 
-int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
     const unsigned char *ptarget,
     uint32_t max_nonce, unsigned long *hashes_done)
 {
@@ -266,7 +271,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
     while(1) {
         n++;
         le32enc(nonce, n);
-        scrypt_1024_1_1_256_sp(data, tmp_hash, scratchbuf);
+        scrypt_1024_1_1_256_sp1(data, tmp_hash, scratchbuf);
 
         if (le32dec(tmp_hash+28) <= Htarg) {
             be32enc(pdata + 64 + 12, n);
@@ -282,3 +287,105 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
 
     return false;
 }
+
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+
+static void
+scrypt_1024_1_1_256_sp2(const unsigned char * input1,
+                        unsigned char * output1,
+                        const unsigned char * input2,
+                        unsigned char * output2,
+                        unsigned char * scratchpad)
+{
+    uint8_t * B1, * B2;
+    uint8_t * V;
+
+    const uint32_t N = 1024;
+    const uint32_t r = 1;
+    const uint32_t p = 1;
+
+    B1 = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+    B2 = B1 + 128;
+    V  = B2 + 128;
+
+    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+    PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
+    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+    PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
+
+    scrypt_simd_core2(B1, V);
+
+    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+    PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
+    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+    PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
+}
+
+int scanhash_scrypt2(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+    const unsigned char *ptarget,
+    uint32_t max_nonce, unsigned long *hashes_done)
+{
+    unsigned char data1[80];
+    unsigned char tmp_hash1[32];
+    unsigned char data2[80];
+    unsigned char tmp_hash2[32];
+    uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
+    uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+    uint32_t n = 0;
+    uint32_t Htarg = le32dec(ptarget + 28);
+    int i;
+
+    work_restart[thr_id].restart = 0;
+
+    for (i = 0; i < 80/4; i++) {
+        ((uint32_t *)data1)[i] = swab32(((uint32_t *)pdata)[i]);
+        ((uint32_t *)data2)[i] = swab32(((uint32_t *)pdata)[i]);
+    }
+
+    while(1) {
+        le32enc(nonce1, n + 1);
+        le32enc(nonce2, n + 2);
+        scrypt_1024_1_1_256_sp2(data1, tmp_hash1, data2, tmp_hash2, scratchbuf);
+
+        if (le32dec(tmp_hash1+28) <= Htarg) {
+            be32enc(pdata + 64 + 12, n + 1);
+            *hashes_done = n + 1;
+            return true;
+        }
+
+        if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+            be32enc(pdata + 64 + 12, n + 2);
+            *hashes_done = n + 2;
+            return true;
+        }
+
+        n += 2;
+
+        if (n >= max_nonce) {
+            *hashes_done = max_nonce;
+            break;
+        }
+
+        if (work_restart[thr_id].restart) {
+            *hashes_done = n;
+            break;
+        }
+    }
+    return false;
+}
+
+#endif
+
+int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+    const unsigned char *ptarget,
+    uint32_t max_nonce, unsigned long *hashes_done)
+{
+    /*
+     * TODO: maybe add a command line option or run benchmarks at start
+     * to select the fastest implementation?
+     */
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+    return scanhash_scrypt2(thr_id, pdata, scratchbuf, ptarget, max_nonce, hashes_done);
+#else
+    return scanhash_scrypt1(thr_id, pdata, scratchbuf, ptarget, max_nonce, hashes_done);
+#endif
+}
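As a usage note on the buffer sizes: scrypt_simd_core2() documents that its scratch area must hold two interleaved scratchpads (2 * 128 + 2 * 128 * 1024 bytes) and be 64-byte aligned, and scrypt_1024_1_1_256_sp2() additionally carves the two 128-byte PBKDF2 blocks out of the front of the same allocation after rounding the pointer up, which is why cpu-miner.c simply doubles the old malloc(131583). The sketch below is a hypothetical stand-alone check of that arithmetic, not code from the patch; it reuses the same alignment idiom.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* Space carved out of one allocation by scrypt_1024_1_1_256_sp2():
     * two 128-byte PBKDF2 blocks (B1, B2) followed by the scratch area
     * of scrypt_simd_core2(), i.e. two X buffers and two V arrays. */
    const size_t needed = 2 * 128 + (2 * 128 + 2 * 128 * 1024);  /* 262656 */

    /* cpu-miner.c allocates twice the old single-hash size. */
    const size_t alloc_size = 2 * 131583;                        /* 263166 */

    unsigned char *scratch = malloc(alloc_size);
    if (!scratch)
        return 1;

    /* Same 64-byte alignment idiom as in scrypt_1024_1_1_256_sp2(). */
    uintptr_t aligned = ((uintptr_t)scratch + 63) & ~(uintptr_t)63;
    size_t usable = alloc_size - (size_t)(aligned - (uintptr_t)scratch);

    printf("needed %zu bytes, usable after alignment %zu bytes -> %s\n",
           needed, usable, usable >= needed ? "fits" : "too small");

    free(scratch);
    return 0;
}
```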