-rw-r--r--  Makefile.am              3
-rw-r--r--  cpu-miner.c              2
-rw-r--r--  scrypt-simd-helpers.h  366
-rw-r--r--  scrypt.c               113
4 files changed, 479 insertions, 5 deletions
diff --git a/Makefile.am b/Makefile.am
index 8065f2f..5623a59 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,7 +14,8 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES)
bin_PROGRAMS = minerd
minerd_SOURCES = elist.h miner.h compat.h \
- cpu-miner.c util.c scrypt.c sha256-helpers.h
+ cpu-miner.c util.c scrypt.c sha256-helpers.h \
+ scrypt-simd-helpers.h
minerd_LDFLAGS = $(PTHREAD_FLAGS)
minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@
minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@
diff --git a/cpu-miner.c b/cpu-miner.c
index b3d1546..607c01d 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -531,7 +531,7 @@ static void *miner_thread(void *userdata)
if (opt_algo == ALGO_SCRYPT)
{
- scratchbuf = malloc(131583);
+ scratchbuf = malloc(2 * 131583);
max_nonce = 0xffff;
}
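
The doubled buffer makes room for the two-way SIMD kernel introduced below
(scrypt_simd_core2 requires at least 2 * 128 + 2 * 128 * 1024 bytes plus
64-byte alignment slack). A quick size check, mine rather than the patch's:

    single hash:  63 + 128*r*p + (256*r + 64) + 128*r*N
                = 63 + 128 + 320 + 131072        = 131583 bytes
    two hashes:   63 + 2*128 + 2*128*1024
                = 63 + 256 + 262144              = 262463 bytes
    allocated:    2 * 131583                     = 263166 bytes  (>= 262463, so it fits)
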
diff --git a/scrypt-simd-helpers.h b/scrypt-simd-helpers.h
new file mode 100644
index 0000000..322d718
--- /dev/null
+++ b/scrypt-simd-helpers.h
@@ -0,0 +1,366 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+/*
+ * This is the performance critical part of scrypt key derivation function [1],
+ * implemented using gcc vector extensions [2]. The parameters are set
+ * to N = 1024, r = 1, p = 1 as used for litecoin proof of work [3].
+ *
+ * The drawback is that these extensions are only supported by gcc and a few
+ * other compilers that try to be gcc-compatible (clang, path64, ...).
+ *
+ * The advantage is that this code works on any SIMD-capable hardware
+ * (x86 SSE2, PowerPC Altivec, Cell SPU, ARM NEON, ARM iWMMXt, ...) without
+ * modifications when compiled with gcc 4.7. Older compiler versions are
+ * missing bits and pieces, but can still work for Altivec, SPU and SSE2 with
+ * a bit of intrinsic band-aid.
+ *
+ * 1. http://www.tarsnap.com/scrypt.html
+ * 2. http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
+ * 3. https://github.com/coblee/litecoin/wiki/Scrypt-proof-of-work
+ */
+
+#ifndef __SCRYPT_SIMD_HELPERS_H__
+#define __SCRYPT_SIMD_HELPERS_H__
+
+#include <stdint.h>
+#include "sha256-helpers.h"
+
+#if defined(__GNUC__) && \
+ ((defined(__SSE2__) || defined(__ALTIVEC__) || defined(__SPU__)) || \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+
+#define HAVE_SCRYPT_SIMD_HELPERS
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __ALTIVEC__
+#include <altivec.h>
+#include <vec_types.h>
+#endif
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+typedef uint32_t uint32x4 __attribute__ ((vector_size(16), aligned(16)));
+typedef uint8_t uint8x16 __attribute__ ((vector_size(16), aligned(16)));
+
+/*
+ * Define two helper functions ('rol_32x4' and 'shuffle_32x4') to ensure
+ * better support for old gcc versions and gcc-compatible compilers
+ */
+static __attribute__((always_inline)) uint32x4
+rol_32x4(uint32x4 a, uint32_t b)
+{
+#ifdef __ALTIVEC__
+ return vec_rl(a, vec_splats(b));
+#elif defined(__SPU__)
+ return spu_rl(a, b);
+#elif defined(__SSE2__)
+ return (uint32x4)_mm_slli_epi32((__m128i)a, b) ^
+ (uint32x4)_mm_srli_epi32((__m128i)a, 32 - b);
+#else
+ return (a << b) ^ (a >> (32 - b));
+#endif
+}
+
+#if defined(__clang__)
+# define shuffle_32x4(a, p1, p2, p3, p4) \
+ __builtin_shufflevector(a, a, p1, p2, p3, p4)
+#elif defined(__SSE2__)
+# define shuffle_32x4(a, p1, p2, p3, p4) \
+ (uint32x4)_mm_shuffle_epi32((__m128i)a, _MM_SHUFFLE(p4, p3, p2, p1))
+#else
+static __attribute__((always_inline)) uint32x4
+shuffle_32x4(uint32x4 a, const int p1, const int p2, const int p3, const int p4)
+{
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+ /* gcc 4.7 introduces '__builtin_shuffle' */
+ const uint32x4 mask = { p1, p2, p3, p4 };
+ return __builtin_shuffle(a, mask);
+#elif defined(__SPU__)
+ const uint8x16 mask = {
+ p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
+ p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
+ p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
+ p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
+ };
+ return spu_shuffle(a, a, mask);
+#elif defined(__ALTIVEC__)
+ const uint8x16 mask = {
+ p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
+ p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
+ p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
+ p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
+ };
+ return vec_perm(a, a, mask);
+#else
+# error Have no implementation for 'shuffle_32x4' inline function
+#endif
+}
+#endif
+
+/*****************************************************************************/
+
+static __attribute__((always_inline)) void
+blkcpy128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
+{
+ D[0] = S[0]; D[1] = S[1]; D[2] = S[2]; D[3] = S[3];
+ D[4] = S[4]; D[5] = S[5]; D[6] = S[6]; D[7] = S[7];
+}
+
+static __attribute__((always_inline)) void
+blkxor128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
+{
+ D[0] ^= S[0]; D[1] ^= S[1]; D[2] ^= S[2]; D[3] ^= S[3];
+ D[4] ^= S[4]; D[5] ^= S[5]; D[6] ^= S[6]; D[7] ^= S[7];
+}
+
+/**
+ * salsa20_8_xor(B, Bx):
+ * XOR the block Bx into B, then apply the salsa20/8 core to the result.
+ */
+static __attribute__((always_inline)) void
+salsa20_8_xor(uint32x4 * __restrict B, const uint32x4 * __restrict Bx)
+{
+ uint32x4 X0, X1, X2, X3;
+ int i;
+
+ X0 = (B[0] ^= Bx[0]);
+ X1 = (B[1] ^= Bx[1]);
+ X2 = (B[2] ^= Bx[2]);
+ X3 = (B[3] ^= Bx[3]);
+
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on "columns". */
+ X1 ^= rol_32x4(X0 + X3, 7);
+ X2 ^= rol_32x4(X1 + X0, 9);
+ X3 ^= rol_32x4(X2 + X1, 13);
+ X0 ^= rol_32x4(X3 + X2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+
+ /* Operate on "rows". */
+ X3 ^= rol_32x4(X0 + X1, 7);
+ X2 ^= rol_32x4(X3 + X0, 9);
+ X1 ^= rol_32x4(X2 + X3, 13);
+ X0 ^= rol_32x4(X1 + X2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+ }
+
+ B[0] += X0;
+ B[1] += X1;
+ B[2] += X2;
+ B[3] += X3;
+}
+
+static __attribute__((always_inline)) void
+salsa20_8_xor2(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
+ uint32x4 * __restrict C, const uint32x4 * __restrict Cx)
+{
+ uint32x4 X0, X1, X2, X3;
+ uint32x4 Y0, Y1, Y2, Y3;
+ int i;
+
+ X0 = (B[0] ^= Bx[0]);
+ X1 = (B[1] ^= Bx[1]);
+ X2 = (B[2] ^= Bx[2]);
+ X3 = (B[3] ^= Bx[3]);
+ Y0 = (C[0] ^= Cx[0]);
+ Y1 = (C[1] ^= Cx[1]);
+ Y2 = (C[2] ^= Cx[2]);
+ Y3 = (C[3] ^= Cx[3]);
+
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on "columns". */
+ X1 ^= rol_32x4(X0 + X3, 7);
+ Y1 ^= rol_32x4(Y0 + Y3, 7);
+ X2 ^= rol_32x4(X1 + X0, 9);
+ Y2 ^= rol_32x4(Y1 + Y0, 9);
+ X3 ^= rol_32x4(X2 + X1, 13);
+ Y3 ^= rol_32x4(Y2 + Y1, 13);
+ X0 ^= rol_32x4(X3 + X2, 18);
+ Y0 ^= rol_32x4(Y3 + Y2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+ Y1 = shuffle_32x4(Y1, 3, 0, 1, 2);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+ Y3 = shuffle_32x4(Y3, 1, 2, 3, 0);
+
+ /* Operate on "rows". */
+ X3 ^= rol_32x4(X0 + X1, 7);
+ Y3 ^= rol_32x4(Y0 + Y1, 7);
+ X2 ^= rol_32x4(X3 + X0, 9);
+ Y2 ^= rol_32x4(Y3 + Y0, 9);
+ X1 ^= rol_32x4(X2 + X3, 13);
+ Y1 ^= rol_32x4(Y2 + Y3, 13);
+ X0 ^= rol_32x4(X1 + X2, 18);
+ Y0 ^= rol_32x4(Y1 + Y2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+ Y1 = shuffle_32x4(Y1, 1, 2, 3, 0);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+ Y3 = shuffle_32x4(Y3, 3, 0, 1, 2);
+ }
+
+ B[0] += X0;
+ B[1] += X1;
+ B[2] += X2;
+ B[3] += X3;
+ C[0] += Y0;
+ C[1] += Y1;
+ C[2] += Y2;
+ C[3] += Y3;
+}
+
+/* Helps to prevent the violation of strict aliasing rules */
+typedef union { uint32x4 q[8]; uint32_t w[32]; } XY;
+
+/**
+ * The most performance-critical part of scrypt (N = 1024, r = 1, p = 1).
+ * Handles one hash at a time. Likely the best choice on hardware with
+ * small L1/L2 caches and slow memory.
+ *
+ * databuf - 128-byte buffer for data input and output
+ * scratch - temporary buffer, at least (128 + 128 * 1024) bytes in size
+ *
+ * All buffers must be aligned to a 64-byte boundary.
+ */
+static inline
+void scrypt_simd_core1(uint8_t databuf[128], void * scratch)
+{
+ XY * X = (XY *)((uintptr_t)scratch + 0);
+ uint32x4 * V = (uint32x4 *)((uintptr_t)scratch + 128);
+ int i, j;
+
+ /* 1: X <-- B */
+ for (i = 0; i < 16; i++) {
+ X->w[i] = le32dec(&databuf[(i * 5 % 16) * 4]);
+ X->w[16 + i] = le32dec(&databuf[(16 + (i * 5 % 16)) * 4]);
+ }
+
+ /* 2: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ blkcpy128(&V[i * 8], &X->q[0]);
+ salsa20_8_xor(&X->q[0], &X->q[4]);
+ salsa20_8_xor(&X->q[4], &X->q[0]);
+ }
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ j = X->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ blkxor128(X->q, &V[j * 8]);
+ salsa20_8_xor(&X->q[0], &X->q[4]);
+ salsa20_8_xor(&X->q[4], &X->q[0]);
+ }
+
+ /* 10: B' <-- X */
+ for (i = 0; i < 16; i++) {
+ le32enc(&databuf[(i * 5 % 16) * 4], X->w[i]);
+ le32enc(&databuf[(16 + (i * 5 % 16)) * 4], X->w[16 + i]);
+ }
+}
+
+/**
+ * The most performance-critical part of scrypt (N = 1024, r = 1, p = 1).
+ * Handles two hashes at a time. Likely a better choice when the
+ * instructions have high latencies, but it needs more registers and
+ * a larger L2 cache.
+ *
+ * databuf - two consecutive 128-byte buffers for data input and output
+ * scratch - temporary buffer, at least (2 * 128 + 2 * 128 * 1024) bytes in size
+ *
+ * All buffers must be aligned to a 64-byte boundary.
+ */
+static inline
+void scrypt_simd_core2(uint8_t databuf[2 * 128], void * scratch)
+{
+ uint8_t * databufA = &databuf[0];
+ uint8_t * databufB = &databuf[128];
+ XY * XA = (XY *)((uintptr_t)scratch);
+ XY * XB = (XY *)((uintptr_t)scratch + 128 + 128 * 1024);
+ uint32x4 * VA = (uint32x4 *)((uintptr_t)XA + 128);
+ uint32x4 * VB = (uint32x4 *)((uintptr_t)XB + 128);
+ int i, jA, jB;
+
+ /* 1: X <-- B */
+ for (i = 0; i < 16; i++) {
+ XA->w[i] = le32dec(&databufA[(i * 5 % 16) * 4]);
+ XA->w[16 + i] = le32dec(&databufA[(16 + (i * 5 % 16)) * 4]);
+ XB->w[i] = le32dec(&databufB[(i * 5 % 16) * 4]);
+ XB->w[16 + i] = le32dec(&databufB[(16 + (i * 5 % 16)) * 4]);
+ }
+
+ /* 2: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ blkcpy128(&VA[i * 8], &XA->q[0]);
+ blkcpy128(&VB[i * 8], &XB->q[0]);
+ salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
+ salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
+ }
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ jA = XA->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ jB = XB->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ blkxor128(XA->q, &VA[jA * 8]);
+ blkxor128(XB->q, &VB[jB * 8]);
+ salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
+ salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
+ }
+
+ /* 10: B' <-- X */
+ for (i = 0; i < 16; i++) {
+ le32enc(&databufA[(i * 5 % 16) * 4], XA->w[i]);
+ le32enc(&databufA[(16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
+ le32enc(&databufB[(i * 5 % 16) * 4], XB->w[i]);
+ le32enc(&databufB[(16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
+ }
+}
+
+#endif
+#endif
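
Before the scrypt.c integration below, a minimal standalone sketch of how
scrypt_simd_core1 could be driven, assuming the caller already holds the
128-byte PBKDF2 output in databuf. The helper name mix_one_block and the bare
malloc are illustrative only; the real wiring is the scrypt.c change that
follows.

/*
 * Illustrative sketch (not part of the patch). The scratch pointer is aligned
 * by hand, mirroring the "(x + 63) & ~63" trick used in scrypt.c.
 */
#include <stdlib.h>
#include "scrypt-simd-helpers.h"

#ifdef HAVE_SCRYPT_SIMD_HELPERS
static void mix_one_block(uint8_t databuf[128])
{
	/* 63 bytes of slack for 64-byte alignment + 128 bytes for X + 128 KiB for V */
	void *raw = malloc(63 + 128 + 128 * 1024);
	uint8_t *scratch = (uint8_t *)(((uintptr_t)raw + 63) & ~(uintptr_t)63);

	scrypt_simd_core1(databuf, scratch);  /* ROMix: mixes databuf in place */
	free(raw);
}
#endif
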
diff --git a/scrypt.c b/scrypt.c
index 2780b29..6f70cc2 100644
--- a/scrypt.c
+++ b/scrypt.c
@@ -35,6 +35,7 @@
#include <string.h>
#include "sha256-helpers.h"
+#include "scrypt-simd-helpers.h"
static void blkcpy(void *, void *, size_t);
static void blkxor(void *, void *, size_t);
@@ -219,7 +220,7 @@ smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
*/
-static void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratchpad)
+static void scrypt_1024_1_1_256_sp1(const char* input, char* output, char* scratchpad)
{
uint8_t * B;
uint32_t * V;
@@ -237,17 +238,21 @@ static void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratc
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
PBKDF2_SHA256((const uint8_t*)input, 80, (const uint8_t*)input, 80, 1, B, p * 128 * r);
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+ scrypt_simd_core1(B, XY);
+#else
/* 2: for i = 0 to p - 1 do */
for (i = 0; i < p; i++) {
/* 3: B_i <-- MF(B_i, N) */
smix(&B[i * 128 * r], r, N, V, XY);
}
+#endif
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
PBKDF2_SHA256((const uint8_t*)input, 80, B, p * 128 * r, 1, (uint8_t*)output, 32);
}
-int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
const unsigned char *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
@@ -266,7 +271,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
while(1) {
n++;
le32enc(nonce, n);
- scrypt_1024_1_1_256_sp(data, tmp_hash, scratchbuf);
+ scrypt_1024_1_1_256_sp1(data, tmp_hash, scratchbuf);
if (le32dec(tmp_hash+28) <= Htarg) {
be32enc(pdata + 64 + 12, n);
@@ -282,3 +287,105 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
return false;
}
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+
+static void
+scrypt_1024_1_1_256_sp2(const unsigned char * input1,
+ unsigned char * output1,
+ const unsigned char * input2,
+ unsigned char * output2,
+ unsigned char * scratchpad)
+{
+ uint8_t * B1, * B2;
+ uint8_t * V;
+
+ const uint32_t N = 1024;
+ const uint32_t r = 1;
+ const uint32_t p = 1;
+
+ B1 = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+ B2 = B1 + 128;
+ V = B2 + 128;
+
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
+
+ scrypt_simd_core2(B1, V);
+
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
+}
+
+int scanhash_scrypt2(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+ const unsigned char *ptarget,
+ uint32_t max_nonce, unsigned long *hashes_done)
+{
+ unsigned char data1[80];
+ unsigned char tmp_hash1[32];
+ unsigned char data2[80];
+ unsigned char tmp_hash2[32];
+ uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
+ uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+ uint32_t n = 0;
+ uint32_t Htarg = le32dec(ptarget + 28);
+ int i;
+
+ work_restart[thr_id].restart = 0;
+
+ for (i = 0; i < 80/4; i++) {
+ ((uint32_t *)data1)[i] = swab32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data2)[i] = swab32(((uint32_t *)pdata)[i]);
+ }
+
+ while(1) {
+ le32enc(nonce1, n + 1);
+ le32enc(nonce2, n + 2);
+ scrypt_1024_1_1_256_sp2(data1, tmp_hash1, data2, tmp_hash2, scratchbuf);
+
+ if (le32dec(tmp_hash1+28) <= Htarg) {
+ be32enc(pdata + 64 + 12, n + 1);
+ *hashes_done = n + 1;
+ return true;
+ }
+
+ if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 2);
+ *hashes_done = n + 2;
+ return true;
+ }
+
+ n += 2;
+
+ if (n >= max_nonce) {
+ *hashes_done = max_nonce;
+ break;
+ }
+
+ if (work_restart[thr_id].restart) {
+ *hashes_done = n;
+ break;
+ }
+ }
+ return false;
+}
+
+#endif
+
+int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+ const unsigned char *ptarget,
+ uint32_t max_nonce, unsigned long *hashes_done)
+{
+ /*
+ * TODO: maybe add a command line option or run benchmarks at start
+ * to select the fastest implementation?
+ */
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+ return scanhash_scrypt2(thr_id, pdata, scratchbuf, ptarget, max_nonce, hashes_done);
+#else
+ return scanhash_scrypt1(thr_id, pdata, scratchbuf, ptarget, max_nonce, hashes_done);
+#endif
+}
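
One way the TODO above could be approached, as a rough sketch rather than
anything this patch implements: time a handful of hashes through each kernel
at startup and let scanhash_scrypt cache the winner. The function name
prefer_two_way_kernel and the use of clock() are illustrative assumptions.

/*
 * Rough sketch for the TODO above (not part of the patch): both kernels are
 * timed over the same number of hashes and the faster one is reported.
 */
#ifdef HAVE_SCRYPT_SIMD_HELPERS
#include <time.h>

static int prefer_two_way_kernel(unsigned char *scratchbuf)
{
	unsigned char in[2 * 80] = { 0 };
	unsigned char out[2 * 32];
	clock_t t0, t1, t2;
	int i;

	t0 = clock();
	for (i = 0; i < 16; i++)   /* 16 hashes, one at a time */
		scrypt_1024_1_1_256_sp1((char *)in, (char *)out, (char *)scratchbuf);
	t1 = clock();
	for (i = 0; i < 8; i++)    /* 16 hashes, two at a time */
		scrypt_1024_1_1_256_sp2(in, out, in + 80, out + 32, scratchbuf);
	t2 = clock();

	return (t2 - t1) < (t1 - t0);  /* true if the two-way kernel was faster */
}
#endif

scanhash_scrypt could then call this once, cache the result in a static int,
and dispatch to scanhash_scrypt1 or scanhash_scrypt2 accordingly.
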