-rw-r--r--  Makefile.am              3
-rw-r--r--  cpu-miner.c              2
-rw-r--r--  scrypt-simd-helpers.h  366
-rw-r--r--  scrypt.c               113
4 files changed, 479 insertions, 5 deletions
diff --git a/Makefile.am b/Makefile.am
index 8065f2f..5623a59 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,7 +14,8 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES)
bin_PROGRAMS = minerd
minerd_SOURCES = elist.h miner.h compat.h \
- cpu-miner.c util.c scrypt.c sha256-helpers.h
+ cpu-miner.c util.c scrypt.c sha256-helpers.h \
+ scrypt-simd-helpers.h
minerd_LDFLAGS = $(PTHREAD_FLAGS)
minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@
minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@
diff --git a/cpu-miner.c b/cpu-miner.c
index b3d1546..607c01d 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -531,7 +531,7 @@ static void *miner_thread(void *userdata)
if (opt_algo == ALGO_SCRYPT)
{
- scratchbuf = malloc(131583);
+ scratchbuf = malloc(2 * 131583);
max_nonce = 0xffff;
}
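
The doubled buffer makes room for the two-way SIMD kernel introduced below
(scrypt_simd_core2 requires at least 2 * 128 + 2 * 128 * 1024 bytes plus
64-byte alignment slack). A quick size check, mine rather than the patch's:

    single hash:  63 + 128*r*p + (256*r + 64) + 128*r*N
                = 63 + 128 + 320 + 131072        = 131583 bytes
    two hashes:   63 + 2*128 + 2*128*1024
                = 63 + 256 + 262144              = 262463 bytes
    allocated:    2 * 131583                     = 263166 bytes  (>= 262463, so it fits)
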
diff --git a/scrypt-simd-helpers.h b/scrypt-simd-helpers.h
new file mode 100644
index 0000000..322d718
--- /dev/null
+++ b/scrypt-simd-helpers.h
@@ -0,0 +1,366 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+/*
+ * This is the performance critical part of scrypt key derivation function [1],
+ * implemented using gcc vector extensions [2]. The parameters are set
+ * to N = 1024, r = 1, p = 1 as used for litecoin proof of work [3].
+ *
+ * The drawback is that these extensions are only supported by gcc and a few
+ * other compilers that try to be gcc-compatible (clang, path64, ...).
+ *
+ * The advantage is that this code works on any SIMD-capable hardware
+ * (x86 SSE2, PowerPC Altivec, Cell SPU, ARM NEON, ARM iWMMXt, ...) without
+ * modifications when compiled with gcc 4.7. Older compiler versions are
+ * missing bits and pieces, but can still work for Altivec, SPU and SSE2 with
+ * a bit of intrinsic band-aid.
+ *
+ * 1. http://www.tarsnap.com/scrypt.html
+ * 2. http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
+ * 3. https://github.com/coblee/litecoin/wiki/Scrypt-proof-of-work
+ */
+
+#ifndef __SCRYPT_SIMD_HELPERS_H__
+#define __SCRYPT_SIMD_HELPERS_H__
+
+#include <stdint.h>
+#include "sha256-helpers.h"
+
+#if defined(__GNUC__) && \
+ ((defined(__SSE2__) || defined(__ALTIVEC__) || defined(__SPU__)) || \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+
+#define HAVE_SCRYPT_SIMD_HELPERS
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __ALTIVEC__
+#include <altivec.h>
+#include <vec_types.h>
+#endif
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+typedef uint32_t uint32x4 __attribute__ ((vector_size(16), aligned(16)));
+typedef uint8_t uint8x16 __attribute__ ((vector_size(16), aligned(16)));
+
+/*
+ * Define two helper functions ('rol_32x4' and 'shuffle_32x4') to ensure
+ * better support for old gcc versions and gcc-compatible compilers
+ */
+static __attribute__((always_inline)) uint32x4
+rol_32x4(uint32x4 a, uint32_t b)
+{
+#ifdef __ALTIVEC__
+ return vec_rl(a, vec_splats(b));
+#elif defined(__SPU__)
+ return spu_rl(a, b);
+#elif defined(__SSE2__)
+ return (uint32x4)_mm_slli_epi32((__m128i)a, b) ^
+ (uint32x4)_mm_srli_epi32((__m128i)a, 32 - b);
+#else
+ return (a << b) ^ (a >> (32 - b));
+#endif
+}
+
+#if defined(__clang__)
+# define shuffle_32x4(a, p1, p2, p3, p4) \
+ __builtin_shufflevector(a, a, p1, p2, p3, p4)
+#elif defined(__SSE2__)
+# define shuffle_32x4(a, p1, p2, p3, p4) \
+ (uint32x4)_mm_shuffle_epi32((__m128i)a, _MM_SHUFFLE(p4, p3, p2, p1))
+#else
+static __attribute__((always_inline)) uint32x4
+shuffle_32x4(uint32x4 a, const int p1, const int p2, const int p3, const int p4)
+{
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+ /* gcc 4.7 introduces '__builtin_shuffle' */
+ const uint32x4 mask = { p1, p2, p3, p4 };
+ return __builtin_shuffle(a, mask);
+#elif defined(__SPU__)
+ const uint8x16 mask = {
+ p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
+ p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
+ p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
+ p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
+ };
+ return spu_shuffle(a, a, mask);
+#elif defined(__ALTIVEC__)
+ const uint8x16 mask = {
+ p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
+ p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
+ p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
+ p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
+ };
+ return vec_perm(a, a, mask);
+#else
+# error Have no implementation for 'shuffle_32x4' inline function
+#endif
+}
+#endif
+
+/*****************************************************************************/
+
+static __attribute__((always_inline)) void
+blkcpy128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
+{
+ D[0] = S[0]; D[1] = S[1]; D[2] = S[2]; D[3] = S[3];
+ D[4] = S[4]; D[5] = S[5]; D[6] = S[6]; D[7] = S[7];
+}
+
+static __attribute__((always_inline)) void
+blkxor128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
+{
+ D[0] ^= S[0]; D[1] ^= S[1]; D[2] ^= S[2]; D[3] ^= S[3];
+ D[4] ^= S[4]; D[5] ^= S[5]; D[6] ^= S[6]; D[7] ^= S[7];
+}
+
+/**
+ * salsa20_8_xor(B, Bx):
+ * XOR the block Bx into B, then apply the salsa20/8 core to the result.
+ */
+static __attribute__((always_inline)) void
+salsa20_8_xor(uint32x4 * __restrict B, const uint32x4 * __restrict Bx)
+{
+ uint32x4 X0, X1, X2, X3;
+ int i;
+
+ X0 = (B[0] ^= Bx[0]);
+ X1 = (B[1] ^= Bx[1]);
+ X2 = (B[2] ^= Bx[2]);
+ X3 = (B[3] ^= Bx[3]);
+
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on "columns". */
+ X1 ^= rol_32x4(X0 + X3, 7);
+ X2 ^= rol_32x4(X1 + X0, 9);
+ X3 ^= rol_32x4(X2 + X1, 13);
+ X0 ^= rol_32x4(X3 + X2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+
+ /* Operate on "rows". */
+ X3 ^= rol_32x4(X0 + X1, 7);
+ X2 ^= rol_32x4(X3 + X0, 9);
+ X1 ^= rol_32x4(X2 + X3, 13);
+ X0 ^= rol_32x4(X1 + X2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+ }
+
+ B[0] += X0;
+ B[1] += X1;
+ B[2] += X2;
+ B[3] += X3;
+}
+
+static __attribute__((always_inline)) void
+salsa20_8_xor2(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
+ uint32x4 * __restrict C, const uint32x4 * __restrict Cx)
+{
+ uint32x4 X0, X1, X2, X3;
+ uint32x4 Y0, Y1, Y2, Y3;
+ int i;
+
+ X0 = (B[0] ^= Bx[0]);
+ X1 = (B[1] ^= Bx[1]);
+ X2 = (B[2] ^= Bx[2]);
+ X3 = (B[3] ^= Bx[3]);
+ Y0 = (C[0] ^= Cx[0]);
+ Y1 = (C[1] ^= Cx[1]);
+ Y2 = (C[2] ^= Cx[2]);
+ Y3 = (C[3] ^= Cx[3]);
+
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on "columns". */
+ X1 ^= rol_32x4(X0 + X3, 7);
+ Y1 ^= rol_32x4(Y0 + Y3, 7);
+ X2 ^= rol_32x4(X1 + X0, 9);
+ Y2 ^= rol_32x4(Y1 + Y0, 9);
+ X3 ^= rol_32x4(X2 + X1, 13);
+ Y3 ^= rol_32x4(Y2 + Y1, 13);
+ X0 ^= rol_32x4(X3 + X2, 18);
+ Y0 ^= rol_32x4(Y3 + Y2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+ Y1 = shuffle_32x4(Y1, 3, 0, 1, 2);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+ Y3 = shuffle_32x4(Y3, 1, 2, 3, 0);
+
+ /* Operate on "rows". */
+ X3 ^= rol_32x4(X0 + X1, 7);
+ Y3 ^= rol_32x4(Y0 + Y1, 7);
+ X2 ^= rol_32x4(X3 + X0, 9);
+ Y2 ^= rol_32x4(Y3 + Y0, 9);
+ X1 ^= rol_32x4(X2 + X3, 13);
+ Y1 ^= rol_32x4(Y2 + Y3, 13);
+ X0 ^= rol_32x4(X1 + X2, 18);
+ Y0 ^= rol_32x4(Y1 + Y2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+ Y1 = shuffle_32x4(Y1, 1, 2, 3, 0);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+ Y3 = shuffle_32x4(Y3, 3, 0, 1, 2);
+ }
+
+ B[0] += X0;
+ B[1] += X1;
+ B[2] += X2;
+ B[3] += X3;
+ C[0] += Y0;
+ C[1] += Y1;
+ C[2] += Y2;
+ C[3] += Y3;
+}
+
+/* Helps to prevent the violation of strict aliasing rules */
+typedef union { uint32x4 q[8]; uint32_t w[32]; } XY;
+
+/**
+ * The most performance-critical part of scrypt (N = 1024, r = 1, p = 1).
+ * Handles one hash at a time. Likely the best choice on hardware with
+ * small L1/L2 caches and slow memory.
+ *
+ * databuf - 128-byte buffer for data input and output
+ * scratch - temporary buffer, at least (128 + 128 * 1024) bytes in size
+ *
+ * All buffers must be aligned to a 64-byte boundary.
+ */
+static inline
+void scrypt_simd_core1(uint8_t databuf[128], void * scratch)
+{
+ XY * X = (XY *)((uintptr_t)scratch + 0);
+ uint32x4 * V = (uint32x4 *)((uintptr_t)scratch + 128);
+ int i, j;
+
+ /* 1: X <-- B */
+ for (i = 0; i < 16; i++) {
+ X->w[i] = le32dec(&databuf[(i * 5 % 16) * 4]);
+ X->w[16 + i] = le32dec(&databuf[(16 + (i * 5 % 16)) * 4]);
+ }
+
+ /* 2: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ blkcpy128(&V[i * 8], &X->q[0]);
+ salsa20_8_xor(&X->q[0], &X->q[4]);
+ salsa20_8_xor(&X->q[4], &X->q[0]);
+ }
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ j = X->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ blkxor128(X->q, &V[j * 8]);
+ salsa20_8_xor(&X->q[0], &X->q[4]);
+ salsa20_8_xor(&X->q[4], &X->q[0]);
+ }
+
+ /* 10: B' <-- X */
+ for (i = 0; i < 16; i++) {
+ le32enc(&databuf[(i * 5 % 16) * 4], X->w[i]);
+ le32enc(&databuf[(16 + (i * 5 % 16)) * 4], X->w[16 + i]);
+ }
+}
+
+/**
+ * The most performance-critical part of scrypt (N = 1024, r = 1, p = 1).
+ * Handles two hashes at a time. Likely a better choice when the
+ * instructions have high latencies, but it needs more registers and
+ * a larger L2 cache.
+ *
+ * databuf - two consecutive 128-byte buffers for data input and output
+ * scratch - temporary buffer, at least (2 * 128 + 2 * 128 * 1024) bytes in size
+ *
+ * All buffers must be aligned to a 64-byte boundary.
+ */
+static inline
+void scrypt_simd_core2(uint8_t databuf[2 * 128], void * scratch)
+{
+ uint8_t * databufA = &databuf[0];
+ uint8_t * databufB = &databuf[128];
+ XY * XA = (XY *)((uintptr_t)scratch);
+ XY * XB = (XY *)((uintptr_t)scratch + 128 + 128 * 1024);
+ uint32x4 * VA = (uint32x4 *)((uintptr_t)XA + 128);
+ uint32x4 * VB = (uint32x4 *)((uintptr_t)XB + 128);
+ int i, jA, jB;
+
+ /* 1: X <-- B */
+ for (i = 0; i < 16; i++) {
+ XA->w[i] = le32dec(&databufA[(i * 5 % 16) * 4]);
+ XA->w[16 + i] = le32dec(&databufA[(16 + (i * 5 % 16)) * 4]);
+ XB->w[i] = le32dec(&databufB[(i * 5 % 16) * 4]);
+ XB->w[16 + i] = le32dec(&databufB[(16 + (i * 5 % 16)) * 4]);
+ }
+
+ /* 2: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ blkcpy128(&VA[i * 8], &XA->q[0]);
+ blkcpy128(&VB[i * 8], &XB->q[0]);
+ salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
+ salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
+ }
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ jA = XA->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ jB = XB->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ blkxor128(XA->q, &VA[jA * 8]);
+ blkxor128(XB->q, &VB[jB * 8]);
+ salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
+ salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
+ }
+
+ /* 10: B' <-- X */
+ for (i = 0; i < 16; i++) {
+ le32enc(&databufA[(i * 5 % 16) * 4], XA->w[i]);
+ le32enc(&databufA[(16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
+ le32enc(&databufB[(i * 5 % 16) * 4], XB->w[i]);
+ le32enc(&databufB[(16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
+ }
+}
+
+#endif
+#endif
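
Before the scrypt.c integration below, a minimal standalone sketch of how
scrypt_simd_core1 could be driven, assuming the caller already holds the
128-byte PBKDF2 output in databuf. The helper name mix_one_block and the bare
malloc are illustrative only; the real wiring is the scrypt.c change that
follows.

/*
 * Illustrative sketch (not part of the patch). The scratch pointer is aligned
 * by hand, mirroring the "(x + 63) & ~63" trick used in scrypt.c.
 */
#include <stdlib.h>
#include "scrypt-simd-helpers.h"

#ifdef HAVE_SCRYPT_SIMD_HELPERS
static void mix_one_block(uint8_t databuf[128])
{
	/* 63 bytes of slack for 64-byte alignment + 128 bytes for X + 128 KiB for V */
	void *raw = malloc(63 + 128 + 128 * 1024);
	uint8_t *scratch = (uint8_t *)(((uintptr_t)raw + 63) & ~(uintptr_t)63);

	scrypt_simd_core1(databuf, scratch);  /* ROMix: mixes databuf in place */
	free(raw);
}
#endif
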
diff --git a/scrypt.c b/scrypt.c
index 2780b29..6f70cc2 100644
--- a/scrypt.c
+++ b/scrypt.c
@@ -35,6 +35,7 @@
#include <string.h>
#include "sha256-helpers.h"
+#include "scrypt-simd-helpers.h"
static void blkcpy(void *, void *, size_t);
static void blkxor(void *, void *, size_t);
@@ -219,7 +220,7 @@ smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
*/
-static void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratchpad)
+static void scrypt_1024_1_1_256_sp1(const char* input, char* output, char* scratchpad)
{
uint8_t * B;
uint32_t * V;
@@ -237,17 +238,21 @@ static void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratc
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
PBKDF2_SHA256((const uint8_t*)input, 80, (const uint8_t*)input, 80, 1, B, p * 128 * r);
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+ scrypt_simd_core1(B, XY);
+#else
/* 2: for i = 0 to p - 1 do */
for (i = 0; i < p; i++) {
/* 3: B_i <-- MF(B_i, N) */
smix(&B[i * 128 * r], r, N, V, XY);
}
+#endif
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
PBKDF2_SHA256((const uint8_t*)input, 80, B, p * 128 * r, 1, (uint8_t*)output, 32);
}
-int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
const unsigned char *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
@@ -266,7 +271,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
while(1) {
n++;
le32enc(nonce, n);
- scrypt_1024_1_1_256_sp(data, tmp_hash, scratchbuf);
+ scrypt_1024_1_1_256_sp1(data, tmp_hash, scratchbuf);
if (le32dec(tmp_hash+28) <= Htarg) {
be32enc(pdata + 64 + 12, n);
@@ -282,3 +287,105 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
return false;
}
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+
+static void
+scrypt_1024_1_1_256_sp2(const unsigned char * input1,
+ unsigned char * output1,
+ const unsigned char * input2,
+ unsigned char * output2,
+ unsigned char * scratchpad)
+{
+ uint8_t * B1, * B2;
+ uint8_t * V;
+
+ const uint32_t N = 1024;
+ const uint32_t r = 1;
+ const uint32_t p = 1;
+
+ B1 = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+ B2 = B1 + 128;
+ V = B2 + 128;
+
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
+
+ scrypt_simd_core2(B1, V);
+
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
+}
+
+int scanhash_scrypt2(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+ const unsigned char *ptarget,
+ uint32_t max_nonce, unsigned long *hashes_done)
+{
+ unsigned char data1[80];
+ unsigned char tmp_hash1[32];
+ unsigned char data2[80];
+ unsigned char tmp_hash2[32];
+ uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
+ uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+ uint32_t n = 0;
+ uint32_t Htarg = le32dec(ptarget + 28);
+ int i;
+
+ work_restart[thr_id].restart = 0;
+
+ for (i = 0; i < 80/4; i++) {
+ ((uint32_t *)data1)[i] = swab32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data2)[i] = swab32(((uint32_t *)pdata)[i]);
+ }
+
+ while(1) {
+ le32enc(nonce1, n + 1);
+ le32enc(nonce2, n + 2);
+ scrypt_1024_1_1_256_sp2(data1, tmp_hash1, data2, tmp_hash2, scratchbuf);
+
+ if (le32dec(tmp_hash1+28) <= Htarg) {
+ be32enc(pdata + 64 + 12, n + 1);
+ *hashes_done = n + 1;
+ return true;
+ }
+
+ if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 2);
+ *hashes_done = n + 2;
+ return true;
+ }
+
+ n += 2;
+
+ if (n >= max_nonce) {
+ *hashes_done = max_nonce;
+ break;
+ }
+
+ if (work_restart[thr_id].restart) {
+ *hashes_done = n;
+ break;
+ }
+ }
+ return false;
+}
+
+#endif
+
+int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+ const unsigned char *ptarget,
+ uint32_t max_nonce, unsigned long *hashes_done)
+{
+ /*
+ * TODO: maybe add a command line option or run benchmarks at start
+ * to select the fastest implementation?
+ */
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+ return scanhash_scrypt2(thr_id, pdata, scratchbuf, ptarget, max_nonce, hashes_done);
+#else
+ return scanhash_scrypt1(thr_id, pdata, scratchbuf, ptarget, max_nonce, hashes_done);
+#endif
+}
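
One way the TODO above could be approached, as a rough sketch rather than
anything this patch implements: time a handful of hashes through each kernel
at startup and let scanhash_scrypt cache the winner. The function name
prefer_two_way_kernel and the use of clock() are illustrative assumptions.

/*
 * Rough sketch for the TODO above (not part of the patch): both kernels are
 * timed over the same number of hashes and the faster one is reported.
 */
#ifdef HAVE_SCRYPT_SIMD_HELPERS
#include <time.h>

static int prefer_two_way_kernel(unsigned char *scratchbuf)
{
	unsigned char in[2 * 80] = { 0 };
	unsigned char out[2 * 32];
	clock_t t0, t1, t2;
	int i;

	t0 = clock();
	for (i = 0; i < 16; i++)   /* 16 hashes, one at a time */
		scrypt_1024_1_1_256_sp1((char *)in, (char *)out, (char *)scratchbuf);
	t1 = clock();
	for (i = 0; i < 8; i++)    /* 16 hashes, two at a time */
		scrypt_1024_1_1_256_sp2(in, out, in + 80, out + 32, scratchbuf);
	t2 = clock();

	return (t2 - t1) < (t1 - t0);  /* true if the two-way kernel was faster */
}
#endif

scanhash_scrypt could then call this once, cache the result in a static int,
and dispatch to scanhash_scrypt1 or scanhash_scrypt2 accordingly.
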