author     Siarhei Siamashka <siarhei.siamashka@gmail.com>  2011-12-29 01:38:05 +0200
committer  Siarhei Siamashka <siarhei.siamashka@gmail.com>  2011-12-29 06:46:17 +0200
commit     7a9481d266b58bd02a7d8033f359ca344cfd65d6
tree       ecc2a56b02db52a0466be9958ab8faee1f0a2b3b
parent     a0139edc7e2c3aa8252405ceff8a23a065965716
Use gcc vector extensions for SIMD scrypt key derivation function
The code can be compiled for different architectures from the same source
starting with gcc 4.7. The SSE2/Altivec/SPU targets additionally have
compatibility wrappers, which also allow older versions of gcc to be used.

Two hashes are processed at the same time, so a scratch buffer twice as
large is needed (~256K vs. ~128K).

Speedup on Cell PPU (32-bit), single thread, 3.2 GHz:
~0.58 khash/s -> ~1.79 khash/s
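
To make the commit message concrete, here is a minimal standalone sketch
(not part of the patch) of the gcc vector extension idiom that the new
scrypt-simd-helpers.h header builds on: a 4 x 32-bit vector type declared
with __attribute__((vector_size(16))) and an element-wise rotate-left,
which gcc 4.7+ can map to SSE2, Altivec, SPU or NEON instructions on the
respective targets (per the header's own comments). The rol_32x4 name
matches the helper defined in the patch; the main() driver is illustrative
only.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 4 x 32-bit SIMD vector, same typedef as in scrypt-simd-helpers.h */
    typedef uint32_t uint32x4 __attribute__ ((vector_size(16), aligned(16)));

    /* Element-wise rotate left by a scalar amount -- the core salsa20/8
     * operation. With gcc 4.7+ this generic form is compiled down to the
     * native SIMD shift instructions of the target. */
    static inline uint32x4 rol_32x4(uint32x4 a, uint32_t b)
    {
        return (a << b) ^ (a >> (32 - b));
    }

    int main(void)
    {
        uint32x4 x = { 1, 2, 3, 0x80000000 };
        uint32x4 y = rol_32x4(x, 7);   /* all four lanes rotated at once */

        printf("%08" PRIx32 " %08" PRIx32 " %08" PRIx32 " %08" PRIx32 "\n",
               y[0], y[1], y[2], y[3]);
        return 0;
    }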
-rw-r--r--  Makefile.am               3
-rw-r--r--  cpu-miner.c               2
-rw-r--r--  scrypt-simd-helpers.h   366
-rw-r--r--  scrypt.c                113
4 files changed, 479 insertions, 5 deletions
diff --git a/Makefile.am b/Makefile.am
index 8065f2f..5623a59 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,7 +14,8 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES)
bin_PROGRAMS = minerd
minerd_SOURCES = elist.h miner.h compat.h \
- cpu-miner.c util.c scrypt.c sha256-helpers.h
+ cpu-miner.c util.c scrypt.c sha256-helpers.h \
+ scrypt-simd-helpers.h
minerd_LDFLAGS = $(PTHREAD_FLAGS)
minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@
minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@
diff --git a/cpu-miner.c b/cpu-miner.c
index b3d1546..607c01d 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -531,7 +531,7 @@ static void *miner_thread(void *userdata)
if (opt_algo == ALGO_SCRYPT)
{
- scratchbuf = malloc(131583);
+ scratchbuf = malloc(2 * 131583);
max_nonce = 0xffff;
}
diff --git a/scrypt-simd-helpers.h b/scrypt-simd-helpers.h
new file mode 100644
index 0000000..322d718
--- /dev/null
+++ b/scrypt-simd-helpers.h
@@ -0,0 +1,366 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+/*
+ * This is the performance critical part of the scrypt key derivation
+ * function [1], implemented using gcc vector extensions [2]. The parameters
+ * are set to N = 1024, r = 1, p = 1 as used for the litecoin proof of work [3].
+ *
+ * The drawback is that these extensions are only supported by gcc and a few
+ * other compilers that try to be gcc-compatible (clang, path64, ...).
+ *
+ * The advantage is that this code works on any SIMD-capable hardware
+ * (x86 SSE2, PowerPC Altivec, Cell SPU, ARM NEON, ARM iWMMXt, ...) without
+ * modification when compiled with gcc 4.7. Older compiler versions are
+ * missing bits and pieces, but can still work for Altivec, SPU and SSE2
+ * with a bit of intrinsic band-aid.
+ *
+ * 1. http://www.tarsnap.com/scrypt.html
+ * 2. http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
+ * 3. https://github.com/coblee/litecoin/wiki/Scrypt-proof-of-work
+ */
+
+#ifndef __SCRYPT_SIMD_HELPERS_H__
+#define __SCRYPT_SIMD_HELPERS_H__
+
+#include <stdint.h>
+#include "sha256-helpers.h"
+
+#if defined(__GNUC__) && \
+ ((defined(__SSE2__) || defined(__ALTIVEC__) || defined(__SPU__)) || \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+
+#define HAVE_SCRYPT_SIMD_HELPERS
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __ALTIVEC__
+#include <altivec.h>
+#include <vec_types.h>
+#endif
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+typedef uint32_t uint32x4 __attribute__ ((vector_size(16), aligned(16)));
+typedef uint8_t uint8x16 __attribute__ ((vector_size(16), aligned(16)));
+
+/*
+ * Define two helper functions ('rol_32x4' and 'shuffle_32x4') to ensure
+ * better support for old gcc versions and gcc-compatible compilers
+ */
+static __attribute__((always_inline)) uint32x4
+rol_32x4(uint32x4 a, uint32_t b)
+{
+#ifdef __ALTIVEC__
+ return vec_rl(a, vec_splats(b));
+#elif defined(__SPU__)
+ return spu_rl(a, b);
+#elif defined(__SSE2__)
+ return (uint32x4)_mm_slli_epi32((__m128i)a, b) ^
+ (uint32x4)_mm_srli_epi32((__m128i)a, 32 - b);
+#else
+ return (a << b) ^ (a >> (32 - b));
+#endif
+}
+
+#if defined(__clang__)
+# define shuffle_32x4(a, p1, p2, p3, p4) \
+ __builtin_shufflevector(a, a, p1, p2, p3, p4)
+#elif defined(__SSE2__)
+# define shuffle_32x4(a, p1, p2, p3, p4) \
+ (uint32x4)_mm_shuffle_epi32((__m128i)a, _MM_SHUFFLE(p4, p3, p2, p1))
+#else
+static __attribute__((always_inline)) uint32x4
+shuffle_32x4(uint32x4 a, const int p1, const int p2, const int p3, const int p4)
+{
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+ /* gcc 4.7 introduces '__builtin_shuffle' */
+ const uint32x4 mask = { p1, p2, p3, p4 };
+ return __builtin_shuffle(a, mask);
+#elif defined(__SPU__)
+ const uint8x16 mask = {
+ p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
+ p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
+ p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
+ p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
+ };
+ return spu_shuffle(a, a, mask);
+#elif defined(__ALTIVEC__)
+ const uint8x16 mask = {
+ p1 * 4, p1 * 4 + 1, p1 * 4 + 2, p1 * 4 + 3,
+ p2 * 4, p2 * 4 + 1, p2 * 4 + 2, p2 * 4 + 3,
+ p3 * 4, p3 * 4 + 1, p3 * 4 + 2, p3 * 4 + 3,
+ p4 * 4, p4 * 4 + 1, p4 * 4 + 2, p4 * 4 + 3
+ };
+ return vec_perm(a, a, mask);
+#else
+# error No implementation available for the 'shuffle_32x4' inline function
+#endif
+}
+#endif
+
+/*****************************************************************************/
+
+static __attribute__((always_inline)) void
+blkcpy128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
+{
+ D[0] = S[0]; D[1] = S[1]; D[2] = S[2]; D[3] = S[3];
+ D[4] = S[4]; D[5] = S[5]; D[6] = S[6]; D[7] = S[7];
+}
+
+static __attribute__((always_inline)) void
+blkxor128(uint32x4 * __restrict D, const uint32x4 * __restrict S)
+{
+ D[0] ^= S[0]; D[1] ^= S[1]; D[2] ^= S[2]; D[3] ^= S[3];
+ D[4] ^= S[4]; D[5] ^= S[5]; D[6] ^= S[6]; D[7] ^= S[7];
+}
+
+/**
+ * salsa20_8_xor(B, Bx):
+ * XOR Bx into B and apply the salsa20/8 core to the result.
+ */
+static __attribute__((always_inline)) void
+salsa20_8_xor(uint32x4 * __restrict B, const uint32x4 * __restrict Bx)
+{
+ uint32x4 X0, X1, X2, X3;
+ int i;
+
+ X0 = (B[0] ^= Bx[0]);
+ X1 = (B[1] ^= Bx[1]);
+ X2 = (B[2] ^= Bx[2]);
+ X3 = (B[3] ^= Bx[3]);
+
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on "columns". */
+ X1 ^= rol_32x4(X0 + X3, 7);
+ X2 ^= rol_32x4(X1 + X0, 9);
+ X3 ^= rol_32x4(X2 + X1, 13);
+ X0 ^= rol_32x4(X3 + X2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+
+ /* Operate on "rows". */
+ X3 ^= rol_32x4(X0 + X1, 7);
+ X2 ^= rol_32x4(X3 + X0, 9);
+ X1 ^= rol_32x4(X2 + X3, 13);
+ X0 ^= rol_32x4(X1 + X2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+ }
+
+ B[0] += X0;
+ B[1] += X1;
+ B[2] += X2;
+ B[3] += X3;
+}
+
+static __attribute__((always_inline)) void
+salsa20_8_xor2(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
+ uint32x4 * __restrict C, const uint32x4 * __restrict Cx)
+{
+ uint32x4 X0, X1, X2, X3;
+ uint32x4 Y0, Y1, Y2, Y3;
+ int i;
+
+ X0 = (B[0] ^= Bx[0]);
+ X1 = (B[1] ^= Bx[1]);
+ X2 = (B[2] ^= Bx[2]);
+ X3 = (B[3] ^= Bx[3]);
+ Y0 = (C[0] ^= Cx[0]);
+ Y1 = (C[1] ^= Cx[1]);
+ Y2 = (C[2] ^= Cx[2]);
+ Y3 = (C[3] ^= Cx[3]);
+
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on "columns". */
+ X1 ^= rol_32x4(X0 + X3, 7);
+ Y1 ^= rol_32x4(Y0 + Y3, 7);
+ X2 ^= rol_32x4(X1 + X0, 9);
+ Y2 ^= rol_32x4(Y1 + Y0, 9);
+ X3 ^= rol_32x4(X2 + X1, 13);
+ Y3 ^= rol_32x4(Y2 + Y1, 13);
+ X0 ^= rol_32x4(X3 + X2, 18);
+ Y0 ^= rol_32x4(Y3 + Y2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+ Y1 = shuffle_32x4(Y1, 3, 0, 1, 2);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+ Y3 = shuffle_32x4(Y3, 1, 2, 3, 0);
+
+ /* Operate on "rows". */
+ X3 ^= rol_32x4(X0 + X1, 7);
+ Y3 ^= rol_32x4(Y0 + Y1, 7);
+ X2 ^= rol_32x4(X3 + X0, 9);
+ Y2 ^= rol_32x4(Y3 + Y0, 9);
+ X1 ^= rol_32x4(X2 + X3, 13);
+ Y1 ^= rol_32x4(Y2 + Y3, 13);
+ X0 ^= rol_32x4(X1 + X2, 18);
+ Y0 ^= rol_32x4(Y1 + Y2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+ Y1 = shuffle_32x4(Y1, 1, 2, 3, 0);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+ Y3 = shuffle_32x4(Y3, 3, 0, 1, 2);
+ }
+
+ B[0] += X0;
+ B[1] += X1;
+ B[2] += X2;
+ B[3] += X3;
+ C[0] += Y0;
+ C[1] += Y1;
+ C[2] += Y2;
+ C[3] += Y3;
+}
+
+/* Helps to prevent the violation of strict aliasing rules */
+typedef union { uint32x4 q[8]; uint32_t w[32]; } XY;
+
+/**
+ * The most performance critical part of scrypt (N = 1024, r = 1, p = 1).
+ * Handles one hash at a time. This is likely the best choice on hardware
+ * with small L1/L2 caches and slow memory.
+ *
+ * databuf - 128-byte buffer for data input and output
+ * scratch - temporary buffer, which must be at least
+ *           (128 + 128 * 1024) bytes in size
+ *
+ * All buffers must be aligned to a 64-byte boundary.
+ */
+static inline
+void scrypt_simd_core1(uint8_t databuf[128], void * scratch)
+{
+ XY * X = (XY *)((uintptr_t)scratch + 0);
+ uint32x4 * V = (uint32x4 *)((uintptr_t)scratch + 128);
+ int i, j;
+
+ /* 1: X <-- B */
+ for (i = 0; i < 16; i++) {
+ X->w[i] = le32dec(&databuf[(i * 5 % 16) * 4]);
+ X->w[16 + i] = le32dec(&databuf[(16 + (i * 5 % 16)) * 4]);
+ }
+
+ /* 2: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ blkcpy128(&V[i * 8], &X->q[0]);
+ salsa20_8_xor(&X->q[0], &X->q[4]);
+ salsa20_8_xor(&X->q[4], &X->q[0]);
+ }
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ j = X->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ blkxor128(X->q, &V[j * 8]);
+ salsa20_8_xor(&X->q[0], &X->q[4]);
+ salsa20_8_xor(&X->q[4], &X->q[0]);
+ }
+
+ /* 10: B' <-- X */
+ for (i = 0; i < 16; i++) {
+ le32enc(&databuf[(i * 5 % 16) * 4], X->w[i]);
+ le32enc(&databuf[(16 + (i * 5 % 16)) * 4], X->w[16 + i]);
+ }
+}
+
+/**
+ * The most performance critical part of scrypt (N = 1024, r = 1, p = 1).
+ * Handles two hashes at a time. This is likely the better choice when the
+ * instructions have high latencies, but it needs many registers and a
+ * large L2 cache.
+ *
+ * databuf - two 128-byte buffers for data input and output
+ * scratch - temporary buffer, which must be at least
+ *           (2 * 128 + 2 * 128 * 1024) bytes in size
+ *
+ * All buffers must be aligned to a 64-byte boundary.
+ */
+static inline
+void scrypt_simd_core2(uint8_t databuf[2 * 128], void * scratch)
+{
+ uint8_t * databufA = &databuf[0];
+ uint8_t * databufB = &databuf[128];
+ XY * XA = (XY *)((uintptr_t)scratch);
+ XY * XB = (XY *)((uintptr_t)scratch + 128 + 128 * 1024);
+ uint32x4 * VA = (uint32x4 *)((uintptr_t)XA + 128);
+ uint32x4 * VB = (uint32x4 *)((uintptr_t)XB + 128);
+ int i, jA, jB;
+
+ /* 1: X <-- B */
+ for (i = 0; i < 16; i++) {
+ XA->w[i] = le32dec(&databufA[(i * 5 % 16) * 4]);
+ XA->w[16 + i] = le32dec(&databufA[(16 + (i * 5 % 16)) * 4]);
+ XB->w[i] = le32dec(&databufB[(i * 5 % 16) * 4]);
+ XB->w[16 + i] = le32dec(&databufB[(16 + (i * 5 % 16)) * 4]);
+ }
+
+ /* 2: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ blkcpy128(&VA[i * 8], &XA->q[0]);
+ blkcpy128(&VB[i * 8], &XB->q[0]);
+ salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
+ salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
+ }
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ jA = XA->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ jB = XB->w[16] & 1023; /* j <-- Integerify(X) mod N */
+ blkxor128(XA->q, &VA[jA * 8]);
+ blkxor128(XB->q, &VB[jB * 8]);
+ salsa20_8_xor2(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4]);
+ salsa20_8_xor2(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0]);
+ }
+
+ /* 10: B' <-- X */
+ for (i = 0; i < 16; i++) {
+ le32enc(&databufA[(i * 5 % 16) * 4], XA->w[i]);
+ le32enc(&databufA[(16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
+ le32enc(&databufB[(i * 5 % 16) * 4], XB->w[i]);
+ le32enc(&databufB[(16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
+ }
+}
+
+#endif
+#endif
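
As a usage note (not part of the patch), the sketch below illustrates the
buffer contract documented in the header above for scrypt_simd_core2: the
caller passes a 2 * 128 byte data buffer and a scratch area of at least
2 * 128 + 2 * 128 * 1024 bytes, both aligned to a 64-byte boundary. The
run_two_hashes() wrapper name and the posix_memalign() allocation are
assumptions made for illustration; the miner itself over-allocates with
malloc() and rounds the pointer up to a 64-byte boundary, as the scrypt.c
changes below show.

    #include <stdint.h>
    #include <stdlib.h>

    #include "scrypt-simd-helpers.h"

    #ifdef HAVE_SCRYPT_SIMD_HELPERS
    /* Hypothetical wrapper: run the two-way scrypt core on two 128-byte
     * blocks that have already been filled by PBKDF2_SHA256 (see scrypt.c).
     * The caller must provide 'databuf' aligned to a 64-byte boundary. */
    static int run_two_hashes(uint8_t databuf[2 * 128])
    {
        void *scratch;

        /* 2 * 128 bytes for the two X blocks plus 2 * 128 * 1024 bytes
         * for the two V arrays, 64-byte aligned as required above */
        if (posix_memalign(&scratch, 64, 2 * 128 + 2 * 128 * 1024) != 0)
            return -1;

        scrypt_simd_core2(databuf, scratch);  /* both lanes advance together */

        free(scratch);
        return 0;
    }
    #endif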
diff --git a/scrypt.c b/scrypt.c
index 2780b29..6f70cc2 100644
--- a/scrypt.c
+++ b/scrypt.c
@@ -35,6 +35,7 @@
#include <string.h>
#include "sha256-helpers.h"
+#include "scrypt-simd-helpers.h"
static void blkcpy(void *, void *, size_t);
static void blkxor(void *, void *, size_t);
@@ -219,7 +220,7 @@ smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
*/
-static void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratchpad)
+static void scrypt_1024_1_1_256_sp1(const char* input, char* output, char* scratchpad)
{
uint8_t * B;
uint32_t * V;
@@ -237,17 +238,21 @@ static void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratc
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
PBKDF2_SHA256((const uint8_t*)input, 80, (const uint8_t*)input, 80, 1, B, p * 128 * r);
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+ scrypt_simd_core1(B, XY);
+#else
/* 2: for i = 0 to p - 1 do */
for (i = 0; i < p; i++) {
/* 3: B_i <-- MF(B_i, N) */
smix(&B[i * 128 * r], r, N, V, XY);
}
+#endif
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
PBKDF2_SHA256((const uint8_t*)input, 80, B, p * 128 * r, 1, (uint8_t*)output, 32);
}
-int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
const unsigned char *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
@@ -266,7 +271,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
while(1) {
n++;
le32enc(nonce, n);
- scrypt_1024_1_1_256_sp(data, tmp_hash, scratchbuf);
+ scrypt_1024_1_1_256_sp1(data, tmp_hash, scratchbuf);
if (le32dec(tmp_hash+28) <= Htarg) {
be32enc(pdata + 64 + 12, n);
@@ -282,3 +287,105 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
return false;
}
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+
+static void
+scrypt_1024_1_1_256_sp2(const unsigned char * input1,
+ unsigned char * output1,
+ const unsigned char * input2,
+ unsigned char * output2,
+ unsigned char * scratchpad)
+{
+ uint8_t * B1, * B2;
+ uint8_t * V;
+
+ const uint32_t N = 1024;
+ const uint32_t r = 1;
+ const uint32_t p = 1;
+
+ B1 = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+ B2 = B1 + 128;
+ V = B2 + 128;
+
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
+
+ scrypt_simd_core2(B1, V);
+
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
+}
+
+int scanhash_scrypt2(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+ const unsigned char *ptarget,
+ uint32_t max_nonce, unsigned long *hashes_done)
+{
+ unsigned char data1[80];
+ unsigned char tmp_hash1[32];
+ unsigned char data2[80];
+ unsigned char tmp_hash2[32];
+ uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
+ uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+ uint32_t n = 0;
+ uint32_t Htarg = le32dec(ptarget + 28);
+ int i;
+
+ work_restart[thr_id].restart = 0;
+
+ for (i = 0; i < 80/4; i++) {
+ ((uint32_t *)data1)[i] = swab32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data2)[i] = swab32(((uint32_t *)pdata)[i]);
+ }
+
+ while(1) {
+ le32enc(nonce1, n + 1);
+ le32enc(nonce2, n + 2);
+ scrypt_1024_1_1_256_sp2(data1, tmp_hash1, data2, tmp_hash2, scratchbuf);
+
+ if (le32dec(tmp_hash1+28) <= Htarg) {
+ be32enc(pdata + 64 + 12, n + 1);
+ *hashes_done = n + 1;
+ return true;
+ }
+
+ if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 2);
+ *hashes_done = n + 2;
+ return true;
+ }
+
+ n += 2;
+
+ if (n >= max_nonce) {
+ *hashes_done = max_nonce;
+ break;
+ }
+
+ if (work_restart[thr_id].restart) {
+ *hashes_done = n;
+ break;
+ }
+ }
+ return false;
+}
+
+#endif
+
+int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+ const unsigned char *ptarget,
+ uint32_t max_nonce, unsigned long *hashes_done)
+{
+ /*
+ * TODO: maybe add a command line option or run benchmarks at start
+ * to select the fastest implementation?
+ */
+#ifdef HAVE_SCRYPT_SIMD_HELPERS
+ return scanhash_scrypt2(thr_id, pdata, scratchbuf, ptarget, max_nonce, hashes_done);
+#else
+ return scanhash_scrypt1(thr_id, pdata, scratchbuf, ptarget, max_nonce, hashes_done);
+#endif
+}