 Makefile.am       |  11
 configure.ac      |  27
 cpu-miner.c       |  46
 miner.h           |  11
 scrypt-cell-spu.c | 523
 scrypt-cell-spu.h |  44
 6 files changed, 656 insertions(+), 6 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index 5623a59..a981f2f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -20,3 +20,14 @@ minerd_LDFLAGS = $(PTHREAD_FLAGS)
minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@
minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@
+if HAVE_CELL_SPU
+
+scrypt-cell-spu.o: scrypt-cell-spu.c sha256-helpers.h \
+ scrypt-simd-helpers.h scrypt-cell-spu.h
+ $(SPU_ELF_GCC) -O3 -fstrict-aliasing -Wall -Wstrict-aliasing \
+ -o scrypt-cell-spu.elf scrypt-cell-spu.c
+ $(EMBEDSPU) scrypt_spu scrypt-cell-spu.elf scrypt-cell-spu.o
+
+minerd_LDADD += scrypt-cell-spu.o @SPE2_LIBS@
+
+endif
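
The SPU kernel is built in two steps: spu-elf-gcc (or spu-gcc) cross-compiles scrypt-cell-spu.c into a standalone SPU ELF image, and embedspu then wraps that image into an ordinary PPU object file exporting a spe_program_handle_t symbol named scrypt_spu, which the final minerd link pulls in like any other object.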
diff --git a/configure.ac b/configure.ac
index 3b0733f..a94d74f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -32,6 +32,26 @@ case $target in
have_win32=false
PTHREAD_FLAGS="-pthread"
;;
+ powerpc*)
+ have_x86_64=false
+ have_win32=false
+ PTHREAD_FLAGS="-pthread"
+ AC_CHECK_LIB(spe2, spe_context_create, [SPE2_LIBS=-lspe2
+ have_cell_spu=true])
+ AC_CHECK_PROGS(SPU_ELF_GCC,[spu-elf-gcc spu-gcc], "false")
+ AC_CHECK_PROGS(EMBEDSPU,[embedspu ppu-embedspu], "false")
+
+ if test x$have_cell_spu = xtrue; then
+ if test x$SPU_ELF_GCC = xfalse; then
+ echo "Can't find spu-elf-gcc or spu-gcc tool"
+ exit 1
+ fi
+ if test x$EMBEDSPU = xfalse; then
+ echo "Can't find embedspu or ppu-embedspu tool"
+ exit 1
+ fi
+ fi
+ ;;
*)
have_x86_64=false
have_win32=false
@@ -39,13 +59,17 @@ case $target in
;;
esac
-
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
AC_CHECK_LIB(pthread, pthread_create, PTHREAD_LIBS=-lpthread)
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([HAVE_x86_64], [test x$have_x86_64 = xtrue])
+AM_CONDITIONAL([HAVE_CELL_SPU], [test x$have_cell_spu = xtrue])
+
+if test x$have_cell_spu = xtrue ; then
+ AC_DEFINE([HAVE_CELL_SPU], [1], [Can use Cell/BE acceleration])
+fi
if test x$request_jansson = xtrue
then
@@ -62,6 +86,7 @@ LIBCURL_CHECK_CONFIG(, 7.10.1, ,
AC_SUBST(JANSSON_LIBS)
AC_SUBST(PTHREAD_FLAGS)
AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(SPE2_LIBS)
AC_CONFIG_FILES([
Makefile
diff --git a/cpu-miner.c b/cpu-miner.c
index 607c01d..8ab0cf9 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -102,6 +102,7 @@ static const bool opt_time = true;
static enum sha256_algos opt_algo = ALGO_SCRYPT;
static int opt_n_threads;
static int num_processors;
+static int num_cell_spu; /* number of usable SPU cores on Cell/BE (6 on a PS3) */
static char *rpc_url;
static char *rpc_userpass;
static char *rpc_user, *rpc_pass;
@@ -511,6 +512,14 @@ err_out:
return false;
}
+#ifdef HAVE_CELL_SPU
+#include "scrypt-cell-spu.h"
+/* Each SPU core processes 8 hashes at once and needs 8x the scratch memory */
+#define SCRATCHBUF_SIZE (131583 * 8)
+#else
+#define SCRATCHBUF_SIZE (131583 * 2)
+#endif
+
static void *miner_thread(void *userdata)
{
struct thr_info *mythr = userdata;
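
A quick sizing check for the SPU case: the buffer has to cover up to 127 bytes of alignment slack, the argument block (the SPU kernel places its scratch area at argp + 1024), and eight 128 KiB scratchpads, i.e. at most 127 + 1024 + 8 * 131072 = 1049727 bytes, which fits within SCRATCHBUF_SIZE = 131583 * 8 = 1052664.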
@@ -531,7 +540,7 @@ static void *miner_thread(void *userdata)
if (opt_algo == ALGO_SCRYPT)
{
- scratchbuf = malloc(2 * 131583);
+ scratchbuf = malloc(SCRATCHBUF_SIZE);
max_nonce = 0xffff;
}
@@ -556,6 +565,24 @@ static void *miner_thread(void *userdata)
/* scan nonces for a proof-of-work hash */
switch (opt_algo) {
case ALGO_SCRYPT:
+#ifdef HAVE_CELL_SPU
+ if (mythr->spe_context) {
+ scanhash_spu_args *argp = (scanhash_spu_args *)
+ (((uintptr_t)scratchbuf + 127) & ~(uintptr_t)127);
+ spe_stop_info_t stop_info;
+ unsigned int entry = SPE_DEFAULT_ENTRY;
+ memcpy(argp->data, work.data, sizeof(work.data));
+ memcpy(argp->target, work.target, sizeof(work.target));
+ argp->max_nonce = max_nonce;
+ work_restart[thr_id].restart = 0;
+ spe_context_run(mythr->spe_context, &entry, 0, argp,
+ (void *)&work_restart[thr_id].restart, &stop_info);
+ hashes_done = argp->hashes_done;
+ memcpy(work.data, argp->data, sizeof(work.data));
+ rc = stop_info.result.spe_exit_code;
+ break;
+ }
+#endif
rc = scanhash_scrypt(thr_id, work.data, scratchbuf,
work.target, max_nonce, &hashes_done);
break;
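
Note how the two otherwise-unused spe_context_run parameters are put to work here: argp carries the 128-byte-aligned argument block carved out of the start of scratchbuf, and envp carries the effective address of this thread's restart flag, so the SPU can poll for a work restart by DMA instead of having to be signalled. The scan result comes back as the SPU program's exit code.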
@@ -790,13 +817,20 @@ static void parse_arg (int key, char *arg)
show_usage();
}
+#ifdef HAVE_CELL_SPU
+ num_cell_spu = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
+#endif
#ifdef WIN32
if (!opt_n_threads)
opt_n_threads = 1;
#else
num_processors = sysconf(_SC_NPROCESSORS_ONLN);
- if (!opt_n_threads)
+ if (!opt_n_threads) {
opt_n_threads = num_processors;
+ /* If we have SPU cores, start an additional thread for each */
+ if (num_cell_spu > 0)
+ opt_n_threads += num_cell_spu;
+ }
#endif /* !WIN32 */
}
@@ -922,7 +956,13 @@ int main (int argc, char *argv[])
thr->q = tq_new();
if (!thr->q)
return 1;
-
+#ifdef HAVE_CELL_SPU
+ /* The first 'num_cell_spu' threads are allocated for SPU */
+ if (i < num_cell_spu) {
+ thr->spe_context = spe_context_create(0, NULL);
+ spe_program_load(thr->spe_context, &scrypt_spu);
+ }
+#endif
if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) {
applog(LOG_ERR, "thread %d create failed", i);
return 1;
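
Putting the pieces together, the complete host-side lifecycle of one SPU worker looks roughly like the sketch below. This is not part of the patch: the run_spu_scan name is invented here, and the error handling and spe_context_destroy teardown are additions the patch itself never performs.

    #include <stdint.h>
    #include <libspe2.h>
    #include "scrypt-cell-spu.h"

    extern spe_program_handle_t scrypt_spu;  /* PPU object emitted by embedspu */

    /* argp must be 128-byte aligned, with the 8 x 128 KiB scratch area
     * starting at argp + 1024; restart is the flag the SPU polls via DMA. */
    static int run_spu_scan(scanhash_spu_args *argp, volatile unsigned int *restart)
    {
        spe_context_ptr_t ctx = spe_context_create(0, NULL);
        unsigned int entry = SPE_DEFAULT_ENTRY;
        spe_stop_info_t stop_info;

        if (!ctx || spe_program_load(ctx, &scrypt_spu))
            return -1;
        /* Blocks until the SPU program exits; argp and envp arrive
         * as the second and third arguments of the SPU-side main(). */
        spe_context_run(ctx, &entry, 0, argp, (void *)restart, &stop_info);
        spe_context_destroy(ctx);
        return stop_info.result.spe_exit_code;
    }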
diff --git a/miner.h b/miner.h
index 5d46209..dca0da7 100644
--- a/miner.h
+++ b/miner.h
@@ -36,6 +36,10 @@ void *alloca (size_t);
# endif
#endif
+#ifdef HAVE_CELL_SPU
+#include <libspe2.h>
+extern spe_program_handle_t scrypt_spu;
+#endif
#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#define WANT_BUILTIN_BSWAP
@@ -92,6 +96,9 @@ enum {
struct thr_info {
int id;
pthread_t pth;
+#ifdef HAVE_CELL_SPU
+ spe_context_ptr_t spe_context;
+#endif
struct thread_q *q;
};
@@ -142,8 +149,8 @@ extern bool have_longpoll;
struct thread_q;
struct work_restart {
- volatile unsigned long restart;
- char padding[128 - sizeof(unsigned long)];
+ volatile unsigned int restart;
+ char padding[128 - sizeof(unsigned int)];
};
extern pthread_mutex_t time_lock;
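
The restart field is narrowed from unsigned long to unsigned int to match the SPU side, which polls the flag with a fixed 4-byte DMA get; on a 64-bit PPU an unsigned long is 8 bytes, so the old type and the transfer size would disagree.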
diff --git a/scrypt-cell-spu.c b/scrypt-cell-spu.c
new file mode 100644
index 0000000..a5a741f
--- /dev/null
+++ b/scrypt-cell-spu.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+#include "sha256-helpers.h"
+#include "scrypt-simd-helpers.h"
+#include "scrypt-cell-spu.h"
+
+#define true 1
+#define false 0
+
+/*****************************************************************************/
+
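+/*
+ * Four-way interleaved salsa20/8: B, C, D and E are four independent
+ * salsa20 states advanced in lockstep. A single state would stall on the
+ * add -> rotate -> xor dependency chain, so interleaving four keeps the
+ * SPU's wide registers and even/odd pipelines busy.
+ */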
+static __attribute__((always_inline)) void
+salsa20_8_xor4(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
+ uint32x4 * __restrict C, const uint32x4 * __restrict Cx,
+ uint32x4 * __restrict D, const uint32x4 * __restrict Dx,
+ uint32x4 * __restrict E, const uint32x4 * __restrict Ex)
+{
+ uint32x4 X0, X1, X2, X3;
+ uint32x4 Y0, Y1, Y2, Y3;
+ uint32x4 Z0, Z1, Z2, Z3;
+ uint32x4 W0, W1, W2, W3;
+ int i;
+
+ X0 = (B[0] ^= Bx[0]);
+ X1 = (B[1] ^= Bx[1]);
+ X2 = (B[2] ^= Bx[2]);
+ X3 = (B[3] ^= Bx[3]);
+ Y0 = (C[0] ^= Cx[0]);
+ Y1 = (C[1] ^= Cx[1]);
+ Y2 = (C[2] ^= Cx[2]);
+ Y3 = (C[3] ^= Cx[3]);
+ Z0 = (D[0] ^= Dx[0]);
+ Z1 = (D[1] ^= Dx[1]);
+ Z2 = (D[2] ^= Dx[2]);
+ Z3 = (D[3] ^= Dx[3]);
+ W0 = (E[0] ^= Ex[0]);
+ W1 = (E[1] ^= Ex[1]);
+ W2 = (E[2] ^= Ex[2]);
+ W3 = (E[3] ^= Ex[3]);
+
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on "columns". */
+ X1 ^= rol_32x4(X0 + X3, 7);
+ Y1 ^= rol_32x4(Y0 + Y3, 7);
+ Z1 ^= rol_32x4(Z0 + Z3, 7);
+ W1 ^= rol_32x4(W0 + W3, 7);
+ X2 ^= rol_32x4(X1 + X0, 9);
+ Y2 ^= rol_32x4(Y1 + Y0, 9);
+ Z2 ^= rol_32x4(Z1 + Z0, 9);
+ W2 ^= rol_32x4(W1 + W0, 9);
+ X3 ^= rol_32x4(X2 + X1, 13);
+ Y3 ^= rol_32x4(Y2 + Y1, 13);
+ Z3 ^= rol_32x4(Z2 + Z1, 13);
+ W3 ^= rol_32x4(W2 + W1, 13);
+ X0 ^= rol_32x4(X3 + X2, 18);
+ Y0 ^= rol_32x4(Y3 + Y2, 18);
+ Z0 ^= rol_32x4(Z3 + Z2, 18);
+ W0 ^= rol_32x4(W3 + W2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+ Y1 = shuffle_32x4(Y1, 3, 0, 1, 2);
+ Z1 = shuffle_32x4(Z1, 3, 0, 1, 2);
+ W1 = shuffle_32x4(W1, 3, 0, 1, 2);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ Z2 = shuffle_32x4(Z2, 2, 3, 0, 1);
+ W2 = shuffle_32x4(W2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+ Y3 = shuffle_32x4(Y3, 1, 2, 3, 0);
+ Z3 = shuffle_32x4(Z3, 1, 2, 3, 0);
+ W3 = shuffle_32x4(W3, 1, 2, 3, 0);
+
+ /* Operate on "rows". */
+ X3 ^= rol_32x4(X0 + X1, 7);
+ Y3 ^= rol_32x4(Y0 + Y1, 7);
+ Z3 ^= rol_32x4(Z0 + Z1, 7);
+ W3 ^= rol_32x4(W0 + W1, 7);
+ X2 ^= rol_32x4(X3 + X0, 9);
+ Y2 ^= rol_32x4(Y3 + Y0, 9);
+ Z2 ^= rol_32x4(Z3 + Z0, 9);
+ W2 ^= rol_32x4(W3 + W0, 9);
+ X1 ^= rol_32x4(X2 + X3, 13);
+ Y1 ^= rol_32x4(Y2 + Y3, 13);
+ Z1 ^= rol_32x4(Z2 + Z3, 13);
+ W1 ^= rol_32x4(W2 + W3, 13);
+ X0 ^= rol_32x4(X1 + X2, 18);
+ Y0 ^= rol_32x4(Y1 + Y2, 18);
+ Z0 ^= rol_32x4(Z1 + Z2, 18);
+ W0 ^= rol_32x4(W1 + W2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+ Y1 = shuffle_32x4(Y1, 1, 2, 3, 0);
+ Z1 = shuffle_32x4(Z1, 1, 2, 3, 0);
+ W1 = shuffle_32x4(W1, 1, 2, 3, 0);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ Z2 = shuffle_32x4(Z2, 2, 3, 0, 1);
+ W2 = shuffle_32x4(W2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+ Y3 = shuffle_32x4(Y3, 3, 0, 1, 2);
+ Z3 = shuffle_32x4(Z3, 3, 0, 1, 2);
+ W3 = shuffle_32x4(W3, 3, 0, 1, 2);
+ }
+
+ B[0] += X0;
+ B[1] += X1;
+ B[2] += X2;
+ B[3] += X3;
+ C[0] += Y0;
+ C[1] += Y1;
+ C[2] += Y2;
+ C[3] += Y3;
+ D[0] += Z0;
+ D[1] += Z1;
+ D[2] += Z2;
+ D[3] += Z3;
+ E[0] += W0;
+ E[1] += W1;
+ E[2] += W2;
+ E[3] += W3;
+}
+
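+/*
+ * One scrypt core pass for 8 hashes (lanes A-H). The eight 128 KiB
+ * scratchpads (V) live in main memory and are streamed with tagged DMA
+ * lists; in the second loop tag 1 covers lanes A-D and tag 2 lanes E-H,
+ * so the MFC fetches one half while salsa20/8 runs on the other.
+ */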
+static void
+scrypt_spu_core8(uint8_t *databuf, uint64_t scratch)
+{
+ static mfc_list_element_t dma_list[8] __attribute__((aligned(128)));
+ static XY X[8] __attribute__((aligned(128)));
+ static uint32x4 Y[8 * 8] __attribute__((aligned(128)));
+ static uint32x4 Z[8 * 8] __attribute__((aligned(128)));
+ XY * XA = &X[0];
+ XY * XB = &X[1];
+ XY * XC = &X[2];
+ XY * XD = &X[3];
+ XY * XE = &X[4];
+ XY * XF = &X[5];
+ XY * XG = &X[6];
+ XY * XH = &X[7];
+
+ uint64_t VA = (scratch + 128 * 1024 * 0);
+ uint64_t VB = (scratch + 128 * 1024 * 1);
+ uint64_t VC = (scratch + 128 * 1024 * 2);
+ uint64_t VD = (scratch + 128 * 1024 * 3);
+ uint64_t VE = (scratch + 128 * 1024 * 4);
+ uint64_t VF = (scratch + 128 * 1024 * 5);
+ uint64_t VG = (scratch + 128 * 1024 * 6);
+ uint64_t VH = (scratch + 128 * 1024 * 7);
+ int i;
+ int tag1 = 1, tag_mask1 = 1 << tag1;
+ int tag2 = 2, tag_mask2 = 1 << tag2;
+
+ /* 1: X <-- B */
+ for (i = 0; i < 16; i++) {
+ XA->w[i] = le32dec(&databuf[0 * 128 + (i * 5 % 16) * 4]);
+ XA->w[16 + i] = le32dec(&databuf[0 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XB->w[i] = le32dec(&databuf[1 * 128 + (i * 5 % 16) * 4]);
+ XB->w[16 + i] = le32dec(&databuf[1 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XC->w[i] = le32dec(&databuf[2 * 128 + (i * 5 % 16) * 4]);
+ XC->w[16 + i] = le32dec(&databuf[2 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XD->w[i] = le32dec(&databuf[3 * 128 + (i * 5 % 16) * 4]);
+ XD->w[16 + i] = le32dec(&databuf[3 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XE->w[i] = le32dec(&databuf[4 * 128 + (i * 5 % 16) * 4]);
+ XE->w[16 + i] = le32dec(&databuf[4 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XF->w[i] = le32dec(&databuf[5 * 128 + (i * 5 % 16) * 4]);
+ XF->w[16 + i] = le32dec(&databuf[5 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XG->w[i] = le32dec(&databuf[6 * 128 + (i * 5 % 16) * 4]);
+ XG->w[16 + i] = le32dec(&databuf[6 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XH->w[i] = le32dec(&databuf[7 * 128 + (i * 5 % 16) * 4]);
+ XH->w[16 + i] = le32dec(&databuf[7 * 128 + (16 + (i * 5 % 16)) * 4]);
+ }
+ for (i = 0; i < 8; i++)
+ dma_list[i].size = 128;
+
+ /* 2: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ blkcpy128(&Z[0 * 8], &XA->q[0]);
+ blkcpy128(&Z[1 * 8], &XB->q[0]);
+ blkcpy128(&Z[2 * 8], &XC->q[0]);
+ blkcpy128(&Z[3 * 8], &XD->q[0]);
+ blkcpy128(&Z[4 * 8], &XE->q[0]);
+ blkcpy128(&Z[5 * 8], &XF->q[0]);
+ blkcpy128(&Z[6 * 8], &XG->q[0]);
+ blkcpy128(&Z[7 * 8], &XH->q[0]);
+ dma_list[0].eal = mfc_ea2l(VA + i * 128);
+ dma_list[1].eal = mfc_ea2l(VB + i * 128);
+ dma_list[2].eal = mfc_ea2l(VC + i * 128);
+ dma_list[3].eal = mfc_ea2l(VD + i * 128);
+ dma_list[4].eal = mfc_ea2l(VE + i * 128);
+ dma_list[5].eal = mfc_ea2l(VF + i * 128);
+ dma_list[6].eal = mfc_ea2l(VG + i * 128);
+ dma_list[7].eal = mfc_ea2l(VH + i * 128);
+ mfc_putl(&Z[0], scratch, &dma_list[0], 8 * sizeof(mfc_list_element_t), tag1, 0, 0);
+ salsa20_8_xor4(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4], &XC->q[0], &XC->q[4], &XD->q[0], &XD->q[4]);
+ salsa20_8_xor4(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0], &XC->q[4], &XC->q[0], &XD->q[4], &XD->q[0]);
+ salsa20_8_xor4(&XE->q[0], &XE->q[4], &XF->q[0], &XF->q[4], &XG->q[0], &XG->q[4], &XH->q[0], &XH->q[4]);
+ salsa20_8_xor4(&XE->q[4], &XE->q[0], &XF->q[4], &XF->q[0], &XG->q[4], &XG->q[0], &XH->q[4], &XH->q[0]);
+ mfc_write_tag_mask(tag_mask1);
+ mfc_read_tag_status_all();
+ }
+
+ dma_list[0].eal = mfc_ea2l(VA + (XA->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[1].eal = mfc_ea2l(VB + (XB->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[2].eal = mfc_ea2l(VC + (XC->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[3].eal = mfc_ea2l(VD + (XD->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ mfc_getl(&Y[0], scratch, &dma_list[0], 4 * sizeof(mfc_list_element_t), tag1, 0, 0);
+
+ dma_list[4].eal = mfc_ea2l(VE + (XE->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[5].eal = mfc_ea2l(VF + (XF->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[6].eal = mfc_ea2l(VG + (XG->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[7].eal = mfc_ea2l(VH + (XH->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ mfc_getl(&Y[4 * 8], scratch, &dma_list[4], 4 * sizeof(mfc_list_element_t), tag2, 0, 0);
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ mfc_write_tag_mask(tag_mask1);
+ mfc_read_tag_status_all();
+ blkxor128(XA->q, &Y[0 * 4]);
+ blkxor128(XB->q, &Y[1 * 8]);
+ blkxor128(XC->q, &Y[2 * 8]);
+ blkxor128(XD->q, &Y[3 * 8]);
+ salsa20_8_xor4(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4], &XC->q[0], &XC->q[4], &XD->q[0], &XD->q[4]);
+ salsa20_8_xor4(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0], &XC->q[4], &XC->q[0], &XD->q[4], &XD->q[0]);
+
+ dma_list[0].eal = mfc_ea2l(VA + (XA->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[1].eal = mfc_ea2l(VB + (XB->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[2].eal = mfc_ea2l(VC + (XC->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[3].eal = mfc_ea2l(VD + (XD->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ mfc_getl(&Y[0], scratch, &dma_list[0], 4 * sizeof(mfc_list_element_t), tag1, 0, 0);
+
+ mfc_write_tag_mask(tag_mask2);
+ mfc_read_tag_status_all();
+ blkxor128(XE->q, &Y[4 * 8]);
+ blkxor128(XF->q, &Y[5 * 8]);
+ blkxor128(XG->q, &Y[6 * 8]);
+ blkxor128(XH->q, &Y[7 * 8]);
+ salsa20_8_xor4(&XE->q[0], &XE->q[4], &XF->q[0], &XF->q[4], &XG->q[0], &XG->q[4], &XH->q[0], &XH->q[4]);
+ salsa20_8_xor4(&XE->q[4], &XE->q[0], &XF->q[4], &XF->q[0], &XG->q[4], &XG->q[0], &XH->q[4], &XH->q[0]);
+
+ dma_list[4].eal = mfc_ea2l(VE + (XE->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[5].eal = mfc_ea2l(VF + (XF->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[6].eal = mfc_ea2l(VG + (XG->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[7].eal = mfc_ea2l(VH + (XH->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ mfc_getl(&Y[4 * 8], scratch, &dma_list[4], 4 * sizeof(mfc_list_element_t), tag2, 0, 0);
+ }
+
+ /* 10: B' <-- X */
+ for (i = 0; i < 16; i++) {
+ le32enc(&databuf[0 * 128 + (i * 5 % 16) * 4], XA->w[i]);
+ le32enc(&databuf[0 * 128 + (16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
+ le32enc(&databuf[1 * 128 + (i * 5 % 16) * 4], XB->w[i]);
+ le32enc(&databuf[1 * 128 + (16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
+ le32enc(&databuf[2 * 128 + (i * 5 % 16) * 4], XC->w[i]);
+ le32enc(&databuf[2 * 128 + (16 + (i * 5 % 16)) * 4], XC->w[16 + i]);
+ le32enc(&databuf[3 * 128 + (i * 5 % 16) * 4], XD->w[i]);
+ le32enc(&databuf[3 * 128 + (16 + (i * 5 % 16)) * 4], XD->w[16 + i]);
+ le32enc(&databuf[4 * 128 + (i * 5 % 16) * 4], XE->w[i]);
+ le32enc(&databuf[4 * 128 + (16 + (i * 5 % 16)) * 4], XE->w[16 + i]);
+ le32enc(&databuf[5 * 128 + (i * 5 % 16) * 4], XF->w[i]);
+ le32enc(&databuf[5 * 128 + (16 + (i * 5 % 16)) * 4], XF->w[16 + i]);
+ le32enc(&databuf[6 * 128 + (i * 5 % 16) * 4], XG->w[i]);
+ le32enc(&databuf[6 * 128 + (16 + (i * 5 % 16)) * 4], XG->w[16 + i]);
+ le32enc(&databuf[7 * 128 + (i * 5 % 16) * 4], XH->w[i]);
+ le32enc(&databuf[7 * 128 + (16 + (i * 5 % 16)) * 4], XH->w[16 + i]);
+ }
+}
+
+static void
+scrypt_1024_1_1_256_sp8(const unsigned char * input1,
+ unsigned char * output1,
+ const unsigned char * input2,
+ unsigned char * output2,
+ const unsigned char * input3,
+ unsigned char * output3,
+ const unsigned char * input4,
+ unsigned char * output4,
+ const unsigned char * input5,
+ unsigned char * output5,
+ const unsigned char * input6,
+ unsigned char * output6,
+ const unsigned char * input7,
+ unsigned char * output7,
+ const unsigned char * input8,
+ unsigned char * output8,
+ uint64_t scratchpad)
+{
+ static uint8_t databuf[128 * 8] __attribute__((aligned(128)));
+ uint8_t * B1, * B2, * B3, * B4, * B5, * B6, * B7, * B8;
+
+ const uint32_t r = 1;
+ const uint32_t p = 1;
+
+ B1 = databuf;
+ B2 = databuf + 128 * 1;
+ B3 = databuf + 128 * 2;
+ B4 = databuf + 128 * 3;
+ B5 = databuf + 128 * 4;
+ B6 = databuf + 128 * 5;
+ B7 = databuf + 128 * 6;
+ B8 = databuf + 128 * 7;
+
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input3, 80, (const uint8_t*)input3, 80, 1, B3, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input4, 80, (const uint8_t*)input4, 80, 1, B4, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input5, 80, (const uint8_t*)input5, 80, 1, B5, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input6, 80, (const uint8_t*)input6, 80, 1, B6, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input7, 80, (const uint8_t*)input7, 80, 1, B7, p * 128 * r);
+ /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+ PBKDF2_SHA256((const uint8_t*)input8, 80, (const uint8_t*)input8, 80, 1, B8, p * 128 * r);
+
+ scrypt_spu_core8(databuf, scratchpad);
+
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input3, 80, B3, p * 128 * r, 1, (uint8_t*)output3, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input4, 80, B4, p * 128 * r, 1, (uint8_t*)output4, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input5, 80, B5, p * 128 * r, 1, (uint8_t*)output5, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input6, 80, B6, p * 128 * r, 1, (uint8_t*)output6, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input7, 80, B7, p * 128 * r, 1, (uint8_t*)output7, 32);
+ /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+ PBKDF2_SHA256((const uint8_t*)input8, 80, B8, p * 128 * r, 1, (uint8_t*)output8, 32);
+}
+
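+/*
+ * SPU-side counterpart of scanhash_scrypt: tries 8 consecutive nonces
+ * per iteration. The host's restart flag is fetched with a small DMA get
+ * (tag 3) issued at the top of each iteration and only waited on at the
+ * bottom, so the poll overlaps the hashing work.
+ */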
+static int
+scanhash_scrypt(uint64_t work_restart_ptr, unsigned char *pdata,
+ uint64_t scratchbuf, const unsigned char *ptarget,
+ uint32_t max_nonce, uint32_t *hashes_done)
+{
+ unsigned char data1[80];
+ unsigned char tmp_hash1[32];
+ unsigned char data2[80];
+ unsigned char tmp_hash2[32];
+ unsigned char data3[80];
+ unsigned char tmp_hash3[32];
+ unsigned char data4[80];
+ unsigned char tmp_hash4[32];
+ unsigned char data5[80];
+ unsigned char tmp_hash5[32];
+ unsigned char data6[80];
+ unsigned char tmp_hash6[32];
+ unsigned char data7[80];
+ unsigned char tmp_hash7[32];
+ unsigned char data8[80];
+ unsigned char tmp_hash8[32];
+ uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
+ uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+ uint32_t *nonce3 = (uint32_t *)(data3 + 64 + 12);
+ uint32_t *nonce4 = (uint32_t *)(data4 + 64 + 12);
+ uint32_t *nonce5 = (uint32_t *)(data5 + 64 + 12);
+ uint32_t *nonce6 = (uint32_t *)(data6 + 64 + 12);
+ uint32_t *nonce7 = (uint32_t *)(data7 + 64 + 12);
+ uint32_t *nonce8 = (uint32_t *)(data8 + 64 + 12);
+ uint32_t n = 0;
+ uint32_t Htarg = le32dec(ptarget + 28);
+ int i;
+ int tag3 = 3, tag_mask3 = 1 << tag3;
+ int work_restart = 0;
+
+ for (i = 0; i < 80/4; i++) {
+ ((uint32_t *)data1)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data2)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data3)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data4)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data5)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data6)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data7)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data8)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ }
+
+ while(1) {
+ /* request 'work_restart[thr_id].restart' from external memory */
+ mfc_get(&work_restart, work_restart_ptr, 4, tag3, 0, 0);
+
+ le32enc(nonce1, n + 1);
+ le32enc(nonce2, n + 2);
+ le32enc(nonce3, n + 3);
+ le32enc(nonce4, n + 4);
+ le32enc(nonce5, n + 5);
+ le32enc(nonce6, n + 6);
+ le32enc(nonce7, n + 7);
+ le32enc(nonce8, n + 8);
+ scrypt_1024_1_1_256_sp8(data1, tmp_hash1, data2, tmp_hash2,
+ data3, tmp_hash3, data4, tmp_hash4,
+ data5, tmp_hash5, data6, tmp_hash6,
+ data7, tmp_hash7, data8, tmp_hash8,
+ scratchbuf);
+
+ if (le32dec(tmp_hash1+28) <= Htarg) {
+ be32enc(pdata + 64 + 12, n + 1);
+ *hashes_done = n + 1;
+ return true;
+ }
+
+ if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 2);
+ *hashes_done = n + 2;
+ return true;
+ }
+
+ if (le32dec(tmp_hash3+28) <= Htarg && n + 3 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 3);
+ *hashes_done = n + 3;
+ return true;
+ }
+
+ if (le32dec(tmp_hash4+28) <= Htarg && n + 4 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 4);
+ *hashes_done = n + 4;
+ return true;
+ }
+
+ if (le32dec(tmp_hash5+28) <= Htarg && n + 5 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 5);
+ *hashes_done = n + 5;
+ return true;
+ }
+
+ if (le32dec(tmp_hash6+28) <= Htarg && n + 6 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 6);
+ *hashes_done = n + 6;
+ return true;
+ }
+
+ if (le32dec(tmp_hash7+28) <= Htarg && n + 7 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 7);
+ *hashes_done = n + 7;
+ return true;
+ }
+
+ if (le32dec(tmp_hash8+28) <= Htarg && n + 8 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 8);
+ *hashes_done = n + 8;
+ return true;
+ }
+
+ n += 8;
+
+ if (n >= max_nonce) {
+ *hashes_done = max_nonce;
+ break;
+ }
+
+ /* ensure that 'work_restart[thr_id].restart' has been read */
+ mfc_write_tag_mask(tag_mask3);
+ mfc_read_tag_status_all();
+
+ if (work_restart) {
+ *hashes_done = n;
+ break;
+ }
+ }
+ return false;
+}
+
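+/*
+ * SPU entry point: libspe2 passes 'argp' and 'envp' through from
+ * spe_context_run. Here 'argp' is the effective address of a 128-byte
+ * aligned scanhash_spu_args block (the scratchpads follow at argp + 1024)
+ * and 'envp' is the effective address of this thread's restart flag.
+ */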
+int main(uint64_t spe_id, uint64_t argp, uint64_t envp)
+{
+ static scanhash_spu_args args __attribute__((aligned(16)));
+ int tag = 1, tag_mask = 1 << tag;
+ int rc;
+
+ mfc_get(&args, argp, sizeof(args), tag, 0, 0);
+ mfc_write_tag_mask(tag_mask);
+ mfc_read_tag_status_all();
+
+ rc = scanhash_scrypt(envp, args.data, argp + 1024,
+ args.target, args.max_nonce,
+ &args.hashes_done);
+
+ mfc_put(&args, argp, sizeof(args), tag, 0, 0);
+ mfc_write_tag_mask(tag_mask);
+ mfc_read_tag_status_all();
+
+ return rc;
+}
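
The return value of scanhash_scrypt (true once a hash meets the target) becomes the SPU program's exit status, which the PPU reads back as stop_info.result.spe_exit_code, while the updated block data and hashes_done travel back through the final mfc_put of the args structure.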
diff --git a/scrypt-cell-spu.h b/scrypt-cell-spu.h
new file mode 100644
index 0000000..4af26c4
--- /dev/null
+++ b/scrypt-cell-spu.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#ifndef __SCRYPT_CELL_SPU_H__
+#define __SCRYPT_CELL_SPU_H__
+
+/*
+ * This is the data structure passed between the main CPU and the SPU cores.
+ */
+typedef struct {
+ uint8_t data[128];
+ uint8_t target[32];
+ uint32_t max_nonce;
+ uint32_t hashes_done;
+ uint32_t padding[2];
+} scanhash_spu_args;
+
+#endif