From ea4520474ba5a966260fc5652c5248e6eab50d58 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka
Date: Tue, 27 Dec 2011 03:51:24 +0200
Subject: Added Cell/BE optimizations

Linux on PS3 gets a huge boost in litecoin mining performance. Cell/BE
support is detected and enabled automatically by autotools. The miner
threads are first allocated to the available SPU cores (typically 6),
and the remaining threads are allocated on the PPU, so there are 8
threads in total on PS3: 6 SPU threads and 2 PPU threads.

Each SPU core provides ~5.4 khash/s when compiled with spu-elf-gcc 4.6.
Performance may vary between gcc versions; older ones are typically
slower.
---
 Makefile.am       |  11 ++
 configure.ac      |  27 ++-
 cpu-miner.c       |  46 +++++-
 miner.h           |  11 +-
 scrypt-cell-spu.c | 523 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrypt-cell-spu.h |  44 +++++
 6 files changed, 656 insertions(+), 6 deletions(-)
 create mode 100644 scrypt-cell-spu.c
 create mode 100644 scrypt-cell-spu.h

diff --git a/Makefile.am b/Makefile.am
index 5623a59..a981f2f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -20,3 +20,14 @@ minerd_LDFLAGS = $(PTHREAD_FLAGS)
 minerd_LDADD	= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@
 minerd_CPPFLAGS	= @LIBCURL_CPPFLAGS@
 
+if HAVE_CELL_SPU
+
+scrypt-cell-spu.o: scrypt-cell-spu.c sha256-helpers.h \
+		   scrypt-simd-helpers.h scrypt-cell-spu.h
+	$(SPU_ELF_GCC) -O3 -fstrict-aliasing -Wall -Wstrict-aliasing \
+		-o scrypt-cell-spu.elf scrypt-cell-spu.c
+	$(EMBEDSPU) scrypt_spu scrypt-cell-spu.elf scrypt-cell-spu.o
+
+minerd_LDADD += scrypt-cell-spu.o @SPE2_LIBS@
+
+endif
diff --git a/configure.ac b/configure.ac
index 3b0733f..a94d74f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -32,6 +32,26 @@ case $target in
     have_x86_64=false
     have_win32=false
     PTHREAD_FLAGS="-pthread"
     ;;
+  powerpc*)
+    have_x86_64=false
+    have_win32=false
+    PTHREAD_FLAGS="-pthread"
+    AC_CHECK_LIB(spe2, spe_context_create, [SPE2_LIBS=-lspe2
+                                            have_cell_spu=true])
+    AC_CHECK_PROGS(SPU_ELF_GCC, [spu-elf-gcc spu-gcc], "false")
+    AC_CHECK_PROGS(EMBEDSPU, [embedspu ppu-embedspu], "false")
+
+    if test x$have_cell_spu = xtrue; then
+      if test x$SPU_ELF_GCC = xfalse; then
+        echo "Can't find the spu-elf-gcc or spu-gcc tool"
+        exit 1
+      fi
+      if test x$EMBEDSPU = xfalse; then
+        echo "Can't find the embedspu or ppu-embedspu tool"
+        exit 1
+      fi
+    fi
+    ;;
   *)
     have_x86_64=false
     have_win32=false
@@ -39,13 +59,17 @@ case $target in
     ;;
 esac
 
-
 AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
 AC_CHECK_LIB(pthread, pthread_create, PTHREAD_LIBS=-lpthread)
 
 AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
 AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
 AM_CONDITIONAL([HAVE_x86_64], [test x$have_x86_64 = xtrue])
+AM_CONDITIONAL([HAVE_CELL_SPU], [test x$have_cell_spu = xtrue])
+
+if test x$have_cell_spu = xtrue ; then
+  AC_DEFINE([HAVE_CELL_SPU], [1], [Can use Cell/BE acceleration])
+fi
 
 if test x$request_jansson = xtrue
 then
@@ -62,6 +86,7 @@ LIBCURL_CHECK_CONFIG(, 7.10.1, ,
 AC_SUBST(JANSSON_LIBS)
 AC_SUBST(PTHREAD_FLAGS)
 AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(SPE2_LIBS)
 
 AC_CONFIG_FILES([
 	Makefile
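The build machinery above compiles scrypt-cell-spu.c with spu-elf-gcc into a
standalone SPU ELF, then embedspu wraps that ELF into a PPU-linkable object
which exports the `scrypt_spu` program handle. On the host side such a handle
is driven through the usual libspe2 call sequence; a minimal sketch of that
flow (error handling omitted; `run_spu_program` is a hypothetical helper, not
part of the patch):

    #include <libspe2.h>

    extern spe_program_handle_t scrypt_spu;    /* emitted by embedspu */

    static int run_spu_program(void *argp, void *envp)
    {
        unsigned int entry = SPE_DEFAULT_ENTRY;
        spe_stop_info_t stop_info;
        spe_context_ptr_t ctx;

        ctx = spe_context_create(0, NULL);     /* one context per SPU thread */
        spe_program_load(ctx, &scrypt_spu);    /* map the embedded SPU image */
        /* Blocks until the SPU program exits; argp and envp arrive in the
           SPU-side main() as effective addresses. */
        spe_context_run(ctx, &entry, 0, argp, envp, &stop_info);
        spe_context_destroy(ctx);
        return stop_info.result.spe_exit_code;
    }

The cpu-miner.c changes below follow the same sequence, except that the
context is created once per miner thread and spe_context_run() is invoked
again for every work item.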
diff --git a/cpu-miner.c b/cpu-miner.c
index 607c01d..8ab0cf9 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -102,6 +102,7 @@ static const bool opt_time = true;
 static enum sha256_algos opt_algo = ALGO_SCRYPT;
 static int opt_n_threads;
 static int num_processors;
+static int num_cell_spu; /* the number of SPU cores for Cell/BE (normally 6) */
 static char *rpc_url;
 static char *rpc_userpass;
 static char *rpc_user, *rpc_pass;
@@ -511,6 +512,14 @@ err_out:
 	return false;
 }
 
+#ifdef HAVE_CELL_SPU
+#include "scrypt-cell-spu.h"
+/* Each SPU core processes 8 hashes at once and needs 8x memory */
+#define SCRATCHBUF_SIZE	(131583 * 8)
+#else
+#define SCRATCHBUF_SIZE	(131583 * 2)
+#endif
+
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = userdata;
@@ -531,7 +540,7 @@ static void *miner_thread(void *userdata)
 
 	if (opt_algo == ALGO_SCRYPT)
 	{
-		scratchbuf = malloc(2 * 131583);
+		scratchbuf = malloc(SCRATCHBUF_SIZE);
 		max_nonce = 0xffff;
 	}
 
@@ -556,6 +565,24 @@ static void *miner_thread(void *userdata)
 		/* scan nonces for a proof-of-work hash */
 		switch (opt_algo) {
 		case ALGO_SCRYPT:
+#ifdef HAVE_CELL_SPU
+			if (mythr->spe_context) {
+				scanhash_spu_args *argp = (scanhash_spu_args *)
+					(((uintptr_t)scratchbuf + 127) & ~(uintptr_t)127);
+				spe_stop_info_t stop_info;
+				unsigned int entry = SPE_DEFAULT_ENTRY;
+				memcpy(argp->data, work.data, sizeof(work.data));
+				memcpy(argp->target, work.target, sizeof(work.target));
+				argp->max_nonce = max_nonce;
+				work_restart[thr_id].restart = 0;
+				spe_context_run(mythr->spe_context, &entry, 0, argp,
+					(void *)&work_restart[thr_id].restart, &stop_info);
+				hashes_done = argp->hashes_done;
+				memcpy(work.data, argp->data, sizeof(work.data));
+				rc = stop_info.result.spe_exit_code;
+				break;
+			}
+#endif
 			rc = scanhash_scrypt(thr_id, work.data, scratchbuf,
 			                     work.target, max_nonce, &hashes_done);
 			break;
@@ -790,13 +817,20 @@ static void parse_arg (int key, char *arg)
 		show_usage();
 	}
 
+#ifdef HAVE_CELL_SPU
+	num_cell_spu = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
+#endif
 #ifdef WIN32
 	if (!opt_n_threads)
 		opt_n_threads = 1;
 #else
 	num_processors = sysconf(_SC_NPROCESSORS_ONLN);
-	if (!opt_n_threads)
+	if (!opt_n_threads) {
 		opt_n_threads = num_processors;
+		/* If we have SPU cores, start an additional thread for each */
+		if (num_cell_spu > 0)
+			opt_n_threads += num_cell_spu;
+	}
 #endif /* !WIN32 */
 }
@@ -922,7 +956,13 @@ int main (int argc, char *argv[])
 		thr->q = tq_new();
 		if (!thr->q)
 			return 1;
-
+#ifdef HAVE_CELL_SPU
+		/* The first 'num_cell_spu' threads are allocated for the SPUs */
+		if (i < num_cell_spu) {
+			thr->spe_context = spe_context_create(0, NULL);
+			spe_program_load(thr->spe_context, &scrypt_spu);
+		}
+#endif
 		if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) {
 			applog(LOG_ERR, "thread %d create failed", i);
 			return 1;
diff --git a/miner.h b/miner.h
index 5d46209..dca0da7 100644
--- a/miner.h
+++ b/miner.h
@@ -36,6 +36,10 @@ void *alloca (size_t);
 # endif
 #endif
 
+#ifdef HAVE_CELL_SPU
+#include <libspe2.h>
+extern spe_program_handle_t scrypt_spu;
+#endif
 
 #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
 #define WANT_BUILTIN_BSWAP
@@ -92,6 +96,9 @@ enum {
 struct thr_info {
 	int		id;
 	pthread_t	pth;
+#ifdef HAVE_CELL_SPU
+	spe_context_ptr_t spe_context;
+#endif
 	struct thread_q	*q;
 };
@@ -142,8 +149,8 @@ extern bool have_longpoll;
 struct thread_q;
 
 struct work_restart {
-	volatile unsigned long	restart;
-	char			padding[128 - sizeof(unsigned long)];
+	volatile unsigned int	restart;
+	char			padding[128 - sizeof(unsigned int)];
 };
 
 extern pthread_mutex_t time_lock;
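Note the pointer-rounding expression in the ALGO_SCRYPT hunk above: the
scanhash_spu_args block handed to the SPU is carved out of scratchbuf and
rounded up to a 128-byte boundary, since MFC DMA transfers of 16 bytes or
more must be 16-byte aligned and run best on 128-byte (cache-line)
boundaries. The idiom in isolation (a sketch; `align128` is illustrative and
not part of the patch):

    #include <stdint.h>

    /* Round p up to the next 128-byte boundary. */
    static inline void *align128(void *p)
    {
        return (void *)(((uintptr_t)p + 127) & ~(uintptr_t)127);
    }

The slack in SCRATCHBUF_SIZE over the 128 KB-per-hash scratchpad proper
presumably absorbs this rounding plus the args block placed at the front of
the buffer.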
diff --git a/scrypt-cell-spu.c b/scrypt-cell-spu.c
new file mode 100644
index 0000000..a5a741f
--- /dev/null
+++ b/scrypt-cell-spu.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+#include "sha256-helpers.h"
+#include "scrypt-simd-helpers.h"
+#include "scrypt-cell-spu.h"
+
+#define true 1
+#define false 0
+
+/*****************************************************************************/
+
+static __attribute__((always_inline)) void
+salsa20_8_xor4(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
+	       uint32x4 * __restrict C, const uint32x4 * __restrict Cx,
+	       uint32x4 * __restrict D, const uint32x4 * __restrict Dx,
+	       uint32x4 * __restrict E, const uint32x4 * __restrict Ex)
+{
+	uint32x4 X0, X1, X2, X3;
+	uint32x4 Y0, Y1, Y2, Y3;
+	uint32x4 Z0, Z1, Z2, Z3;
+	uint32x4 W0, W1, W2, W3;
+	int i;
+
+	X0 = (B[0] ^= Bx[0]);
+	X1 = (B[1] ^= Bx[1]);
+	X2 = (B[2] ^= Bx[2]);
+	X3 = (B[3] ^= Bx[3]);
+	Y0 = (C[0] ^= Cx[0]);
+	Y1 = (C[1] ^= Cx[1]);
+	Y2 = (C[2] ^= Cx[2]);
+	Y3 = (C[3] ^= Cx[3]);
+	Z0 = (D[0] ^= Dx[0]);
+	Z1 = (D[1] ^= Dx[1]);
+	Z2 = (D[2] ^= Dx[2]);
+	Z3 = (D[3] ^= Dx[3]);
+	W0 = (E[0] ^= Ex[0]);
+	W1 = (E[1] ^= Ex[1]);
+	W2 = (E[2] ^= Ex[2]);
+	W3 = (E[3] ^= Ex[3]);
+
+	for (i = 0; i < 8; i += 2) {
+		/* Operate on "columns". */
+		X1 ^= rol_32x4(X0 + X3, 7);
+		Y1 ^= rol_32x4(Y0 + Y3, 7);
+		Z1 ^= rol_32x4(Z0 + Z3, 7);
+		W1 ^= rol_32x4(W0 + W3, 7);
+		X2 ^= rol_32x4(X1 + X0, 9);
+		Y2 ^= rol_32x4(Y1 + Y0, 9);
+		Z2 ^= rol_32x4(Z1 + Z0, 9);
+		W2 ^= rol_32x4(W1 + W0, 9);
+		X3 ^= rol_32x4(X2 + X1, 13);
+		Y3 ^= rol_32x4(Y2 + Y1, 13);
+		Z3 ^= rol_32x4(Z2 + Z1, 13);
+		W3 ^= rol_32x4(W2 + W1, 13);
+		X0 ^= rol_32x4(X3 + X2, 18);
+		Y0 ^= rol_32x4(Y3 + Y2, 18);
+		Z0 ^= rol_32x4(Z3 + Z2, 18);
+		W0 ^= rol_32x4(W3 + W2, 18);
+
+		/* Rearrange data. */
+		X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+		Y1 = shuffle_32x4(Y1, 3, 0, 1, 2);
+		Z1 = shuffle_32x4(Z1, 3, 0, 1, 2);
+		W1 = shuffle_32x4(W1, 3, 0, 1, 2);
+		X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+		Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+		Z2 = shuffle_32x4(Z2, 2, 3, 0, 1);
+		W2 = shuffle_32x4(W2, 2, 3, 0, 1);
+		X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+		Y3 = shuffle_32x4(Y3, 1, 2, 3, 0);
+		Z3 = shuffle_32x4(Z3, 1, 2, 3, 0);
+		W3 = shuffle_32x4(W3, 1, 2, 3, 0);
+
+		/* Operate on "rows". */
+		X3 ^= rol_32x4(X0 + X1, 7);
+		Y3 ^= rol_32x4(Y0 + Y1, 7);
+		Z3 ^= rol_32x4(Z0 + Z1, 7);
+		W3 ^= rol_32x4(W0 + W1, 7);
+		X2 ^= rol_32x4(X3 + X0, 9);
+		Y2 ^= rol_32x4(Y3 + Y0, 9);
+		Z2 ^= rol_32x4(Z3 + Z0, 9);
+		W2 ^= rol_32x4(W3 + W0, 9);
+		X1 ^= rol_32x4(X2 + X3, 13);
+		Y1 ^= rol_32x4(Y2 + Y3, 13);
+		Z1 ^= rol_32x4(Z2 + Z3, 13);
+		W1 ^= rol_32x4(W2 + W3, 13);
+		X0 ^= rol_32x4(X1 + X2, 18);
+		Y0 ^= rol_32x4(Y1 + Y2, 18);
+		Z0 ^= rol_32x4(Z1 + Z2, 18);
+		W0 ^= rol_32x4(W1 + W2, 18);
+
+		/* Rearrange data. */
+		X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+		Y1 = shuffle_32x4(Y1, 1, 2, 3, 0);
+		Z1 = shuffle_32x4(Z1, 1, 2, 3, 0);
+		W1 = shuffle_32x4(W1, 1, 2, 3, 0);
+		X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+		Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+		Z2 = shuffle_32x4(Z2, 2, 3, 0, 1);
+		W2 = shuffle_32x4(W2, 2, 3, 0, 1);
+		X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+		Y3 = shuffle_32x4(Y3, 3, 0, 1, 2);
+		Z3 = shuffle_32x4(Z3, 3, 0, 1, 2);
+		W3 = shuffle_32x4(W3, 3, 0, 1, 2);
+	}
+
+	B[0] += X0;
+	B[1] += X1;
+	B[2] += X2;
+	B[3] += X3;
+	C[0] += Y0;
+	C[1] += Y1;
+	C[2] += Y2;
+	C[3] += Y3;
+	D[0] += Z0;
+	D[1] += Z1;
+	D[2] += Z2;
+	D[3] += Z3;
+	E[0] += W0;
+	E[1] += W1;
+	E[2] += W2;
+	E[3] += W3;
+}
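salsa20_8_xor4() runs four interleaved salsa20/8 instances (the X, Y, Z and W
state sets), each packed into four uint32x4 registers; the shuffle_32x4()
calls rotate the vectors so that every quarter-round group becomes a plain
vector add/rotate/xor, and the (i * 5 % 16) indexing in scrypt_spu_core8()
below converts between the linear word order and the diagonal layout these
vectors use. For comparison, one scalar instance of the same primitive,
following the reference scrypt code:

    #include <stdint.h>

    #define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

    /* Scalar reference: B <- salsa20/8(B ^ Bx) + (B ^ Bx) */
    static void salsa20_8_xor(uint32_t B[16], const uint32_t Bx[16])
    {
        uint32_t x[16];
        int i;

        for (i = 0; i < 16; i++)
            x[i] = (B[i] ^= Bx[i]);
        for (i = 0; i < 8; i += 2) {
            /* columns */
            x[ 4] ^= R(x[ 0] + x[12],  7);  x[ 8] ^= R(x[ 4] + x[ 0],  9);
            x[12] ^= R(x[ 8] + x[ 4], 13);  x[ 0] ^= R(x[12] + x[ 8], 18);
            x[ 9] ^= R(x[ 5] + x[ 1],  7);  x[13] ^= R(x[ 9] + x[ 5],  9);
            x[ 1] ^= R(x[13] + x[ 9], 13);  x[ 5] ^= R(x[ 1] + x[13], 18);
            x[14] ^= R(x[10] + x[ 6],  7);  x[ 2] ^= R(x[14] + x[10],  9);
            x[ 6] ^= R(x[ 2] + x[14], 13);  x[10] ^= R(x[ 6] + x[ 2], 18);
            x[ 3] ^= R(x[15] + x[11],  7);  x[ 7] ^= R(x[ 3] + x[15],  9);
            x[11] ^= R(x[ 7] + x[ 3], 13);  x[15] ^= R(x[11] + x[ 7], 18);
            /* rows */
            x[ 1] ^= R(x[ 0] + x[ 3],  7);  x[ 2] ^= R(x[ 1] + x[ 0],  9);
            x[ 3] ^= R(x[ 2] + x[ 1], 13);  x[ 0] ^= R(x[ 3] + x[ 2], 18);
            x[ 6] ^= R(x[ 5] + x[ 4],  7);  x[ 7] ^= R(x[ 6] + x[ 5],  9);
            x[ 4] ^= R(x[ 7] + x[ 6], 13);  x[ 5] ^= R(x[ 4] + x[ 7], 18);
            x[11] ^= R(x[10] + x[ 9],  7);  x[ 8] ^= R(x[11] + x[10],  9);
            x[ 9] ^= R(x[ 8] + x[11], 13);  x[10] ^= R(x[ 9] + x[ 8], 18);
            x[12] ^= R(x[15] + x[14],  7);  x[13] ^= R(x[12] + x[15],  9);
            x[14] ^= R(x[13] + x[12], 13);  x[15] ^= R(x[14] + x[13], 18);
        }
        for (i = 0; i < 16; i++)
            B[i] += x[i];
    }

Processing four instances per call keeps the SPU's arithmetic pipeline busy;
the kernel below issues two such calls for lanes A-D (one per 64-byte half of
the block) and two for lanes E-H, covering all eight hashes.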
+
+static void
+scrypt_spu_core8(uint8_t *databuf, uint64_t scratch)
+{
+	static mfc_list_element_t dma_list[8] __attribute__((aligned(128)));
+	static XY X[8] __attribute__((aligned(128)));
+	static uint32x4 Y[8 * 8] __attribute__((aligned(128)));
+	static uint32x4 Z[8 * 8] __attribute__((aligned(128)));
+	XY * XA = &X[0];
+	XY * XB = &X[1];
+	XY * XC = &X[2];
+	XY * XD = &X[3];
+	XY * XE = &X[4];
+	XY * XF = &X[5];
+	XY * XG = &X[6];
+	XY * XH = &X[7];
+
+	uint64_t VA = (scratch + 128 * 1024 * 0);
+	uint64_t VB = (scratch + 128 * 1024 * 1);
+	uint64_t VC = (scratch + 128 * 1024 * 2);
+	uint64_t VD = (scratch + 128 * 1024 * 3);
+	uint64_t VE = (scratch + 128 * 1024 * 4);
+	uint64_t VF = (scratch + 128 * 1024 * 5);
+	uint64_t VG = (scratch + 128 * 1024 * 6);
+	uint64_t VH = (scratch + 128 * 1024 * 7);
+	int i;
+	int tag1 = 1, tag_mask1 = 1 << tag1;
+	int tag2 = 2, tag_mask2 = 1 << tag2;
+
+	/* 1: X <-- B */
+	for (i = 0; i < 16; i++) {
+		XA->w[i]      = le32dec(&databuf[0 * 128 + (i * 5 % 16) * 4]);
+		XA->w[16 + i] = le32dec(&databuf[0 * 128 + (16 + (i * 5 % 16)) * 4]);
+		XB->w[i]      = le32dec(&databuf[1 * 128 + (i * 5 % 16) * 4]);
+		XB->w[16 + i] = le32dec(&databuf[1 * 128 + (16 + (i * 5 % 16)) * 4]);
+		XC->w[i]      = le32dec(&databuf[2 * 128 + (i * 5 % 16) * 4]);
+		XC->w[16 + i] = le32dec(&databuf[2 * 128 + (16 + (i * 5 % 16)) * 4]);
+		XD->w[i]      = le32dec(&databuf[3 * 128 + (i * 5 % 16) * 4]);
+		XD->w[16 + i] = le32dec(&databuf[3 * 128 + (16 + (i * 5 % 16)) * 4]);
+		XE->w[i]      = le32dec(&databuf[4 * 128 + (i * 5 % 16) * 4]);
+		XE->w[16 + i] = le32dec(&databuf[4 * 128 + (16 + (i * 5 % 16)) * 4]);
+		XF->w[i]      = le32dec(&databuf[5 * 128 + (i * 5 % 16) * 4]);
+		XF->w[16 + i] = le32dec(&databuf[5 * 128 + (16 + (i * 5 % 16)) * 4]);
+		XG->w[i]      = le32dec(&databuf[6 * 128 + (i * 5 % 16) * 4]);
+		XG->w[16 + i] = le32dec(&databuf[6 * 128 + (16 + (i * 5 % 16)) * 4]);
+		XH->w[i]      = le32dec(&databuf[7 * 128 + (i * 5 % 16) * 4]);
+		XH->w[16 + i] = le32dec(&databuf[7 * 128 + (16 + (i * 5 % 16)) * 4]);
+	}
+	for (i = 0; i < 8; i++)
+		dma_list[i].size = 128;
+
+	/* 2: for i = 0 to N - 1 do */
+	for (i = 0; i < 1024; i++) {
+		blkcpy128(&Z[0 * 8], &XA->q[0]);
+		blkcpy128(&Z[1 * 8], &XB->q[0]);
+		blkcpy128(&Z[2 * 8], &XC->q[0]);
+		blkcpy128(&Z[3 * 8], &XD->q[0]);
+		blkcpy128(&Z[4 * 8], &XE->q[0]);
+		blkcpy128(&Z[5 * 8], &XF->q[0]);
+		blkcpy128(&Z[6 * 8], &XG->q[0]);
+		blkcpy128(&Z[7 * 8], &XH->q[0]);
+		dma_list[0].eal = mfc_ea2l(VA + i * 128);
+		dma_list[1].eal = mfc_ea2l(VB + i * 128);
+		dma_list[2].eal = mfc_ea2l(VC + i * 128);
+		dma_list[3].eal = mfc_ea2l(VD + i * 128);
+		dma_list[4].eal = mfc_ea2l(VE + i * 128);
+		dma_list[5].eal = mfc_ea2l(VF + i * 128);
+		dma_list[6].eal = mfc_ea2l(VG + i * 128);
+		dma_list[7].eal = mfc_ea2l(VH + i * 128);
+		mfc_putl(&Z[0], scratch, &dma_list[0],
+			 8 * sizeof(mfc_list_element_t), tag1, 0, 0);
+		salsa20_8_xor4(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4],
+			       &XC->q[0], &XC->q[4], &XD->q[0], &XD->q[4]);
+		salsa20_8_xor4(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0],
+			       &XC->q[4], &XC->q[0], &XD->q[4], &XD->q[0]);
+		salsa20_8_xor4(&XE->q[0], &XE->q[4], &XF->q[0], &XF->q[4],
+			       &XG->q[0], &XG->q[4], &XH->q[0], &XH->q[4]);
+		salsa20_8_xor4(&XE->q[4], &XE->q[0], &XF->q[4], &XF->q[0],
+			       &XG->q[4], &XG->q[0], &XH->q[4], &XH->q[0]);
+		mfc_write_tag_mask(tag_mask1);
+		mfc_read_tag_status_all();
+	}
+
+	/* j <-- Integerify(X) mod N */
+	dma_list[0].eal = mfc_ea2l(VA + (XA->w[16] & 1023) * 128);
+	dma_list[1].eal = mfc_ea2l(VB + (XB->w[16] & 1023) * 128);
+	dma_list[2].eal = mfc_ea2l(VC + (XC->w[16] & 1023) * 128);
+	dma_list[3].eal = mfc_ea2l(VD + (XD->w[16] & 1023) * 128);
+	mfc_getl(&Y[0], scratch, &dma_list[0],
+		 4 * sizeof(mfc_list_element_t), tag1, 0, 0);
+
+	/* j <-- Integerify(X) mod N */
+	dma_list[4].eal = mfc_ea2l(VE + (XE->w[16] & 1023) * 128);
+	dma_list[5].eal = mfc_ea2l(VF + (XF->w[16] & 1023) * 128);
+	dma_list[6].eal = mfc_ea2l(VG + (XG->w[16] & 1023) * 128);
+	dma_list[7].eal = mfc_ea2l(VH + (XH->w[16] & 1023) * 128);
+	mfc_getl(&Y[4 * 8], scratch, &dma_list[4],
+		 4 * sizeof(mfc_list_element_t), tag2, 0, 0);
+
+	/* 6: for i = 0 to N - 1 do */
+	for (i = 0; i < 1024; i++) {
+		mfc_write_tag_mask(tag_mask1);
+		mfc_read_tag_status_all();
+		blkxor128(XA->q, &Y[0 * 8]);
+		blkxor128(XB->q, &Y[1 * 8]);
+		blkxor128(XC->q, &Y[2 * 8]);
+		blkxor128(XD->q, &Y[3 * 8]);
+		salsa20_8_xor4(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4],
+			       &XC->q[0], &XC->q[4], &XD->q[0], &XD->q[4]);
+		salsa20_8_xor4(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0],
+			       &XC->q[4], &XC->q[0], &XD->q[4], &XD->q[0]);
+
+		/* j <-- Integerify(X) mod N */
+		dma_list[0].eal = mfc_ea2l(VA + (XA->w[16] & 1023) * 128);
+		dma_list[1].eal = mfc_ea2l(VB + (XB->w[16] & 1023) * 128);
+		dma_list[2].eal = mfc_ea2l(VC + (XC->w[16] & 1023) * 128);
+		dma_list[3].eal = mfc_ea2l(VD + (XD->w[16] & 1023) * 128);
+		mfc_getl(&Y[0], scratch, &dma_list[0],
+			 4 * sizeof(mfc_list_element_t), tag1, 0, 0);
+
+		mfc_write_tag_mask(tag_mask2);
+		mfc_read_tag_status_all();
+		blkxor128(XE->q, &Y[4 * 8]);
+		blkxor128(XF->q, &Y[5 * 8]);
+		blkxor128(XG->q, &Y[6 * 8]);
+		blkxor128(XH->q, &Y[7 * 8]);
+		salsa20_8_xor4(&XE->q[0], &XE->q[4], &XF->q[0], &XF->q[4],
+			       &XG->q[0], &XG->q[4], &XH->q[0], &XH->q[4]);
+		salsa20_8_xor4(&XE->q[4], &XE->q[0], &XF->q[4], &XF->q[0],
+			       &XG->q[4], &XG->q[0], &XH->q[4], &XH->q[0]);
+
+		/* j <-- Integerify(X) mod N */
+		dma_list[4].eal = mfc_ea2l(VE + (XE->w[16] & 1023) * 128);
+		dma_list[5].eal = mfc_ea2l(VF + (XF->w[16] & 1023) * 128);
+		dma_list[6].eal = mfc_ea2l(VG + (XG->w[16] & 1023) * 128);
+		dma_list[7].eal = mfc_ea2l(VH + (XH->w[16] & 1023) * 128);
+		mfc_getl(&Y[4 * 8], scratch, &dma_list[4],
+			 4 * sizeof(mfc_list_element_t), tag2, 0, 0);
+	}
+
+	/* 10: B' <-- X */
+	for (i = 0; i < 16; i++) {
+		le32enc(&databuf[0 * 128 + (i * 5 % 16) * 4], XA->w[i]);
+		le32enc(&databuf[0 * 128 + (16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
+		le32enc(&databuf[1 * 128 + (i * 5 % 16) * 4], XB->w[i]);
+		le32enc(&databuf[1 * 128 + (16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
+		le32enc(&databuf[2 * 128 + (i * 5 % 16) * 4], XC->w[i]);
+		le32enc(&databuf[2 * 128 + (16 + (i * 5 % 16)) * 4], XC->w[16 + i]);
+		le32enc(&databuf[3 * 128 + (i * 5 % 16) * 4], XD->w[i]);
+		le32enc(&databuf[3 * 128 + (16 + (i * 5 % 16)) * 4], XD->w[16 + i]);
+		le32enc(&databuf[4 * 128 + (i * 5 % 16) * 4], XE->w[i]);
+		le32enc(&databuf[4 * 128 + (16 + (i * 5 % 16)) * 4], XE->w[16 + i]);
+		le32enc(&databuf[5 * 128 + (i * 5 % 16) * 4], XF->w[i]);
+		le32enc(&databuf[5 * 128 + (16 + (i * 5 % 16)) * 4], XF->w[16 + i]);
+		le32enc(&databuf[6 * 128 + (i * 5 % 16) * 4], XG->w[i]);
+		le32enc(&databuf[6 * 128 + (16 + (i * 5 % 16)) * 4], XG->w[16 + i]);
+		le32enc(&databuf[7 * 128 + (i * 5 % 16) * 4], XH->w[i]);
+		le32enc(&databuf[7 * 128 + (16 + (i * 5 % 16)) * 4], XH->w[16 + i]);
+	}
+}
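scrypt_spu_core8() keeps only the running X states (8 x 128 bytes) in the
256 KB local store and streams the 1024-entry V arrays, 128 KB per hash and
1 MB in total, through main memory, where they have to live since they cannot
fit on-chip. Each mfc_putl()/mfc_getl() is a DMA *list* command: the eight
dma_list entries scatter or gather one 128-byte block per lane in a single
MFC operation. Two tag groups split the lanes so the DMA for lanes E-H can
complete while the salsa20/8 arithmetic for lanes A-D runs, and vice versa;
`w[16] & 1023` is the Integerify step, reducing the first word of the block's
second half modulo N = 1024 to pick the next random V element. The underlying
double-buffering pattern, reduced to its core (hypothetical helpers for
illustration):

    #include <spu_mfcio.h>

    /* Start an asynchronous DMA read of 'size' bytes from effective
       address 'ea' into local store buffer 'ls', filed under 'tag'. */
    static void prefetch(volatile void *ls, uint64_t ea,
                         unsigned int size, unsigned int tag)
    {
        mfc_get(ls, ea, size, tag, 0, 0);
    }

    /* Block until every transfer issued under 'tag' has completed. */
    static void wait_for(unsigned int tag)
    {
        mfc_write_tag_mask(1 << tag);
        mfc_read_tag_status_all();
    }

Issuing the prefetch for iteration i + 1 before waiting on iteration i's
buffer is exactly what the second loop above does with tag1 and tag2.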
+
+static void
+scrypt_1024_1_1_256_sp8(const unsigned char *input1, unsigned char *output1,
+			const unsigned char *input2, unsigned char *output2,
+			const unsigned char *input3, unsigned char *output3,
+			const unsigned char *input4, unsigned char *output4,
+			const unsigned char *input5, unsigned char *output5,
+			const unsigned char *input6, unsigned char *output6,
+			const unsigned char *input7, unsigned char *output7,
+			const unsigned char *input8, unsigned char *output8,
+			uint64_t scratchpad)
+{
+	static uint8_t databuf[128 * 8] __attribute__((aligned(128)));
+	uint8_t *B1, *B2, *B3, *B4, *B5, *B6, *B7, *B8;
+
+	const uint32_t r = 1;
+	const uint32_t p = 1;
+
+	B1 = databuf;
+	B2 = databuf + 128 * 1;
+	B3 = databuf + 128 * 2;
+	B4 = databuf + 128 * 3;
+	B5 = databuf + 128 * 4;
+	B6 = databuf + 128 * 5;
+	B7 = databuf + 128 * 6;
+	B8 = databuf + 128 * 7;
+
+	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+	PBKDF2_SHA256((const uint8_t *)input1, 80, (const uint8_t *)input1, 80, 1, B1, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t *)input2, 80, (const uint8_t *)input2, 80, 1, B2, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t *)input3, 80, (const uint8_t *)input3, 80, 1, B3, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t *)input4, 80, (const uint8_t *)input4, 80, 1, B4, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t *)input5, 80, (const uint8_t *)input5, 80, 1, B5, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t *)input6, 80, (const uint8_t *)input6, 80, 1, B6, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t *)input7, 80, (const uint8_t *)input7, 80, 1, B7, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t *)input8, 80, (const uint8_t *)input8, 80, 1, B8, p * 128 * r);
+
+	scrypt_spu_core8(databuf, scratchpad);
+
+	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+	PBKDF2_SHA256((const uint8_t *)input1, 80, B1, p * 128 * r, 1, (uint8_t *)output1, 32);
+	PBKDF2_SHA256((const uint8_t *)input2, 80, B2, p * 128 * r, 1, (uint8_t *)output2, 32);
+	PBKDF2_SHA256((const uint8_t *)input3, 80, B3, p * 128 * r, 1, (uint8_t *)output3, 32);
+	PBKDF2_SHA256((const uint8_t *)input4, 80, B4, p * 128 * r, 1, (uint8_t *)output4, 32);
+	PBKDF2_SHA256((const uint8_t *)input5, 80, B5, p * 128 * r, 1, (uint8_t *)output5, 32);
+	PBKDF2_SHA256((const uint8_t *)input6, 80, B6, p * 128 * r, 1, (uint8_t *)output6, 32);
+	PBKDF2_SHA256((const uint8_t *)input7, 80, B7, p * 128 * r, 1, (uint8_t *)output7, 32);
+	PBKDF2_SHA256((const uint8_t *)input8, 80, B8, p * 128 * r, 1, (uint8_t *)output8, 32);
+}
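scrypt_1024_1_1_256_sp8() is the standard scrypt(N = 1024, r = 1, p = 1)
composition applied to eight block headers at once: PBKDF2-SHA256 expands
each 80-byte header into a 128-byte B buffer, scrypt_spu_core8() applies
ROMix to all eight buffers, and a second PBKDF2-SHA256 squeezes each result
down to a 32-byte hash. For a single lane the shape is as follows (a sketch;
the PBKDF2_SHA256 prototype is assumed to match the reference scrypt code
from which sha256-helpers.h appears to derive):

    #include <stdint.h>
    #include <stddef.h>

    extern void PBKDF2_SHA256(const uint8_t *passwd, size_t passwdlen,
                              const uint8_t *salt, size_t saltlen,
                              uint64_t c, uint8_t *buf, size_t dkLen);

    static void scrypt_1024_1_1_256_one(const uint8_t *header /* 80 bytes */,
                                        uint8_t *hash /* 32 bytes */,
                                        uint8_t *B /* 128 bytes */)
    {
        /* 1: B <-- PBKDF2(P = header, S = header, c = 1, dkLen = 128) */
        PBKDF2_SHA256(header, 80, header, 80, 1, B, 128);
        /* 2..9: B <-- ROMix(B, N = 1024); done by scrypt_spu_core8() */
        /* 5: hash <-- PBKDF2(P = header, S = B, c = 1, dkLen = 32) */
        PBKDF2_SHA256(header, 80, B, 128, 1, hash, 32);
    }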
+
+static int
+scanhash_scrypt(uint64_t work_restart_ptr, unsigned char *pdata,
+		uint64_t scratchbuf, const unsigned char *ptarget,
+		uint32_t max_nonce, uint32_t *hashes_done)
+{
+	unsigned char data1[80], tmp_hash1[32];
+	unsigned char data2[80], tmp_hash2[32];
+	unsigned char data3[80], tmp_hash3[32];
+	unsigned char data4[80], tmp_hash4[32];
+	unsigned char data5[80], tmp_hash5[32];
+	unsigned char data6[80], tmp_hash6[32];
+	unsigned char data7[80], tmp_hash7[32];
+	unsigned char data8[80], tmp_hash8[32];
+	uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
+	uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+	uint32_t *nonce3 = (uint32_t *)(data3 + 64 + 12);
+	uint32_t *nonce4 = (uint32_t *)(data4 + 64 + 12);
+	uint32_t *nonce5 = (uint32_t *)(data5 + 64 + 12);
+	uint32_t *nonce6 = (uint32_t *)(data6 + 64 + 12);
+	uint32_t *nonce7 = (uint32_t *)(data7 + 64 + 12);
+	uint32_t *nonce8 = (uint32_t *)(data8 + 64 + 12);
+	uint32_t n = 0;
+	uint32_t Htarg = le32dec(ptarget + 28);
+	int i;
+	int tag3 = 3, tag_mask3 = 1 << tag3;
+	int work_restart = 0;
+
+	for (i = 0; i < 80 / 4; i++) {
+		((uint32_t *)data1)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+		((uint32_t *)data2)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+		((uint32_t *)data3)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+		((uint32_t *)data4)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+		((uint32_t *)data5)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+		((uint32_t *)data6)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+		((uint32_t *)data7)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+		((uint32_t *)data8)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+	}
+
+	while (1) {
+		/* request 'work_restart[thr_id].restart' from external memory */
+		mfc_get(&work_restart, work_restart_ptr, 4, tag3, 0, 0);
+
+		le32enc(nonce1, n + 1);
+		le32enc(nonce2, n + 2);
+		le32enc(nonce3, n + 3);
+		le32enc(nonce4, n + 4);
+		le32enc(nonce5, n + 5);
+		le32enc(nonce6, n + 6);
+		le32enc(nonce7, n + 7);
+		le32enc(nonce8, n + 8);
+		scrypt_1024_1_1_256_sp8(data1, tmp_hash1, data2, tmp_hash2,
+					data3, tmp_hash3, data4, tmp_hash4,
+					data5, tmp_hash5, data6, tmp_hash6,
+					data7, tmp_hash7, data8, tmp_hash8,
+					scratchbuf);
+
+		if (le32dec(tmp_hash1 + 28) <= Htarg) {
+			be32enc(pdata + 64 + 12, n + 1);
+			*hashes_done = n + 1;
+			return true;
+		}
+
+		if (le32dec(tmp_hash2 + 28) <= Htarg && n + 2 <= max_nonce) {
+			be32enc(pdata + 64 + 12, n + 2);
+			*hashes_done = n + 2;
+			return true;
+		}
+
+		if (le32dec(tmp_hash3 + 28) <= Htarg && n + 3 <= max_nonce) {
+			be32enc(pdata + 64 + 12, n + 3);
+			*hashes_done = n + 3;
+			return true;
+		}
+
+		if (le32dec(tmp_hash4 + 28) <= Htarg && n + 4 <= max_nonce) {
+			be32enc(pdata + 64 + 12, n + 4);
+			*hashes_done = n + 4;
+			return true;
+		}
+
+		if (le32dec(tmp_hash5 + 28) <= Htarg && n + 5 <= max_nonce) {
+			be32enc(pdata + 64 + 12, n + 5);
+			*hashes_done = n + 5;
+			return true;
+		}
+
+		if (le32dec(tmp_hash6 + 28) <= Htarg && n + 6 <= max_nonce) {
+			be32enc(pdata + 64 + 12, n + 6);
+			*hashes_done = n + 6;
+			return true;
+		}
+
+		if (le32dec(tmp_hash7 + 28) <= Htarg && n + 7 <= max_nonce) {
+			be32enc(pdata + 64 + 12, n + 7);
+			*hashes_done = n + 7;
+			return true;
+		}
+
+		if (le32dec(tmp_hash8 + 28) <= Htarg && n + 8 <= max_nonce) {
+			be32enc(pdata + 64 + 12, n + 8);
+			*hashes_done = n + 8;
+			return true;
+		}
+
+		n += 8;
+
+		if (n >= max_nonce) {
+			*hashes_done = max_nonce;
+			break;
+		}
+
+		/* ensure that 'work_restart[thr_id].restart' has been read */
+		mfc_write_tag_mask(tag_mask3);
+		mfc_read_tag_status_all();
+
+		if (work_restart) {
+			*hashes_done = n;
+			break;
+		}
+	}
+	return false;
+}
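Like the PPU-side scanhash_scrypt(), the loop above filters candidates by
comparing only the most significant 32 bits of the little-endian hash (bytes
28..31) against Htarg; a hash whose top word ties the target but whose lower
words exceed it would slip through, and such rare false positives are left to
the upstream share validation. A complete 256-bit comparison walks the words
from most to least significant; a hypothetical helper, using the le32dec()
that sha256-helpers.h provides:

    #include <stdint.h>
    #include "sha256-helpers.h"    /* le32dec() */

    /* Return nonzero if the little-endian 256-bit hash is <= target. */
    static int hash_below_target(const unsigned char *hash,
                                 const unsigned char *target)
    {
        int i;

        for (i = 28; i >= 0; i -= 4) {
            uint32_t h = le32dec(hash + i);
            uint32_t t = le32dec(target + i);
            if (h < t)
                return 1;
            if (h > t)
                return 0;
        }
        return 1;    /* hash == target also meets the target */
    }

Note also the restart handling: the work_restart flag is fetched by DMA at
the top of each batch under its own tag, and the tag is only waited on after
the eight hashes finish, so the poll costs no extra latency.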
+
+int main(uint64_t spe_id, uint64_t argp, uint64_t envp)
+{
+	static scanhash_spu_args args __attribute__((aligned(16)));
+	int tag = 1, tag_mask = 1 << tag;
+	int rc;
+
+	mfc_get(&args, argp, sizeof(args), tag, 0, 0);
+	mfc_write_tag_mask(tag_mask);
+	mfc_read_tag_status_all();
+
+	rc = scanhash_scrypt(envp, args.data, argp + 1024,
+			     args.target, args.max_nonce,
+			     &args.hashes_done);
+
+	mfc_put(&args, argp, sizeof(args), tag, 0, 0);
+	mfc_write_tag_mask(tag_mask);
+	mfc_read_tag_status_all();
+
+	return rc;
+}
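main() on the SPU receives the effective address of the scanhash_spu_args
block in argp and the address of the thread's work_restart flag in envp,
exactly as passed to spe_context_run() in cpu-miner.c; it DMAs the arguments
in, scans (using argp + 1024 as the base of the external scratchpad), DMAs
the results back, and its return value reaches the PPU in
stop_info.result.spe_exit_code. A more defensive PPU-side read would check
the stop reason first; an illustrative sketch:

    #include <libspe2.h>

    /* Return the SPU program's exit code, or -1 if it did not stop
       via a normal exit. */
    static int spu_exit_code(const spe_stop_info_t *si)
    {
        if (si->stop_reason == SPE_EXIT)
            return si->result.spe_exit_code;
        return -1;    /* SPE_STOP_AND_SIGNAL, SPE_RUNTIME_ERROR, ... */
    }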
diff --git a/scrypt-cell-spu.h b/scrypt-cell-spu.h
new file mode 100644
index 0000000..4af26c4
--- /dev/null
+++ b/scrypt-cell-spu.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#ifndef __SCRYPT_CELL_SPU_H__
+#define __SCRYPT_CELL_SPU_H__
+
+/*
+ * The data structure passed between the main CPU and the SPU cores.
+ */
+typedef struct {
+	uint8_t  data[128];
+	uint8_t  target[32];
+	uint32_t max_nonce;
+	uint32_t hashes_done;
+	uint32_t padding[2];
+} scanhash_spu_args;
+
+#endif
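The explicit padding keeps sizeof(scanhash_spu_args) at 176 bytes
(128 + 32 + 4 + 4 + 8), a multiple of 16, so the whole structure can move in
a single aligned MFC transfer; DMA sizes of 16 bytes or more must be
multiples of 16. A compile-time guard along these lines could document the
invariant (illustrative; _Static_assert needs C11/gcc 4.6+, and the header
assumes <stdint.h> is already included):

    #include <stdint.h>
    #include "scrypt-cell-spu.h"

    _Static_assert(sizeof(scanhash_spu_args) % 16 == 0,
                   "scanhash_spu_args size must be a multiple of the DMA quantum");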
-- 
cgit v1.2.3-18-g5258