author    Siarhei Siamashka <siarhei.siamashka@gmail.com>  2011-12-27 03:51:24 +0200
committer Siarhei Siamashka <siarhei.siamashka@gmail.com>  2011-12-29 06:47:40 +0200
commit    ea4520474ba5a966260fc5652c5248e6eab50d58
tree      18f32becc26cf074fc9b1c9e00a778efa0038ef0 /scrypt-cell-spu.c
parent    7a9481d266b58bd02a7d8033f359ca344cfd65d6
Added Cell/BE optimizations
Linux on PS3 gets a huge boost in litecoin mining performance. Cell/BE support should be detected and enabled automatically by autotools. The miner threads are first allocated to the available SPU cores (typically 6), and the remaining threads run on the PPU, so a PS3 ends up with 8 threads in total: 6 SPU threads and 2 PPU threads. Each SPU core provides ~5.4 khash/s when compiled with spu-elf-gcc 4.6; performance may vary with the gcc version, and older releases are typically slower.
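The PPU-side launcher is not part of this diff. As a rough sketch of the allocation policy described above, assuming the libspe2 API, it could look like the following; scrypt_spu_program, miner_thread_ppu, and start_miner_threads are illustrative names, not code from this commit:

#include <pthread.h>
#include <libspe2.h>

extern spe_program_handle_t scrypt_spu_program; /* embedded SPU binary (assumed name) */
void *miner_thread_ppu(void *arg);              /* plain C scrypt miner (assumed) */

static void *miner_thread_spu(void *arg)
{
	unsigned int entry = SPE_DEFAULT_ENTRY;
	spe_context_ptr_t ctx = spe_context_create(0, NULL);

	spe_program_load(ctx, &scrypt_spu_program);
	/* argp would point at the args/scratch block in main memory */
	spe_context_run(ctx, &entry, 0, arg, NULL, NULL);
	spe_context_destroy(ctx);
	return NULL;
}

void start_miner_threads(int nthreads, void *args[], pthread_t tids[])
{
	int i, nspus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);

	for (i = 0; i < nthreads; i++) {
		/* fill the usable SPU cores first, then fall back to the PPU */
		pthread_create(&tids[i], NULL,
			       i < nspus ? miner_thread_spu : miner_thread_ppu,
			       args[i]);
	}
}

With 6 usable SPEs on a PS3 and nthreads = 8, this yields the 6 SPU + 2 PPU split mentioned above.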
Diffstat (limited to 'scrypt-cell-spu.c')
-rw-r--r--  scrypt-cell-spu.c  523
1 file changed, 523 insertions, 0 deletions
diff --git a/scrypt-cell-spu.c b/scrypt-cell-spu.c
new file mode 100644
index 0000000..a5a741f
--- /dev/null
+++ b/scrypt-cell-spu.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+#include "sha256-helpers.h"
+#include "scrypt-simd-helpers.h"
+#include "scrypt-cell-spu.h"
+
+#define true 1
+#define false 0
+
+/*****************************************************************************/
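+
+/*
+ * SPU implementation strategy: eight scrypt hashes are computed in
+ * parallel.  All working state lives in the 256K local store, while the
+ * eight 128K scratchpads (N = 1024, r = 1) reside in main memory and
+ * are streamed in and out with MFC DMA lists, overlapping the transfers
+ * with SIMD computation.
+ */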
+
+static inline __attribute__((always_inline)) void
+salsa20_8_xor4(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
+ uint32x4 * __restrict C, const uint32x4 * __restrict Cx,
+ uint32x4 * __restrict D, const uint32x4 * __restrict Dx,
+ uint32x4 * __restrict E, const uint32x4 * __restrict Ex)
+{
+ uint32x4 X0, X1, X2, X3;
+ uint32x4 Y0, Y1, Y2, Y3;
+ uint32x4 Z0, Z1, Z2, Z3;
+ uint32x4 W0, W1, W2, W3;
+ int i;
+
+ X0 = (B[0] ^= Bx[0]);
+ X1 = (B[1] ^= Bx[1]);
+ X2 = (B[2] ^= Bx[2]);
+ X3 = (B[3] ^= Bx[3]);
+ Y0 = (C[0] ^= Cx[0]);
+ Y1 = (C[1] ^= Cx[1]);
+ Y2 = (C[2] ^= Cx[2]);
+ Y3 = (C[3] ^= Cx[3]);
+ Z0 = (D[0] ^= Dx[0]);
+ Z1 = (D[1] ^= Dx[1]);
+ Z2 = (D[2] ^= Dx[2]);
+ Z3 = (D[3] ^= Dx[3]);
+ W0 = (E[0] ^= Ex[0]);
+ W1 = (E[1] ^= Ex[1]);
+ W2 = (E[2] ^= Ex[2]);
+ W3 = (E[3] ^= Ex[3]);
+
+ for (i = 0; i < 8; i += 2) {
+ /* Operate on "columns". */
+ X1 ^= rol_32x4(X0 + X3, 7);
+ Y1 ^= rol_32x4(Y0 + Y3, 7);
+ Z1 ^= rol_32x4(Z0 + Z3, 7);
+ W1 ^= rol_32x4(W0 + W3, 7);
+ X2 ^= rol_32x4(X1 + X0, 9);
+ Y2 ^= rol_32x4(Y1 + Y0, 9);
+ Z2 ^= rol_32x4(Z1 + Z0, 9);
+ W2 ^= rol_32x4(W1 + W0, 9);
+ X3 ^= rol_32x4(X2 + X1, 13);
+ Y3 ^= rol_32x4(Y2 + Y1, 13);
+ Z3 ^= rol_32x4(Z2 + Z1, 13);
+ W3 ^= rol_32x4(W2 + W1, 13);
+ X0 ^= rol_32x4(X3 + X2, 18);
+ Y0 ^= rol_32x4(Y3 + Y2, 18);
+ Z0 ^= rol_32x4(Z3 + Z2, 18);
+ W0 ^= rol_32x4(W3 + W2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 3, 0, 1, 2);
+ Y1 = shuffle_32x4(Y1, 3, 0, 1, 2);
+ Z1 = shuffle_32x4(Z1, 3, 0, 1, 2);
+ W1 = shuffle_32x4(W1, 3, 0, 1, 2);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ Z2 = shuffle_32x4(Z2, 2, 3, 0, 1);
+ W2 = shuffle_32x4(W2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 1, 2, 3, 0);
+ Y3 = shuffle_32x4(Y3, 1, 2, 3, 0);
+ Z3 = shuffle_32x4(Z3, 1, 2, 3, 0);
+ W3 = shuffle_32x4(W3, 1, 2, 3, 0);
+
+ /* Operate on "rows". */
+ X3 ^= rol_32x4(X0 + X1, 7);
+ Y3 ^= rol_32x4(Y0 + Y1, 7);
+ Z3 ^= rol_32x4(Z0 + Z1, 7);
+ W3 ^= rol_32x4(W0 + W1, 7);
+ X2 ^= rol_32x4(X3 + X0, 9);
+ Y2 ^= rol_32x4(Y3 + Y0, 9);
+ Z2 ^= rol_32x4(Z3 + Z0, 9);
+ W2 ^= rol_32x4(W3 + W0, 9);
+ X1 ^= rol_32x4(X2 + X3, 13);
+ Y1 ^= rol_32x4(Y2 + Y3, 13);
+ Z1 ^= rol_32x4(Z2 + Z3, 13);
+ W1 ^= rol_32x4(W2 + W3, 13);
+ X0 ^= rol_32x4(X1 + X2, 18);
+ Y0 ^= rol_32x4(Y1 + Y2, 18);
+ Z0 ^= rol_32x4(Z1 + Z2, 18);
+ W0 ^= rol_32x4(W1 + W2, 18);
+
+ /* Rearrange data. */
+ X1 = shuffle_32x4(X1, 1, 2, 3, 0);
+ Y1 = shuffle_32x4(Y1, 1, 2, 3, 0);
+ Z1 = shuffle_32x4(Z1, 1, 2, 3, 0);
+ W1 = shuffle_32x4(W1, 1, 2, 3, 0);
+ X2 = shuffle_32x4(X2, 2, 3, 0, 1);
+ Y2 = shuffle_32x4(Y2, 2, 3, 0, 1);
+ Z2 = shuffle_32x4(Z2, 2, 3, 0, 1);
+ W2 = shuffle_32x4(W2, 2, 3, 0, 1);
+ X3 = shuffle_32x4(X3, 3, 0, 1, 2);
+ Y3 = shuffle_32x4(Y3, 3, 0, 1, 2);
+ Z3 = shuffle_32x4(Z3, 3, 0, 1, 2);
+ W3 = shuffle_32x4(W3, 3, 0, 1, 2);
+ }
+
+ B[0] += X0;
+ B[1] += X1;
+ B[2] += X2;
+ B[3] += X3;
+ C[0] += Y0;
+ C[1] += Y1;
+ C[2] += Y2;
+ C[3] += Y3;
+ D[0] += Z0;
+ D[1] += Z1;
+ D[2] += Z2;
+ D[3] += Z3;
+ E[0] += W0;
+ E[1] += W1;
+ E[2] += W2;
+ E[3] += W3;
+}
+
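+/*
+ * One SMix pass (N = 1024) for eight interleaved hashes.  Loop 1 stores
+ * each freshly computed 128-byte block of all eight lanes to the
+ * main-memory scratchpads with a single mfc_putl DMA list while the
+ * next batch of Salsa20/8 rounds runs.  Loop 2 gathers the randomly
+ * indexed blocks back with mfc_getl, using two DMA tags so that lanes
+ * A-D compute while lanes E-H transfer and vice versa.  The buffers are
+ * static to keep them 128-byte aligned and off the SPU stack.
+ */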
+static void
+scrypt_spu_core8(uint8_t *databuf, uint64_t scratch)
+{
+ static mfc_list_element_t dma_list[8] __attribute__((aligned(128)));
+ static XY X[8] __attribute__((aligned(128)));
+ static uint32x4 Y[8 * 8] __attribute__((aligned(128)));
+ static uint32x4 Z[8 * 8] __attribute__((aligned(128)));
+ XY * XA = &X[0];
+ XY * XB = &X[1];
+ XY * XC = &X[2];
+ XY * XD = &X[3];
+ XY * XE = &X[4];
+ XY * XF = &X[5];
+ XY * XG = &X[6];
+ XY * XH = &X[7];
+
+ uint64_t VA = (scratch + 128 * 1024 * 0);
+ uint64_t VB = (scratch + 128 * 1024 * 1);
+ uint64_t VC = (scratch + 128 * 1024 * 2);
+ uint64_t VD = (scratch + 128 * 1024 * 3);
+ uint64_t VE = (scratch + 128 * 1024 * 4);
+ uint64_t VF = (scratch + 128 * 1024 * 5);
+ uint64_t VG = (scratch + 128 * 1024 * 6);
+ uint64_t VH = (scratch + 128 * 1024 * 7);
+ int i;
+ int tag1 = 1, tag_mask1 = 1 << tag1;
+ int tag2 = 2, tag_mask2 = 1 << tag2;
+
+ /* 1: X <-- B */
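+	/*
+	 * The 'i * 5 % 16' index permutation loads the 16-word Salsa20
+	 * state in diagonal order, so that each uint32x4 vector holds
+	 * one rotated column as expected by salsa20_8_xor4 above.
+	 */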
+ for (i = 0; i < 16; i++) {
+ XA->w[i] = le32dec(&databuf[0 * 128 + (i * 5 % 16) * 4]);
+ XA->w[16 + i] = le32dec(&databuf[0 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XB->w[i] = le32dec(&databuf[1 * 128 + (i * 5 % 16) * 4]);
+ XB->w[16 + i] = le32dec(&databuf[1 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XC->w[i] = le32dec(&databuf[2 * 128 + (i * 5 % 16) * 4]);
+ XC->w[16 + i] = le32dec(&databuf[2 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XD->w[i] = le32dec(&databuf[3 * 128 + (i * 5 % 16) * 4]);
+ XD->w[16 + i] = le32dec(&databuf[3 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XE->w[i] = le32dec(&databuf[4 * 128 + (i * 5 % 16) * 4]);
+ XE->w[16 + i] = le32dec(&databuf[4 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XF->w[i] = le32dec(&databuf[5 * 128 + (i * 5 % 16) * 4]);
+ XF->w[16 + i] = le32dec(&databuf[5 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XG->w[i] = le32dec(&databuf[6 * 128 + (i * 5 % 16) * 4]);
+ XG->w[16 + i] = le32dec(&databuf[6 * 128 + (16 + (i * 5 % 16)) * 4]);
+ XH->w[i] = le32dec(&databuf[7 * 128 + (i * 5 % 16) * 4]);
+ XH->w[16 + i] = le32dec(&databuf[7 * 128 + (16 + (i * 5 % 16)) * 4]);
+ }
+ for (i = 0; i < 8; i++)
+ dma_list[i].size = 128;
+
+ /* 2: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ blkcpy128(&Z[0 * 8], &XA->q[0]);
+ blkcpy128(&Z[1 * 8], &XB->q[0]);
+ blkcpy128(&Z[2 * 8], &XC->q[0]);
+ blkcpy128(&Z[3 * 8], &XD->q[0]);
+ blkcpy128(&Z[4 * 8], &XE->q[0]);
+ blkcpy128(&Z[5 * 8], &XF->q[0]);
+ blkcpy128(&Z[6 * 8], &XG->q[0]);
+ blkcpy128(&Z[7 * 8], &XH->q[0]);
+ dma_list[0].eal = mfc_ea2l(VA + i * 128);
+ dma_list[1].eal = mfc_ea2l(VB + i * 128);
+ dma_list[2].eal = mfc_ea2l(VC + i * 128);
+ dma_list[3].eal = mfc_ea2l(VD + i * 128);
+ dma_list[4].eal = mfc_ea2l(VE + i * 128);
+ dma_list[5].eal = mfc_ea2l(VF + i * 128);
+ dma_list[6].eal = mfc_ea2l(VG + i * 128);
+ dma_list[7].eal = mfc_ea2l(VH + i * 128);
+ mfc_putl(&Z[0], scratch, &dma_list[0], 8 * sizeof(mfc_list_element_t), tag1, 0, 0);
+ salsa20_8_xor4(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4], &XC->q[0], &XC->q[4], &XD->q[0], &XD->q[4]);
+ salsa20_8_xor4(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0], &XC->q[4], &XC->q[0], &XD->q[4], &XD->q[0]);
+ salsa20_8_xor4(&XE->q[0], &XE->q[4], &XF->q[0], &XF->q[4], &XG->q[0], &XG->q[4], &XH->q[0], &XH->q[4]);
+ salsa20_8_xor4(&XE->q[4], &XE->q[0], &XF->q[4], &XF->q[0], &XG->q[4], &XG->q[0], &XH->q[4], &XH->q[0]);
+ mfc_write_tag_mask(tag_mask1);
+ mfc_read_tag_status_all();
+ }
+
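+	/* Prime the double buffer: start fetching the first randomly
+	 * indexed blocks for lanes A-D (tag1) and E-H (tag2) before
+	 * entering the second loop. */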
+ dma_list[0].eal = mfc_ea2l(VA + (XA->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[1].eal = mfc_ea2l(VB + (XB->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[2].eal = mfc_ea2l(VC + (XC->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[3].eal = mfc_ea2l(VD + (XD->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ mfc_getl(&Y[0], scratch, &dma_list[0], 4 * sizeof(mfc_list_element_t), tag1, 0, 0);
+
+ dma_list[4].eal = mfc_ea2l(VE + (XE->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[5].eal = mfc_ea2l(VF + (XF->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[6].eal = mfc_ea2l(VG + (XG->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[7].eal = mfc_ea2l(VH + (XH->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ mfc_getl(&Y[4 * 8], scratch, &dma_list[4], 4 * sizeof(mfc_list_element_t), tag2, 0, 0);
+
+ /* 6: for i = 0 to N - 1 do */
+ for (i = 0; i < 1024; i++) {
+ mfc_write_tag_mask(tag_mask1);
+ mfc_read_tag_status_all();
+		blkxor128(XA->q, &Y[0 * 8]);
+ blkxor128(XB->q, &Y[1 * 8]);
+ blkxor128(XC->q, &Y[2 * 8]);
+ blkxor128(XD->q, &Y[3 * 8]);
+ salsa20_8_xor4(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4], &XC->q[0], &XC->q[4], &XD->q[0], &XD->q[4]);
+ salsa20_8_xor4(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0], &XC->q[4], &XC->q[0], &XD->q[4], &XD->q[0]);
+
+ dma_list[0].eal = mfc_ea2l(VA + (XA->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[1].eal = mfc_ea2l(VB + (XB->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[2].eal = mfc_ea2l(VC + (XC->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[3].eal = mfc_ea2l(VD + (XD->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ mfc_getl(&Y[0], scratch, &dma_list[0], 4 * sizeof(mfc_list_element_t), tag1, 0, 0);
+
+ mfc_write_tag_mask(tag_mask2);
+ mfc_read_tag_status_all();
+ blkxor128(XE->q, &Y[4 * 8]);
+ blkxor128(XF->q, &Y[5 * 8]);
+ blkxor128(XG->q, &Y[6 * 8]);
+ blkxor128(XH->q, &Y[7 * 8]);
+ salsa20_8_xor4(&XE->q[0], &XE->q[4], &XF->q[0], &XF->q[4], &XG->q[0], &XG->q[4], &XH->q[0], &XH->q[4]);
+ salsa20_8_xor4(&XE->q[4], &XE->q[0], &XF->q[4], &XF->q[0], &XG->q[4], &XG->q[0], &XH->q[4], &XH->q[0]);
+
+ dma_list[4].eal = mfc_ea2l(VE + (XE->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[5].eal = mfc_ea2l(VF + (XF->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[6].eal = mfc_ea2l(VG + (XG->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ dma_list[7].eal = mfc_ea2l(VH + (XH->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
+ mfc_getl(&Y[4 * 8], scratch, &dma_list[4], 4 * sizeof(mfc_list_element_t), tag2, 0, 0);
+ }
+
+ /* 10: B' <-- X */
+ for (i = 0; i < 16; i++) {
+ le32enc(&databuf[0 * 128 + (i * 5 % 16) * 4], XA->w[i]);
+ le32enc(&databuf[0 * 128 + (16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
+ le32enc(&databuf[1 * 128 + (i * 5 % 16) * 4], XB->w[i]);
+ le32enc(&databuf[1 * 128 + (16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
+ le32enc(&databuf[2 * 128 + (i * 5 % 16) * 4], XC->w[i]);
+ le32enc(&databuf[2 * 128 + (16 + (i * 5 % 16)) * 4], XC->w[16 + i]);
+ le32enc(&databuf[3 * 128 + (i * 5 % 16) * 4], XD->w[i]);
+ le32enc(&databuf[3 * 128 + (16 + (i * 5 % 16)) * 4], XD->w[16 + i]);
+ le32enc(&databuf[4 * 128 + (i * 5 % 16) * 4], XE->w[i]);
+ le32enc(&databuf[4 * 128 + (16 + (i * 5 % 16)) * 4], XE->w[16 + i]);
+ le32enc(&databuf[5 * 128 + (i * 5 % 16) * 4], XF->w[i]);
+ le32enc(&databuf[5 * 128 + (16 + (i * 5 % 16)) * 4], XF->w[16 + i]);
+ le32enc(&databuf[6 * 128 + (i * 5 % 16) * 4], XG->w[i]);
+ le32enc(&databuf[6 * 128 + (16 + (i * 5 % 16)) * 4], XG->w[16 + i]);
+ le32enc(&databuf[7 * 128 + (i * 5 % 16) * 4], XH->w[i]);
+ le32enc(&databuf[7 * 128 + (16 + (i * 5 % 16)) * 4], XH->w[16 + i]);
+ }
+}
+
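+/*
+ * scrypt(N = 1024, r = 1, p = 1) for eight 80-byte inputs at once: the
+ * PBKDF2 pre- and post-processing runs as plain scalar code on the SPU,
+ * with the memory-hard part handled by scrypt_spu_core8() above.
+ */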
+static void
+scrypt_1024_1_1_256_sp8(const unsigned char * input1,
+ unsigned char * output1,
+ const unsigned char * input2,
+ unsigned char * output2,
+ const unsigned char * input3,
+ unsigned char * output3,
+ const unsigned char * input4,
+ unsigned char * output4,
+ const unsigned char * input5,
+ unsigned char * output5,
+ const unsigned char * input6,
+ unsigned char * output6,
+ const unsigned char * input7,
+ unsigned char * output7,
+ const unsigned char * input8,
+ unsigned char * output8,
+ uint64_t scratchpad)
+{
+ static uint8_t databuf[128 * 8] __attribute__((aligned(128)));
+ uint8_t * B1, * B2, * B3, * B4, * B5, * B6, * B7, * B8;
+
+ const uint32_t r = 1;
+ const uint32_t p = 1;
+
+ B1 = databuf;
+ B2 = databuf + 128 * 1;
+ B3 = databuf + 128 * 2;
+ B4 = databuf + 128 * 3;
+ B5 = databuf + 128 * 4;
+ B6 = databuf + 128 * 5;
+ B7 = databuf + 128 * 6;
+ B8 = databuf + 128 * 7;
+
+	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+	PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t*)input3, 80, (const uint8_t*)input3, 80, 1, B3, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t*)input4, 80, (const uint8_t*)input4, 80, 1, B4, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t*)input5, 80, (const uint8_t*)input5, 80, 1, B5, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t*)input6, 80, (const uint8_t*)input6, 80, 1, B6, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t*)input7, 80, (const uint8_t*)input7, 80, 1, B7, p * 128 * r);
+	PBKDF2_SHA256((const uint8_t*)input8, 80, (const uint8_t*)input8, 80, 1, B8, p * 128 * r);
+
+ scrypt_spu_core8(databuf, scratchpad);
+
+	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+	PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
+	PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
+	PBKDF2_SHA256((const uint8_t*)input3, 80, B3, p * 128 * r, 1, (uint8_t*)output3, 32);
+	PBKDF2_SHA256((const uint8_t*)input4, 80, B4, p * 128 * r, 1, (uint8_t*)output4, 32);
+	PBKDF2_SHA256((const uint8_t*)input5, 80, B5, p * 128 * r, 1, (uint8_t*)output5, 32);
+	PBKDF2_SHA256((const uint8_t*)input6, 80, B6, p * 128 * r, 1, (uint8_t*)output6, 32);
+	PBKDF2_SHA256((const uint8_t*)input7, 80, B7, p * 128 * r, 1, (uint8_t*)output7, 32);
+	PBKDF2_SHA256((const uint8_t*)input8, 80, B8, p * 128 * r, 1, (uint8_t*)output8, 32);
+}
+
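+/*
+ * Scan nonces eight at a time.  The 'restart' flag lives in main memory
+ * (its effective address is passed in work_restart_ptr); the DMA fetch
+ * is started before hashing and only waited on afterwards, so polling
+ * it is effectively free.
+ */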
+static int
+scanhash_scrypt(uint64_t work_restart_ptr, unsigned char *pdata,
+ uint64_t scratchbuf, const unsigned char *ptarget,
+ uint32_t max_nonce, uint32_t *hashes_done)
+{
+ unsigned char data1[80];
+ unsigned char tmp_hash1[32];
+ unsigned char data2[80];
+ unsigned char tmp_hash2[32];
+ unsigned char data3[80];
+ unsigned char tmp_hash3[32];
+ unsigned char data4[80];
+ unsigned char tmp_hash4[32];
+ unsigned char data5[80];
+ unsigned char tmp_hash5[32];
+ unsigned char data6[80];
+ unsigned char tmp_hash6[32];
+ unsigned char data7[80];
+ unsigned char tmp_hash7[32];
+ unsigned char data8[80];
+ unsigned char tmp_hash8[32];
+ uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
+ uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+ uint32_t *nonce3 = (uint32_t *)(data3 + 64 + 12);
+ uint32_t *nonce4 = (uint32_t *)(data4 + 64 + 12);
+ uint32_t *nonce5 = (uint32_t *)(data5 + 64 + 12);
+ uint32_t *nonce6 = (uint32_t *)(data6 + 64 + 12);
+ uint32_t *nonce7 = (uint32_t *)(data7 + 64 + 12);
+ uint32_t *nonce8 = (uint32_t *)(data8 + 64 + 12);
+ uint32_t n = 0;
+ uint32_t Htarg = le32dec(ptarget + 28);
+ int i;
+ int tag3 = 3, tag_mask3 = 1 << tag3;
+ int work_restart = 0;
+
+ for (i = 0; i < 80/4; i++) {
+ ((uint32_t *)data1)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data2)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data3)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data4)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data5)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data6)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data7)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ ((uint32_t *)data8)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+ }
+
+ while(1) {
+ /* request 'work_restart[thr_id].restart' from external memory */
+ mfc_get(&work_restart, work_restart_ptr, 4, tag3, 0, 0);
+
+ le32enc(nonce1, n + 1);
+ le32enc(nonce2, n + 2);
+ le32enc(nonce3, n + 3);
+ le32enc(nonce4, n + 4);
+ le32enc(nonce5, n + 5);
+ le32enc(nonce6, n + 6);
+ le32enc(nonce7, n + 7);
+ le32enc(nonce8, n + 8);
+ scrypt_1024_1_1_256_sp8(data1, tmp_hash1, data2, tmp_hash2,
+ data3, tmp_hash3, data4, tmp_hash4,
+ data5, tmp_hash5, data6, tmp_hash6,
+ data7, tmp_hash7, data8, tmp_hash8,
+ scratchbuf);
+
+ if (le32dec(tmp_hash1+28) <= Htarg) {
+ be32enc(pdata + 64 + 12, n + 1);
+			*hashes_done = n + 1;
+ return true;
+ }
+
+ if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 2);
+ *hashes_done = n + 2;
+ return true;
+ }
+
+ if (le32dec(tmp_hash3+28) <= Htarg && n + 3 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 3);
+ *hashes_done = n + 3;
+ return true;
+ }
+
+ if (le32dec(tmp_hash4+28) <= Htarg && n + 4 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 4);
+ *hashes_done = n + 4;
+ return true;
+ }
+
+ if (le32dec(tmp_hash5+28) <= Htarg && n + 5 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 5);
+ *hashes_done = n + 5;
+ return true;
+ }
+
+ if (le32dec(tmp_hash6+28) <= Htarg && n + 6 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 6);
+ *hashes_done = n + 6;
+ return true;
+ }
+
+ if (le32dec(tmp_hash7+28) <= Htarg && n + 7 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 7);
+ *hashes_done = n + 7;
+ return true;
+ }
+
+ if (le32dec(tmp_hash8+28) <= Htarg && n + 8 <= max_nonce) {
+ be32enc(pdata + 64 + 12, n + 8);
+ *hashes_done = n + 8;
+ return true;
+ }
+
+ n += 8;
+
+ if (n >= max_nonce) {
+ *hashes_done = max_nonce;
+ break;
+ }
+
+ /* ensure that 'work_restart[thr_id].restart' has been read */
+ mfc_write_tag_mask(tag_mask3);
+ mfc_read_tag_status_all();
+
+ if (work_restart) {
+ *hashes_done = n;
+ break;
+ }
+ }
+ return false;
+}
+
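+/*
+ * SPU entry point.  'argp' is the effective address of a
+ * scanhash_spu_args block in main memory, with the scratchpads placed
+ * at argp + 1024, and 'envp' carries the address of the restart flag.
+ * The arguments are DMA'd in, the scan runs, and the results (nonce,
+ * hashes_done) are DMA'd back.
+ */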
+int main(uint64_t spe_id, uint64_t argp, uint64_t envp)
+{
+ static scanhash_spu_args args __attribute__((aligned(16)));
+ int tag = 1, tag_mask = 1 << tag;
+ int rc;
+
+ mfc_get(&args, argp, sizeof(args), tag, 0, 0);
+ mfc_write_tag_mask(tag_mask);
+ mfc_read_tag_status_all();
+
+ rc = scanhash_scrypt(envp, args.data, argp + 1024,
+ args.target, args.max_nonce,
+ &args.hashes_done);
+
+ mfc_put(&args, argp, sizeof(args), tag, 0, 0);
+ mfc_write_tag_mask(tag_mask);
+ mfc_read_tag_status_all();
+
+ return rc;
+}