Use the SHA256 code tweaked by pooler

Because the data endianness has changed (native instead of little endian) and the SHA256 function arguments are also different now, this required lots of changes all over the place. Improves Altivec performance on Cell PPU from ~3.4 khash/s to ~3.6 khash/s (two threads). Seems to have no effect on SPU performance though.
author: Siarhei Siamashka <siarhei.siamashka@gmail.com> 2012-01-01 01:47:25 +0200
committer: Siarhei Siamashka <siarhei.siamashka@gmail.com> 2012-01-01 03:27:09 +0200
commit: 6d1e5bf3c2300623153479fd43c08a5874dacbbc (patch)
tree: 04dfa8c2bd12c2bb56419c2fd54fc529d04896ed /scrypt.c
parent: 4280aca375f60981d70826091f1ece82e7cf56c5 (diff)
1 files changed, 120 insertions, 219 deletions
diff --git a/scrypt.c b/scrypt.c
index 6f70cc2..3288f60 100644
--- a/scrypt.c
+++ b/scrypt.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright 2009 Colin Percival, 2011 ArtForz
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -37,228 +37,134 @@
 #include "sha256-helpers.h"
 #include "scrypt-simd-helpers.h"
 
-static void blkcpy(void *, void *, size_t);
-static void blkxor(void *, void *, size_t);
-static void salsa20_8(uint32_t[16]);
-static void blockmix_salsa8(uint32_t *, uint32_t *, uint32_t *, size_t);
-static uint64_t integerify(void *, size_t);
-static void smix(uint8_t *, size_t, uint64_t, uint32_t *, uint32_t *);
-
-static void
-blkcpy(void * dest, void * src, size_t len)
-{
-	size_t * D = dest;
-	size_t * S = src;
-	size_t L = len / sizeof(size_t);
-	size_t i;
-
-	for (i = 0; i < L; i++)
-		D[i] = S[i];
-}
-
-static void
-blkxor(void * dest, void * src, size_t len)
-{
-	size_t * D = dest;
-	size_t * S = src;
-	size_t L = len / sizeof(size_t);
-	size_t i;
-
-	for (i = 0; i < L; i++)
-		D[i] ^= S[i];
-}
-
 /**
  * salsa20_8(B):
  * Apply the salsa20/8 core to the provided block.
  */
-static void
-salsa20_8(uint32_t B[16])
+static inline void
+salsa20_8(uint32_t B[16], const uint32_t Bx[16])
 {
-	uint32_t x[16];
+	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
 	size_t i;
 
-	blkcpy(x, B, 64);
+	x00 = (B[ 0] ^= Bx[ 0]);
+	x01 = (B[ 1] ^= Bx[ 1]);
+	x02 = (B[ 2] ^= Bx[ 2]);
+	x03 = (B[ 3] ^= Bx[ 3]);
+	x04 = (B[ 4] ^= Bx[ 4]);
+	x05 = (B[ 5] ^= Bx[ 5]);
+	x06 = (B[ 6] ^= Bx[ 6]);
+	x07 = (B[ 7] ^= Bx[ 7]);
+	x08 = (B[ 8] ^= Bx[ 8]);
+	x09 = (B[ 9] ^= Bx[ 9]);
+	x10 = (B[10] ^= Bx[10]);
+	x11 = (B[11] ^= Bx[11]);
+	x12 = (B[12] ^= Bx[12]);
+	x13 = (B[13] ^= Bx[13]);
+	x14 = (B[14] ^= Bx[14]);
+	x15 = (B[15] ^= Bx[15]);
 	for (i = 0; i < 8; i += 2) {
 #define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
 		/* Operate on columns. */
-		x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
-		x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
-
-		x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
-		x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
-
-		x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
-		x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
-
-		x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
-		x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
+		x04 ^= R(x00+x12, 7);	x09 ^= R(x05+x01, 7);	x14 ^= R(x10+x06, 7);	x03 ^= R(x15+x11, 7);
+		x08 ^= R(x04+x00, 9);	x13 ^= R(x09+x05, 9);	x02 ^= R(x14+x10, 9);	x07 ^= R(x03+x15, 9);
+		x12 ^= R(x08+x04,13);	x01 ^= R(x13+x09,13);	x06 ^= R(x02+x14,13);	x11 ^= R(x07+x03,13);
+		x00 ^= R(x12+x08,18);	x05 ^= R(x01+x13,18);	x10 ^= R(x06+x02,18);	x15 ^= R(x11+x07,18);
 
 		/* Operate on rows. */
-		x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
-		x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
-
-		x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
-		x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
-
-		x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
-		x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
-
-		x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
-		x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
+		x01 ^= R(x00+x03, 7);	x06 ^= R(x05+x04, 7);	x11 ^= R(x10+x09, 7);	x12 ^= R(x15+x14, 7);
+		x02 ^= R(x01+x00, 9);	x07 ^= R(x06+x05, 9);	x08 ^= R(x11+x10, 9);	x13 ^= R(x12+x15, 9);
+		x03 ^= R(x02+x01,13);	x04 ^= R(x07+x06,13);	x09 ^= R(x08+x11,13);	x14 ^= R(x13+x12,13);
+		x00 ^= R(x03+x02,18);	x05 ^= R(x04+x07,18);	x10 ^= R(x09+x08,18);	x15 ^= R(x14+x13,18);
 #undef R
 	}
-	for (i = 0; i < 16; i++)
-		B[i] += x[i];
+	B[ 0] += x00;
+	B[ 1] += x01;
+	B[ 2] += x02;
+	B[ 3] += x03;
+	B[ 4] += x04;
+	B[ 5] += x05;
+	B[ 6] += x06;
+	B[ 7] += x07;
+	B[ 8] += x08;
+	B[ 9] += x09;
+	B[10] += x10;
+	B[11] += x11;
+	B[12] += x12;
+	B[13] += x13;
+	B[14] += x14;
+	B[15] += x15;
 }
 
-/**
- * blockmix_salsa8(Bin, Bout, X, r):
- * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
- * bytes in length; the output Bout must also be the same size.  The
- * temporary space X must be 64 bytes.
- */
-static void
-blockmix_salsa8(uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r)
+static inline void scrypt_core1(uint32_t *X, uint32_t *V)
 {
-	size_t i;
-
-	/* 1: X <-- B_{2r - 1} */
-	blkcpy(X, &Bin[(2 * r - 1) * 16], 64);
-
-	/* 2: for i = 0 to 2r - 1 do */
-	for (i = 0; i < 2 * r; i += 2) {
-		/* 3: X <-- H(X \xor B_i) */
-		blkxor(X, &Bin[i * 16], 64);
-		salsa20_8(X);
+	uint32_t i;
+	uint32_t j;
+	uint32_t k;
+	uint64_t *p1, *p2;
+	p1 = (uint64_t *)X;
+	for (i = 0; i < 1024; i += 2) {
+		memcpy(&V[i * 32], X, 128);
 
-		/* 4: Y_i <-- X */
-		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		blkcpy(&Bout[i * 8], X, 64);
+		salsa20_8(&X[0], &X[16]);
+		salsa20_8(&X[16], &X[0]);
 
-		/* 3: X <-- H(X \xor B_i) */
-		blkxor(X, &Bin[i * 16 + 16], 64);
-		salsa20_8(X);
+		memcpy(&V[(i + 1) * 32], X, 128);
 
-		/* 4: Y_i <-- X */
-		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		blkcpy(&Bout[i * 8 + r * 16], X, 64);
+		salsa20_8(&X[0], &X[16]);
+		salsa20_8(&X[16], &X[0]);
 	}
-}
-
-/**
- * integerify(B, r):
- * Return the result of parsing B_{2r-1} as a little-endian integer.
- */
-static uint64_t
-integerify(void * B, size_t r)
-{
-	uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
-
-	return (((uint64_t)(X[1]) << 32) + X[0]);
-}
-
-/**
- * smix(B, r, N, V, XY):
- * Compute B = SMix_r(B, N).  The input B must be 128r bytes in length;
- * the temporary storage V must be 128rN bytes in length; the temporary
- * storage XY must be 256r + 64 bytes in length.  The value N must be a
- * power of 2 greater than 1.  The arrays B, V, and XY must be aligned to a
- * multiple of 64 bytes.
- */
-static void
-smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
-{
-	uint32_t * X = XY;
-	uint32_t * Y = &XY[32 * r];
-	uint32_t * Z = &XY[64 * r];
-	uint64_t i;
-	uint64_t j;
-	size_t k;
-
-	/* 1: X <-- B */
-	for (k = 0; k < 32 * r; k++)
-		X[k] = le32dec(&B[4 * k]);
-
-	/* 2: for i = 0 to N - 1 do */
-	for (i = 0; i < N; i += 2) {
-		/* 3: V_i <-- X */
-		blkcpy(&V[i * (32 * r)], X, 128 * r);
-
-		/* 4: X <-- H(X) */
-		blockmix_salsa8(X, Y, Z, r);
-
-		/* 3: V_i <-- X */
-		blkcpy(&V[(i + 1) * (32 * r)], Y, 128 * r);
-
-		/* 4: X <-- H(X) */
-		blockmix_salsa8(Y, X, Z, r);
+	for (i = 0; i < 1024; i += 2) {
+		j = X[16] & 1023;
+		p2 = (uint64_t *)(&V[j * 32]);
+		for(k = 0; k < 16; k++)
+			p1[k] ^= p2[k];
+
+		salsa20_8(&X[0], &X[16]);
+		salsa20_8(&X[16], &X[0]);
+
+		j = X[16] & 1023;
+		p2 = (uint64_t *)(&V[j * 32]);
+		for(k = 0; k < 16; k++)
+			p1[k] ^= p2[k];
+
+		salsa20_8(&X[0], &X[16]);
+		salsa20_8(&X[16], &X[0]);
 	}
-
-	/* 6: for i = 0 to N - 1 do */
-	for (i = 0; i < N; i += 2) {
-		/* 7: j <-- Integerify(X) mod N */
-		j = integerify(X, r) & (N - 1);
-
-		/* 8: X <-- H(X \xor V_j) */
-		blkxor(X, &V[j * (32 * r)], 128 * r);
-		blockmix_salsa8(X, Y, Z, r);
-
-		/* 7: j <-- Integerify(X) mod N */
-		j = integerify(Y, r) & (N - 1);
-
-		/* 8: X <-- H(X \xor V_j) */
-		blkxor(Y, &V[j * (32 * r)], 128 * r);
-		blockmix_salsa8(Y, X, Z, r);
-	}
-
-	/* 10: B' <-- X */
-	for (k = 0; k < 32 * r; k++)
-		le32enc(&B[4 * k], X[k]);
 }
 
+
 /* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
    scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
  */
-static void scrypt_1024_1_1_256_sp1(const char* input, char* output, char* scratchpad)
+static void scrypt_1024_1_1_256_sp1(const uint32_t* input, uint32_t* output, uint8_t* scratchpad)
 {
-	uint8_t * B;
+	uint32_t tstate[8], ostate[8];
+	uint32_t * B;
 	uint32_t * V;
-	uint32_t * XY;
-	uint32_t i;
-
-	const uint32_t N = 1024;
-	const uint32_t r = 1;
-	const uint32_t p = 1;
 
-	B = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-	XY = (uint32_t *)(B + (128 * r * p));
-	V = (uint32_t *)(B + (128 * r * p) + (256 * r + 64));
+	B = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+	V = (uint32_t *)(B + 32);
 
-	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-	PBKDF2_SHA256((const uint8_t*)input, 80, (const uint8_t*)input, 80, 1, B, p * 128 * r);
+	PBKDF2_SHA256_80_128_init(input, tstate, ostate);
+	PBKDF2_SHA256_80_128(tstate, ostate, input, B);
 
 #ifdef HAVE_SCRYPT_SIMD_HELPERS
-	scrypt_simd_core1(B, XY);
+	scrypt_simd_core1(B, V);
 #else
-	/* 2: for i = 0 to p - 1 do */
-	for (i = 0; i < p; i++) {
-		/* 3: B_i <-- MF(B_i, N) */
-		smix(&B[i * 128 * r], r, N, V, XY);
-	}
+	scrypt_core1(B, V);
 #endif
 
-	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-	PBKDF2_SHA256((const uint8_t*)input, 80, B, p * 128 * r, 1, (uint8_t*)output, 32);
+	PBKDF2_SHA256_80_128_32(tstate, ostate, input, B, output);
 }
 
-int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+int scanhash_scrypt1(int thr_id, unsigned char *pdata, uint8_t *scratchbuf,
 	const unsigned char *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
-	unsigned char data[80];
-	unsigned char tmp_hash[32];
-	uint32_t *nonce = (uint32_t *)(data + 64 + 12);
+	uint32_t data[20];
+	uint32_t tmp_hash[32];
+	uint32_t *nonce = (uint32_t *)(data + 19);
 	uint32_t n = 0;
 	uint32_t Htarg = le32dec(ptarget + 28);
 	int i;
@@ -266,14 +172,14 @@ int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf
 	work_restart[thr_id].restart = 0;
 	
 	for (i = 0; i < 80/4; i++)
-		((uint32_t *)data)[i] = swab32(((uint32_t *)pdata)[i]);
+		data[i] = be32dec(pdata + i * 4);
 	
 	while(1) {
 		n++;
-		le32enc(nonce, n);
+		*nonce = n;
 		scrypt_1024_1_1_256_sp1(data, tmp_hash, scratchbuf);
 
-		if (le32dec(tmp_hash+28) <= Htarg) {
+		if (tmp_hash[7] <= Htarg) {
 			be32enc(pdata + 64 + 12, n);
 			*hashes_done = n;
 			return true;
@@ -290,46 +196,41 @@ int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf
 #ifdef HAVE_SCRYPT_SIMD_HELPERS
 
 static void
-scrypt_1024_1_1_256_sp2(const unsigned char * input1,
-                        unsigned char       * output1,
-                        const unsigned char * input2,
-                        unsigned char       * output2,
-                        unsigned char       * scratchpad)
+scrypt_1024_1_1_256_sp2(const uint32_t * input1,
+                        uint32_t       * output1,
+                        const uint32_t * input2,
+                        uint32_t       * output2,
+                        uint8_t        * scratchpad)
 {
-	uint8_t * B1, * B2;
-	uint8_t * V;
-
-	const uint32_t N = 1024;
-	const uint32_t r = 1;
-	const uint32_t p = 1;
+	uint32_t tstate1[8], tstate2[8], ostate1[8], ostate2[8];
+	uint32_t * B1, * B2;
+	uint32_t * V;
 
-	B1 = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-	B2 = B1 + 128;
-	V  = B2 + 128;
+	B1 = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+	B2 = B1 + 32;
+	V  = B2 + 32;
 
-	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-	PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
-	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-	PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
+	PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1);
+	PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2);
+	PBKDF2_SHA256_80_128(tstate1, ostate1, input1, B1);
+	PBKDF2_SHA256_80_128(tstate2, ostate2, input2, B2);
 
 	scrypt_simd_core2(B1, V);
 
-	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-	PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
-	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-	PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
+	PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, B1, output1);
+	PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, B2, output2);
 }
 
 int scanhash_scrypt2(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
 	const unsigned char *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
-	unsigned char data1[80];
-	unsigned char tmp_hash1[32];
-	unsigned char data2[80];
-	unsigned char tmp_hash2[32];
-	uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
-	uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+	uint32_t data1[20];
+	uint32_t tmp_hash1[8];
+	uint32_t data2[20];
+	uint32_t tmp_hash2[8];
+	uint32_t *nonce1 = (uint32_t *)(data1 + 19);
+	uint32_t *nonce2 = (uint32_t *)(data2 + 19);
 	uint32_t n = 0;
 	uint32_t Htarg = le32dec(ptarget + 28);
 	int i;
@@ -337,22 +238,22 @@ int scanhash_scrypt2(int thr_id, unsigned char *pdata, unsigned char *scratchbuf
 	work_restart[thr_id].restart = 0;
 	
 	for (i = 0; i < 80/4; i++) {
-		((uint32_t *)data1)[i] = swab32(((uint32_t *)pdata)[i]);
-		((uint32_t *)data2)[i] = swab32(((uint32_t *)pdata)[i]);
+		((uint32_t *)data1)[i] = be32dec(pdata + i * 4);
+		((uint32_t *)data2)[i] = be32dec(pdata + i * 4);
 	}
 	
 	while(1) {
-		le32enc(nonce1, n + 1);
-		le32enc(nonce2, n + 2);
+		*nonce1 = n + 1;
+		*nonce2 = n + 2;
 		scrypt_1024_1_1_256_sp2(data1, tmp_hash1, data2, tmp_hash2, scratchbuf);
 
-		if (le32dec(tmp_hash1+28) <= Htarg) {
+		if (tmp_hash1[7] <= Htarg) {
 			be32enc(pdata + 64 + 12, n + 1);
 			*hashes_done = n + 1;
 			return true;
 		}
 
-		if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+		if (tmp_hash2[7] <= Htarg && n + 2 <= max_nonce) {
 			be32enc(pdata + 64 + 12, n + 2);
 			*hashes_done = n + 2;
 			return true;
author	Siarhei Siamashka <siarhei.siamashka@gmail.com>	2012-01-01 01:47:25 +0200
committer	Siarhei Siamashka <siarhei.siamashka@gmail.com>	2012-01-01 03:27:09 +0200
commit	6d1e5bf3c2300623153479fd43c08a5874dacbbc (patch)
tree	04dfa8c2bd12c2bb56419c2fd54fc529d04896ed /scrypt.c
parent	4280aca375f60981d70826091f1ece82e7cf56c5 (diff)