aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiarhei Siamashka <siarhei.siamashka@gmail.com>2011-12-24 02:49:41 +0200
committerSiarhei Siamashka <siarhei.siamashka@gmail.com>2011-12-24 02:49:41 +0200
commit516344d822e3391cc28ce308372966f170c8c8e1 (patch)
tree0e8ccf227421f24e6bd1b708e30954433734899e
parentc4d35d2672fe7900e07a15d0ddd46bc6cb43fe44 (diff)
Initial Altivec intrinsics (simple conversion of Tarsnap SSE code)scrypt-bigendian
Speedup on Cell PPU (32-bit), single thread, 3.2GHz: 0.65 khash/sec -> 1.01 khash/sec
-rw-r--r--scrypt.c94
1 files changed, 94 insertions, 0 deletions
diff --git a/scrypt.c b/scrypt.c
index 91bc19f..a5133fa 100644
--- a/scrypt.c
+++ b/scrypt.c
@@ -466,6 +466,98 @@ PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
/*****************************************************************************/
+#ifdef __ALTIVEC__
+
+#include <altivec.h>
+#include <vec_types.h>
+
+static void
+blkcpy(void * dest, void * src, size_t len)
+{
+ vec_uint4 * D = dest;
+ vec_uint4 * S = src;
+ size_t L = len / sizeof(vec_uint4);
+ size_t i;
+
+ for (i = 0; i < L; i++)
+ D[i] = S[i];
+}
+
+static void
+blkxor(void * dest, void * src, size_t len)
+{
+ vec_uint4 * D = dest;
+ vec_uint4 * S = src;
+ size_t L = len / sizeof(vec_uint4);
+ size_t i;
+
+ for (i = 0; i < L; i++)
+ D[i] = vec_xor(D[i], S[i]);
+}
+
+/**
+ * salsa20_8(B):
+ * Apply the salsa20/8 core to the provided block.
+ */
+static void
+salsa20_8(uint32_t B32[16])
+{
+ vec_uint4 * B = (vec_uint4 *)&B32[0];
+ vec_uint4 X0, X1, X2, X3;
+ vec_uint4 T;
+ size_t i;
+
+ const vec_uchar16 perm_0x93 = { 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+ const vec_uchar16 perm_0x4E = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
+ const vec_uchar16 perm_0x39 = { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3 };
+
+ X0 = B[0];
+ X1 = B[1];
+ X2 = B[2];
+ X3 = B[3];
+
+ for (i = 0; i < 8; i += 2) {
+#define R(a,b) vec_rl(a, vec_splats((uint32_t)b))
+ /* Operate on "columns". */
+ T = vec_add(X0, X3);
+ X1 = vec_xor(X1, R(T, 7));
+ T = vec_add(X1, X0);
+ X2 = vec_xor(X2, R(T, 9));
+ T = vec_add(X2, X1);
+ X3 = vec_xor(X3, R(T, 13));
+ T = vec_add(X3, X2);
+ X0 = vec_xor(X0, R(T, 18));
+
+ /* Rearrange data. */
+ X1 = vec_perm(X1, X1, perm_0x93);
+ X2 = vec_perm(X2, X2, perm_0x4E);
+ X3 = vec_perm(X3, X3, perm_0x39);
+
+ /* Operate on "rows". */
+ T = vec_add(X0, X1);
+ X3 = vec_xor(X3, R(T, 7));
+ T = vec_add(X3, X0);
+ X2 = vec_xor(X2, R(T, 9));
+ T = vec_add(X2, X3);
+ X1 = vec_xor(X1, R(T, 13));
+ T = vec_add(X1, X2);
+ X0 = vec_xor(X0, R(T, 18));
+
+ /* Rearrange data. */
+ X1 = vec_perm(X1, X1, perm_0x39);
+ X2 = vec_perm(X2, X2, perm_0x4E);
+ X3 = vec_perm(X3, X3, perm_0x93);
+#undef R
+ }
+
+ B[0] = vec_add(B[0], X0);
+ B[1] = vec_add(B[1], X1);
+ B[2] = vec_add(B[2], X2);
+ B[3] = vec_add(B[3], X3);
+}
+
+#else
+
static void
blkcpy(void * dest, void * src, size_t len)
{
@@ -534,6 +626,8 @@ salsa20_8(uint32_t B[16])
B[i] += x[i];
}
+#endif
+
/**
* blockmix_salsa8(Bin, Bout, X, r):
* Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r