author     Siarhei Siamashka <siarhei.siamashka@gmail.com>    2012-01-01 01:47:25 +0200
committer  Siarhei Siamashka <siarhei.siamashka@gmail.com>    2012-01-01 03:27:09 +0200
commit     6d1e5bf3c2300623153479fd43c08a5874dacbbc (patch)
tree       04dfa8c2bd12c2bb56419c2fd54fc529d04896ed
parent     4280aca375f60981d70826091f1ece82e7cf56c5 (diff)
Use the SHA256 code tweaked by pooler
Because the data endianness has changed (native byte order instead of
little endian) and the SHA256 function arguments are now different,
this required lots of changes all over the place.
Improves AltiVec performance on Cell PPU from ~3.4 khash/s
to ~3.6 khash/s (two threads). Seems to have no effect on
SPU performance, though.
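
For orientation, a minimal caller-side sketch of the interface change (not part of the commit; the prototypes are inferred from the call sites in the diff below and may differ in detail from the real declarations in sha256-helpers.h):

```c
/* Sketch only: prototypes inferred from the call sites in this diff. */
#include <stdint.h>
#include <stddef.h>

void PBKDF2_SHA256(const uint8_t *passwd, size_t passwdlen,
                   const uint8_t *salt, size_t saltlen,
                   uint64_t c, uint8_t *buf, size_t dkLen);
void PBKDF2_SHA256_80_128_init(const uint32_t *input,
                               uint32_t *tstate, uint32_t *ostate);
void PBKDF2_SHA256_80_128(uint32_t *tstate, uint32_t *ostate,
                          const uint32_t *input, uint32_t *B);
void PBKDF2_SHA256_80_128_32(uint32_t *tstate, uint32_t *ostate,
                             const uint32_t *input, const uint32_t *B,
                             uint32_t *output);

/* Before: little-endian byte buffers, generic PBKDF2 for both passes. */
static void hash80_before(const uint8_t input[80], uint8_t output[32],
                          uint8_t B[128])
{
    PBKDF2_SHA256(input, 80, input, 80, 1, B, 128);
    /* ... scrypt core over B ... */
    PBKDF2_SHA256(input, 80, B, 128, 1, output, 32);
}

/* After: native-endian 32-bit words; the SHA256 midstates (tstate/ostate)
 * are computed once per input and reused by both PBKDF2 passes. */
static void hash80_after(const uint32_t input[20], uint32_t output[8],
                         uint32_t B[32])
{
    uint32_t tstate[8], ostate[8];

    PBKDF2_SHA256_80_128_init(input, tstate, ostate);
    PBKDF2_SHA256_80_128(tstate, ostate, input, B);
    /* ... scrypt core over B ... */
    PBKDF2_SHA256_80_128_32(tstate, ostate, input, B, output);
}
```

The midstate reuse is where the savings come from: both PBKDF2 passes use the same 80-byte password, so the first 64-byte block of the inner and outer HMAC-SHA256 states depends only on the password and can be hashed once.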
-rw-r--r--    scrypt-cell-spu.c        | 269
-rw-r--r--    scrypt-simd-helpers.h    |  33
-rw-r--r--    scrypt.c                 | 339
-rw-r--r--    sha256-helpers.h         | 369
4 files changed, 373 insertions, 637 deletions
diff --git a/scrypt-cell-spu.c b/scrypt-cell-spu.c
index a5a741f..50002f5 100644
--- a/scrypt-cell-spu.c
+++ b/scrypt-cell-spu.c
@@ -157,7 +157,7 @@ salsa20_8_xor4(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
 }
 
 static void
-scrypt_spu_core8(uint8_t *databuf, uint64_t scratch)
+scrypt_spu_core8(uint32_t *databuf32, uint64_t scratch)
 {
     static mfc_list_element_t dma_list[8] __attribute__((aligned(128)));
     static XY X[8] __attribute__((aligned(128)));
@@ -186,22 +186,22 @@ scrypt_spu_core8(uint8_t *databuf, uint64_t scratch)
 
     /* 1: X <-- B */
     for (i = 0; i < 16; i++) {
-        XA->w[i] = le32dec(&databuf[0 * 128 + (i * 5 % 16) * 4]);
-        XA->w[16 + i] = le32dec(&databuf[0 * 128 + (16 + (i * 5 % 16)) * 4]);
-        XB->w[i] = le32dec(&databuf[1 * 128 + (i * 5 % 16) * 4]);
-        XB->w[16 + i] = le32dec(&databuf[1 * 128 + (16 + (i * 5 % 16)) * 4]);
-        XC->w[i] = le32dec(&databuf[2 * 128 + (i * 5 % 16) * 4]);
-        XC->w[16 + i] = le32dec(&databuf[2 * 128 + (16 + (i * 5 % 16)) * 4]);
-        XD->w[i] = le32dec(&databuf[3 * 128 + (i * 5 % 16) * 4]);
-        XD->w[16 + i] = le32dec(&databuf[3 * 128 + (16 + (i * 5 % 16)) * 4]);
-        XE->w[i] = le32dec(&databuf[4 * 128 + (i * 5 % 16) * 4]);
-        XE->w[16 + i] = le32dec(&databuf[4 * 128 + (16 + (i * 5 % 16)) * 4]);
-        XF->w[i] = le32dec(&databuf[5 * 128 + (i * 5 % 16) * 4]);
-        XF->w[16 + i] = le32dec(&databuf[5 * 128 + (16 + (i * 5 % 16)) * 4]);
-        XG->w[i] = le32dec(&databuf[6 * 128 + (i * 5 % 16) * 4]);
-        XG->w[16 + i] = le32dec(&databuf[6 * 128 + (16 + (i * 5 % 16)) * 4]);
-        XH->w[i] = le32dec(&databuf[7 * 128 + (i * 5 % 16) * 4]);
-        XH->w[16 + i] = le32dec(&databuf[7 * 128 + (16 + (i * 5 % 16)) * 4]);
+        XA->w[i] = databuf32[0 * 32 + (i * 5 % 16)];
+        XA->w[16 + i] = databuf32[0 * 32 + (16 + (i * 5 % 16))];
+        XB->w[i] = databuf32[1 * 32 + (i * 5 % 16)];
+        XB->w[16 + i] = databuf32[1 * 32 + (16 + (i * 5 % 16))];
+        XC->w[i] = databuf32[2 * 32 + (i * 5 % 16)];
+        XC->w[16 + i] = databuf32[2 * 32 + (16 + (i * 5 % 16))];
+        XD->w[i] = databuf32[3 * 32 + (i * 5 % 16)];
+        XD->w[16 + i] = databuf32[3 * 32 + (16 + (i * 5 % 16))];
+        XE->w[i] = databuf32[4 * 32 + (i * 5 % 16)];
+        XE->w[16 + i] = databuf32[4 * 32 + (16 + (i * 5 % 16))];
+        XF->w[i] = databuf32[5 * 32 + (i * 5 % 16)];
+        XF->w[16 + i] = databuf32[5 * 32 + (16 + (i * 5 % 16))];
+        XG->w[i] = databuf32[6 * 32 + (i * 5 % 16)];
+        XG->w[16 + i] = databuf32[6 * 32 + (16 + (i * 5 % 16))];
+        XH->w[i] = databuf32[7 * 32 + (i * 5 % 16)];
+        XH->w[16 + i] = databuf32[7 * 32 + (16 + (i * 5 % 16))];
     }
     for (i = 0; i < 8; i++)
         dma_list[i].size = 128;
@@ -280,94 +280,89 @@ scrypt_spu_core8(uint8_t *databuf, uint64_t scratch)
 
     /* 10: B' <-- X */
     for (i = 0; i < 16; i++) {
-        le32enc(&databuf[0 * 128 + (i * 5 % 16) * 4], XA->w[i]);
-        le32enc(&databuf[0 * 128 + (16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
-        le32enc(&databuf[1 * 128 + (i * 5 % 16) * 4], XB->w[i]);
-        le32enc(&databuf[1 * 128 + (16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
-        le32enc(&databuf[2 * 128 + (i * 5 % 16) * 4], XC->w[i]);
-        le32enc(&databuf[2 * 128 + (16 + (i * 5 % 16)) * 4], XC->w[16 + i]);
-        le32enc(&databuf[3 * 128 + (i * 5 % 16) * 4], XD->w[i]);
-        le32enc(&databuf[3 * 128 + (16 + (i * 5 % 16)) * 4], XD->w[16 + i]);
-        le32enc(&databuf[4 * 128 + (i * 5 % 16) * 4], XE->w[i]);
-        le32enc(&databuf[4 * 128 + (16 + (i * 5 % 16)) * 4], XE->w[16 + i]);
-        le32enc(&databuf[5 * 128 + (i * 5 % 16) * 4], XF->w[i]);
-        le32enc(&databuf[5 * 128 + (16 + (i * 5 % 16)) * 4], XF->w[16 + i]);
-        le32enc(&databuf[6 * 128 + (i * 5 % 16) * 4], XG->w[i]);
-        le32enc(&databuf[6 * 128 + (16 + (i * 5 % 16)) * 4], XG->w[16 + i]);
-        le32enc(&databuf[7 * 128 + (i * 5 % 16) * 4], XH->w[i]);
-        le32enc(&databuf[7 * 128 + (16 + (i * 5 % 16)) * 4], XH->w[16 + i]);
+        databuf32[0 * 32 + (i * 5 % 16)] = XA->w[i];
+        databuf32[0 * 32 + (16 + (i * 5 % 16))] = XA->w[16 + i];
+        databuf32[1 * 32 + (i * 5 % 16)] = XB->w[i];
+        databuf32[1 * 32 + (16 + (i * 5 % 16))] = XB->w[16 + i];
+        databuf32[2 * 32 + (i * 5 % 16)] = XC->w[i];
+        databuf32[2 * 32 + (16 + (i * 5 % 16))] = XC->w[16 + i];
+        databuf32[3 * 32 + (i * 5 % 16)] = XD->w[i];
+        databuf32[3 * 32 + (16 + (i * 5 % 16))] = XD->w[16 + i];
+        databuf32[4 * 32 + (i * 5 % 16)] = XE->w[i];
+        databuf32[4 * 32 + (16 + (i * 5 % 16))] = XE->w[16 + i];
+        databuf32[5 * 32 + (i * 5 % 16)] = XF->w[i];
+        databuf32[5 * 32 + (16 + (i * 5 % 16))] = XF->w[16 + i];
+        databuf32[6 * 32 + (i * 5 % 16)] = XG->w[i];
+        databuf32[6 * 32 + (16 + (i * 5 % 16))] = XG->w[16 + i];
+        databuf32[7 * 32 + (i * 5 % 16)] = XH->w[i];
+        databuf32[7 * 32 + (16 + (i * 5 % 16))] = XH->w[16 + i];
     }
 }
 
 static void
-scrypt_1024_1_1_256_sp8(const unsigned char * input1,
-                        unsigned char * output1,
-                        const unsigned char * input2,
-                        unsigned char * output2,
-                        const unsigned char * input3,
-                        unsigned char * output3,
-                        const unsigned char * input4,
-                        unsigned char * output4,
-                        const unsigned char * input5,
-                        unsigned char * output5,
-                        const unsigned char * input6,
-                        unsigned char * output6,
-                        const unsigned char * input7,
-                        unsigned char * output7,
-                        const unsigned char * input8,
-                        unsigned char * output8,
+scrypt_1024_1_1_256_sp8(const uint32_t * input1,
+                        uint32_t * output1,
+                        const uint32_t * input2,
+                        uint32_t * output2,
+                        const uint32_t * input3,
+                        uint32_t * output3,
+                        const uint32_t * input4,
+                        uint32_t * output4,
+                        const uint32_t * input5,
+                        uint32_t * output5,
+                        const uint32_t * input6,
+                        uint32_t * output6,
+                        const uint32_t * input7,
+                        uint32_t * output7,
+                        const uint32_t * input8,
+                        uint32_t * output8,
                         uint64_t scratchpad)
 {
-    static uint8_t databuf[128 * 8] __attribute__((aligned(128)));
-    uint8_t * B1, * B2, * B3, * B4, * B5, * B6, * B7, * B8;
+    uint32_t tstate1[8], tstate2[8], tstate3[8], tstate4[8];
+    uint32_t tstate5[8], tstate6[8], tstate7[8], tstate8[8];
-    const uint32_t r = 1;
-    const uint32_t p = 1;
+    uint32_t ostate1[8], ostate2[8], ostate3[8], ostate4[8];
+    uint32_t ostate5[8], ostate6[8], ostate7[8], ostate8[8];
+
+    static uint32_t databuf[32 * 8] __attribute__((aligned(128)));
+    uint32_t * B1, * B2, * B3, * B4, * B5, * B6, * B7, * B8;
 
     B1 = databuf;
-    B2 = databuf + 128 * 1;
-    B3 = databuf + 128 * 2;
-    B4 = databuf + 128 * 3;
-    B5 = databuf + 128 * 4;
-    B6 = databuf + 128 * 5;
-    B7 = databuf + 128 * 6;
-    B8 = databuf + 128 * 7;
-
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input3, 80, (const uint8_t*)input3, 80, 1, B3, p * 128 * r);
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input4, 80, (const uint8_t*)input4, 80, 1, B4, p * 128 * r);
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input5, 80, (const uint8_t*)input5, 80, 1, B5, p * 128 * r);
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input6, 80, (const uint8_t*)input6, 80, 1, B6, p * 128 * r);
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input7, 80, (const uint8_t*)input7, 80, 1, B7, p * 128 * r);
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input8, 80, (const uint8_t*)input8, 80, 1, B8, p * 128 * r);
+    B2 = databuf + 32 * 1;
+    B3 = databuf + 32 * 2;
+    B4 = databuf + 32 * 3;
+    B5 = databuf + 32 * 4;
+    B6 = databuf + 32 * 5;
+    B7 = databuf + 32 * 6;
+    B8 = databuf + 32 * 7;
+
+    PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1);
+    PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2);
+    PBKDF2_SHA256_80_128_init(input3, tstate3, ostate3);
+    PBKDF2_SHA256_80_128_init(input4, tstate4, ostate4);
+    PBKDF2_SHA256_80_128_init(input5, tstate5, ostate5);
+    PBKDF2_SHA256_80_128_init(input6, tstate6, ostate6);
+    PBKDF2_SHA256_80_128_init(input7, tstate7, ostate7);
+    PBKDF2_SHA256_80_128_init(input8, tstate8, ostate8);
+    PBKDF2_SHA256_80_128(tstate1, ostate1, input1, B1);
+    PBKDF2_SHA256_80_128(tstate2, ostate2, input2, B2);
+    PBKDF2_SHA256_80_128(tstate3, ostate3, input3, B3);
+    PBKDF2_SHA256_80_128(tstate4, ostate4, input4, B4);
+    PBKDF2_SHA256_80_128(tstate5, ostate5, input5, B5);
+    PBKDF2_SHA256_80_128(tstate6, ostate6, input6, B6);
+    PBKDF2_SHA256_80_128(tstate7, ostate7, input7, B7);
+    PBKDF2_SHA256_80_128(tstate8, ostate8, input8, B8);
 
     scrypt_spu_core8(databuf, scratchpad);
 
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input3, 80, B3, p * 128 * r, 1, (uint8_t*)output3, 32);
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input4, 80, B4, p * 128 * r, 1, (uint8_t*)output4, 32);
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input5, 80, B5, p * 128 * r, 1, (uint8_t*)output5, 32);
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input6, 80, B6, p * 128 * r, 1, (uint8_t*)output6, 32);
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input7, 80, B7, p * 128 * r, 1, (uint8_t*)output7, 32);
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input8, 80, B8, p * 128 * r, 1, (uint8_t*)output8, 32);
+    PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, B1, output1);
+    PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, B2, output2);
+    PBKDF2_SHA256_80_128_32(tstate3, ostate3, input3, B3, output3);
+    PBKDF2_SHA256_80_128_32(tstate4, ostate4, input4, B4, output4);
+    PBKDF2_SHA256_80_128_32(tstate5, ostate5, input5, B5, output5);
+    PBKDF2_SHA256_80_128_32(tstate6, ostate6, input6, B6, output6);
+    PBKDF2_SHA256_80_128_32(tstate7, ostate7, input7, B7, output7);
+    PBKDF2_SHA256_80_128_32(tstate8, ostate8, input8, B8, output8);
 }
 
 static int
@@ -375,30 +370,22 @@ scanhash_scrypt(uint64_t work_restart_ptr, unsigned char *pdata,
         uint64_t scratchbuf, const unsigned char *ptarget,
         uint32_t max_nonce, uint32_t *hashes_done)
 {
-    unsigned char data1[80];
-    unsigned char tmp_hash1[32];
-    unsigned char data2[80];
-    unsigned char tmp_hash2[32];
-    unsigned char data3[80];
-    unsigned char tmp_hash3[32];
-    unsigned char data4[80];
-    unsigned char tmp_hash4[32];
-    unsigned char data5[80];
-    unsigned char tmp_hash5[32];
-    unsigned char data6[80];
-    unsigned char tmp_hash6[32];
-    unsigned char data7[80];
-    unsigned char tmp_hash7[32];
-    unsigned char data8[80];
-    unsigned char tmp_hash8[32];
-    uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
-    uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
-    uint32_t *nonce3 = (uint32_t *)(data3 + 64 + 12);
-    uint32_t *nonce4 = (uint32_t *)(data4 + 64 + 12);
-    uint32_t *nonce5 = (uint32_t *)(data5 + 64 + 12);
-    uint32_t *nonce6 = (uint32_t *)(data6 + 64 + 12);
-    uint32_t *nonce7 = (uint32_t *)(data7 + 64 + 12);
-    uint32_t *nonce8 = (uint32_t *)(data8 + 64 + 12);
+    uint32_t data1[20], tmp_hash1[8];
+    uint32_t data2[20], tmp_hash2[8];
+    uint32_t data3[20], tmp_hash3[8];
+    uint32_t data4[20], tmp_hash4[8];
+    uint32_t data5[20], tmp_hash5[8];
+    uint32_t data6[20], tmp_hash6[8];
+    uint32_t data7[20], tmp_hash7[8];
+    uint32_t data8[20], tmp_hash8[8];
+    uint32_t *nonce1 = &data1[19];
+    uint32_t *nonce2 = &data2[19];
+    uint32_t *nonce3 = &data3[19];
+    uint32_t *nonce4 = &data4[19];
+    uint32_t *nonce5 = &data5[19];
+    uint32_t *nonce6 = &data6[19];
+    uint32_t *nonce7 = &data7[19];
+    uint32_t *nonce8 = &data8[19];
     uint32_t n = 0;
    	uint32_t Htarg = le32dec(ptarget + 28);
     int i;
@@ -406,77 +393,77 @@ scanhash_scrypt(uint64_t work_restart_ptr, unsigned char *pdata,
     int work_restart = 0;
 
     for (i = 0; i < 80/4; i++) {
-        ((uint32_t *)data1)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
-        ((uint32_t *)data2)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
-        ((uint32_t *)data3)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
-        ((uint32_t *)data4)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
-        ((uint32_t *)data5)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
-        ((uint32_t *)data6)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
-        ((uint32_t *)data7)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
-        ((uint32_t *)data8)[i] = __builtin_bswap32(((uint32_t *)pdata)[i]);
+        data1[i] = be32dec(&((uint32_t *)pdata)[i]);
+        data2[i] = be32dec(&((uint32_t *)pdata)[i]);
+        data3[i] = be32dec(&((uint32_t *)pdata)[i]);
+        data4[i] = be32dec(&((uint32_t *)pdata)[i]);
+        data5[i] = be32dec(&((uint32_t *)pdata)[i]);
+        data6[i] = be32dec(&((uint32_t *)pdata)[i]);
+        data7[i] = be32dec(&((uint32_t *)pdata)[i]);
+        data8[i] = be32dec(&((uint32_t *)pdata)[i]);
     }
 
     while(1) {
         /* request 'work_restart[thr_id].restart' from external memory */
         mfc_get(&work_restart, work_restart_ptr, 4, tag3, 0, 0);
 
-        le32enc(nonce1, n + 1);
-        le32enc(nonce2, n + 2);
-        le32enc(nonce3, n + 3);
-        le32enc(nonce4, n + 4);
-        le32enc(nonce5, n + 5);
-        le32enc(nonce6, n + 6);
-        le32enc(nonce7, n + 7);
-        le32enc(nonce8, n + 8);
+        *nonce1 = n + 1;
+        *nonce2 = n + 2;
+        *nonce3 = n + 3;
+        *nonce4 = n + 4;
+        *nonce5 = n + 5;
+        *nonce6 = n + 6;
+        *nonce7 = n + 7;
+        *nonce8 = n + 8;
 
         scrypt_1024_1_1_256_sp8(data1, tmp_hash1, data2, tmp_hash2,
                                 data3, tmp_hash3, data4, tmp_hash4,
                                 data5, tmp_hash5, data6, tmp_hash6,
                                 data7, tmp_hash7, data8, tmp_hash8,
                                 scratchbuf);
 
-        if (le32dec(tmp_hash1+28) <= Htarg) {
+        if (tmp_hash1[7] <= Htarg) {
             be32enc(pdata + 64 + 12, n + 1);
             *hashes_done = n;
             return true;
         }
-        if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+        if (tmp_hash2[7] <= Htarg && n + 2 <= max_nonce) {
            	be32enc(pdata + 64 + 12, n + 2);
            	*hashes_done = n + 2;
            	return true;
        	}
-        if (le32dec(tmp_hash3+28) <= Htarg && n + 3 <= max_nonce) {
+        if (tmp_hash3[7] <= Htarg && n + 3 <= max_nonce) {
            	be32enc(pdata + 64 + 12, n + 3);
            	*hashes_done = n + 3;
            	return true;
        	}
-        if (le32dec(tmp_hash4+28) <= Htarg && n + 4 <= max_nonce) {
+        if (tmp_hash4[7] <= Htarg && n + 4 <= max_nonce) {
            	be32enc(pdata + 64 + 12, n + 4);
            	*hashes_done = n + 4;
            	return true;
        	}
-        if (le32dec(tmp_hash5+28) <= Htarg && n + 5 <= max_nonce) {
+        if (tmp_hash5[7] <= Htarg && n + 5 <= max_nonce) {
            	be32enc(pdata + 64 + 12, n + 5);
            	*hashes_done = n + 5;
            	return true;
       	}
-        if (le32dec(tmp_hash6+28) <= Htarg && n + 6 <= max_nonce) {
+        if (tmp_hash6[7] <= Htarg && n + 6 <= max_nonce) {
            	be32enc(pdata + 64 + 12, n + 6);
            	*hashes_done = n + 6;
            	return true;
       	}
-        if (le32dec(tmp_hash7+28) <= Htarg && n + 7 <= max_nonce) {
+        if (tmp_hash7[7] <= Htarg && n + 7 <= max_nonce) {
            	be32enc(pdata + 64 + 12, n + 7);
            	*hashes_done = n + 7;
            	return true;
       	}
-        if (le32dec(tmp_hash8+28) <= Htarg && n + 8 <= max_nonce) {
+        if (tmp_hash8[7] <= Htarg && n + 8 <= max_nonce) {
            	be32enc(pdata + 64 + 12, n + 8);
           	*hashes_done = n + 8;
           	return true;
diff --git a/scrypt-simd-helpers.h b/scrypt-simd-helpers.h
index 322d718..8dd5681 100644
--- a/scrypt-simd-helpers.h
+++ b/scrypt-simd-helpers.h
@@ -270,16 +270,17 @@ typedef union { uint32x4 q[8]; uint32_t w[32]; } XY;
  * All buffers must be aligned at 64 byte boundary.
  */
 static inline
-void scrypt_simd_core1(uint8_t databuf[128], void * scratch)
+void scrypt_simd_core1(uint32_t databuf[32], void * scratch)
 {
+    uint32_t * databufA = (uint32_t *)&databuf[0];
     XY * X = (XY *)((uintptr_t)scratch + 0);
     uint32x4 * V = (uint32x4 *)((uintptr_t)scratch + 128);
     int i, j;
 
     /* 1: X <-- B */
     for (i = 0; i < 16; i++) {
-        X->w[i] = le32dec(&databuf[(i * 5 % 16) * 4]);
-        X->w[16 + i] = le32dec(&databuf[(16 + (i * 5 % 16)) * 4]);
+        X->w[i] = databufA[i * 5 % 16];
+        X->w[16 + i] = databufA[16 + (i * 5 % 16)];
     }
 
     /* 2: for i = 0 to N - 1 do */
@@ -299,8 +300,8 @@ void scrypt_simd_core1(uint8_t databuf[128], void * scratch)
 
     /* 10: B' <-- X */
     for (i = 0; i < 16; i++) {
-        le32enc(&databuf[(i * 5 % 16) * 4], X->w[i]);
-        le32enc(&databuf[(16 + (i * 5 % 16)) * 4], X->w[16 + i]);
+        databufA[i * 5 % 16] = X->w[i];
+        databufA[16 + (i * 5 % 16)] = X->w[16 + i];
     }
 }
 
@@ -317,10 +318,10 @@ void scrypt_simd_core1(uint8_t databuf[128], void * scratch)
  * All buffers must be aligned at 64 byte boundary.
  */
 static inline
-void scrypt_simd_core2(uint8_t databuf[2 * 128], void * scratch)
+void scrypt_simd_core2(uint32_t databuf[2 * 32], void * scratch)
 {
-    uint8_t * databufA = &databuf[0];
-    uint8_t * databufB = &databuf[128];
+    uint32_t * databufA = (uint32_t *)&databuf[0];
+    uint32_t * databufB = (uint32_t *)&databuf[32];
     XY * XA = (XY *)((uintptr_t)scratch);
     XY * XB = (XY *)((uintptr_t)scratch + 128 + 128 * 1024);
     uint32x4 * VA = (uint32x4 *)((uintptr_t)XA + 128);
@@ -329,10 +330,10 @@ void scrypt_simd_core2(uint8_t databuf[2 * 128], void * scratch)
 
     /* 1: X <-- B */
     for (i = 0; i < 16; i++) {
-        XA->w[i] = le32dec(&databufA[(i * 5 % 16) * 4]);
-        XA->w[16 + i] = le32dec(&databufA[(16 + (i * 5 % 16)) * 4]);
-        XB->w[i] = le32dec(&databufB[(i * 5 % 16) * 4]);
-        XB->w[16 + i] = le32dec(&databufB[(16 + (i * 5 % 16)) * 4]);
+        XA->w[i] = databufA[i * 5 % 16];
+        XA->w[16 + i] = databufA[16 + (i * 5 % 16)];
+        XB->w[i] = databufB[i * 5 % 16];
+        XB->w[16 + i] = databufB[16 + (i * 5 % 16)];
     }
 
     /* 2: for i = 0 to N - 1 do */
@@ -355,10 +356,10 @@ void scrypt_simd_core2(uint8_t databuf[2 * 128], void * scratch)
 
     /* 10: B' <-- X */
     for (i = 0; i < 16; i++) {
-        le32enc(&databufA[(i * 5 % 16) * 4], XA->w[i]);
-        le32enc(&databufA[(16 + (i * 5 % 16)) * 4], XA->w[16 + i]);
-        le32enc(&databufB[(i * 5 % 16) * 4], XB->w[i]);
-        le32enc(&databufB[(16 + (i * 5 % 16)) * 4], XB->w[16 + i]);
+        databufA[i * 5 % 16] = XA->w[i];
+        databufA[16 + (i * 5 % 16)] = XA->w[16 + i];
+        databufB[i * 5 % 16] = XB->w[i];
+        databufB[16 + (i * 5 % 16)] = XB->w[16 + i];
     }
 }
diff --git a/scrypt.c b/scrypt.c
--- a/scrypt.c
+++ b/scrypt.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright 2009 Colin Percival, 2011 ArtForz
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler
  * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -37,228 +37,134 @@
 #include "sha256-helpers.h"
 #include "scrypt-simd-helpers.h"
 
-static void blkcpy(void *, void *, size_t);
-static void blkxor(void *, void *, size_t);
-static void salsa20_8(uint32_t[16]);
-static void blockmix_salsa8(uint32_t *, uint32_t *, uint32_t *, size_t);
-static uint64_t integerify(void *, size_t);
-static void smix(uint8_t *, size_t, uint64_t, uint32_t *, uint32_t *);
-
-static void
-blkcpy(void * dest, void * src, size_t len)
-{
-    size_t * D = dest;
-    size_t * S = src;
-    size_t L = len / sizeof(size_t);
-    size_t i;
-
-    for (i = 0; i < L; i++)
-        D[i] = S[i];
-}
-
-static void
-blkxor(void * dest, void * src, size_t len)
-{
-    size_t * D = dest;
-    size_t * S = src;
-    size_t L = len / sizeof(size_t);
-    size_t i;
-
-    for (i = 0; i < L; i++)
-        D[i] ^= S[i];
-}
-
 /**
  * salsa20_8(B):
  * Apply the salsa20/8 core to the provided block.
  */
-static void
-salsa20_8(uint32_t B[16])
+static inline void
+salsa20_8(uint32_t B[16], const uint32_t Bx[16])
 {
-    uint32_t x[16];
+    uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
     size_t i;
 
-    blkcpy(x, B, 64);
+    x00 = (B[ 0] ^= Bx[ 0]);
+    x01 = (B[ 1] ^= Bx[ 1]);
+    x02 = (B[ 2] ^= Bx[ 2]);
+    x03 = (B[ 3] ^= Bx[ 3]);
+    x04 = (B[ 4] ^= Bx[ 4]);
+    x05 = (B[ 5] ^= Bx[ 5]);
+    x06 = (B[ 6] ^= Bx[ 6]);
+    x07 = (B[ 7] ^= Bx[ 7]);
+    x08 = (B[ 8] ^= Bx[ 8]);
+    x09 = (B[ 9] ^= Bx[ 9]);
+    x10 = (B[10] ^= Bx[10]);
+    x11 = (B[11] ^= Bx[11]);
+    x12 = (B[12] ^= Bx[12]);
+    x13 = (B[13] ^= Bx[13]);
+    x14 = (B[14] ^= Bx[14]);
+    x15 = (B[15] ^= Bx[15]);
     for (i = 0; i < 8; i += 2) {
 #define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
         /* Operate on columns. */
-        x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
-        x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
-
-        x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
-        x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
-
-        x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
-        x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
-
-        x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
-        x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
+        x04 ^= R(x00+x12, 7);  x09 ^= R(x05+x01, 7);  x14 ^= R(x10+x06, 7);  x03 ^= R(x15+x11, 7);
+        x08 ^= R(x04+x00, 9);  x13 ^= R(x09+x05, 9);  x02 ^= R(x14+x10, 9);  x07 ^= R(x03+x15, 9);
+        x12 ^= R(x08+x04,13);  x01 ^= R(x13+x09,13);  x06 ^= R(x02+x14,13);  x11 ^= R(x07+x03,13);
+        x00 ^= R(x12+x08,18);  x05 ^= R(x01+x13,18);  x10 ^= R(x06+x02,18);  x15 ^= R(x11+x07,18);
 
         /* Operate on rows. */
-        x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
-        x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
-
-        x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
-        x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
-
-        x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
-        x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
-
-        x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
-        x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
+        x01 ^= R(x00+x03, 7);  x06 ^= R(x05+x04, 7);  x11 ^= R(x10+x09, 7);  x12 ^= R(x15+x14, 7);
+        x02 ^= R(x01+x00, 9);  x07 ^= R(x06+x05, 9);  x08 ^= R(x11+x10, 9);  x13 ^= R(x12+x15, 9);
+        x03 ^= R(x02+x01,13);  x04 ^= R(x07+x06,13);  x09 ^= R(x08+x11,13);  x14 ^= R(x13+x12,13);
+        x00 ^= R(x03+x02,18);  x05 ^= R(x04+x07,18);  x10 ^= R(x09+x08,18);  x15 ^= R(x14+x13,18);
#undef R
     }
-    for (i = 0; i < 16; i++)
-        B[i] += x[i];
+    B[ 0] += x00;
+    B[ 1] += x01;
+    B[ 2] += x02;
+    B[ 3] += x03;
+    B[ 4] += x04;
+    B[ 5] += x05;
+    B[ 6] += x06;
+    B[ 7] += x07;
+    B[ 8] += x08;
+    B[ 9] += x09;
+    B[10] += x10;
+    B[11] += x11;
+    B[12] += x12;
+    B[13] += x13;
+    B[14] += x14;
+    B[15] += x15;
 }
 
-/**
- * blockmix_salsa8(Bin, Bout, X, r):
- * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
- * bytes in length; the output Bout must also be the same size.  The
- * temporary space X must be 64 bytes.
- */
-static void
-blockmix_salsa8(uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r)
+static inline void scrypt_core1(uint32_t *X, uint32_t *V)
 {
-    size_t i;
-
-    /* 1: X <-- B_{2r - 1} */
-    blkcpy(X, &Bin[(2 * r - 1) * 16], 64);
-
-    /* 2: for i = 0 to 2r - 1 do */
-    for (i = 0; i < 2 * r; i += 2) {
-        /* 3: X <-- H(X \xor B_i) */
-        blkxor(X, &Bin[i * 16], 64);
-        salsa20_8(X);
+    uint32_t i;
+    uint32_t j;
+    uint32_t k;
+    uint64_t *p1, *p2;
+    p1 = (uint64_t *)X;
+    for (i = 0; i < 1024; i += 2) {
+        memcpy(&V[i * 32], X, 128);
 
-        /* 4: Y_i <-- X */
-        /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-        blkcpy(&Bout[i * 8], X, 64);
+        salsa20_8(&X[0], &X[16]);
+        salsa20_8(&X[16], &X[0]);
 
-        /* 3: X <-- H(X \xor B_i) */
-        blkxor(X, &Bin[i * 16 + 16], 64);
-        salsa20_8(X);
+        memcpy(&V[(i + 1) * 32], X, 128);
 
-        /* 4: Y_i <-- X */
-        /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-        blkcpy(&Bout[i * 8 + r * 16], X, 64);
+        salsa20_8(&X[0], &X[16]);
+        salsa20_8(&X[16], &X[0]);
     }
-}
-
-/**
- * integerify(B, r):
- * Return the result of parsing B_{2r-1} as a little-endian integer.
- */
-static uint64_t
-integerify(void * B, size_t r)
-{
-    uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
-
-    return (((uint64_t)(X[1]) << 32) + X[0]);
-}
-
-/**
- * smix(B, r, N, V, XY):
- * Compute B = SMix_r(B, N).  The input B must be 128r bytes in length;
- * the temporary storage V must be 128rN bytes in length; the temporary
- * storage XY must be 256r + 64 bytes in length.  The value N must be a
- * power of 2 greater than 1.  The arrays B, V, and XY must be aligned to a
- * multiple of 64 bytes.
- */
-static void
-smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
-{
-    uint32_t * X = XY;
-    uint32_t * Y = &XY[32 * r];
-    uint32_t * Z = &XY[64 * r];
-    uint64_t i;
-    uint64_t j;
-    size_t k;
-
-    /* 1: X <-- B */
-    for (k = 0; k < 32 * r; k++)
-        X[k] = le32dec(&B[4 * k]);
-
-    /* 2: for i = 0 to N - 1 do */
-    for (i = 0; i < N; i += 2) {
-        /* 3: V_i <-- X */
-        blkcpy(&V[i * (32 * r)], X, 128 * r);
-
-        /* 4: X <-- H(X) */
-        blockmix_salsa8(X, Y, Z, r);
-
-        /* 3: V_i <-- X */
-        blkcpy(&V[(i + 1) * (32 * r)], Y, 128 * r);
-
-        /* 4: X <-- H(X) */
-        blockmix_salsa8(Y, X, Z, r);
+    for (i = 0; i < 1024; i += 2) {
+        j = X[16] & 1023;
+        p2 = (uint64_t *)(&V[j * 32]);
+        for(k = 0; k < 16; k++)
+            p1[k] ^= p2[k];
+
+        salsa20_8(&X[0], &X[16]);
+        salsa20_8(&X[16], &X[0]);
+
+        j = X[16] & 1023;
+        p2 = (uint64_t *)(&V[j * 32]);
+        for(k = 0; k < 16; k++)
+            p1[k] ^= p2[k];
+
+        salsa20_8(&X[0], &X[16]);
+        salsa20_8(&X[16], &X[0]);
     }
-
-    /* 6: for i = 0 to N - 1 do */
-    for (i = 0; i < N; i += 2) {
-        /* 7: j <-- Integerify(X) mod N */
-        j = integerify(X, r) & (N - 1);
-
-        /* 8: X <-- H(X \xor V_j) */
-        blkxor(X, &V[j * (32 * r)], 128 * r);
-        blockmix_salsa8(X, Y, Z, r);
-
-        /* 7: j <-- Integerify(X) mod N */
-        j = integerify(Y, r) & (N - 1);
-
-        /* 8: X <-- H(X \xor V_j) */
-        blkxor(Y, &V[j * (32 * r)], 128 * r);
-        blockmix_salsa8(Y, X, Z, r);
-    }
-
-    /* 10: B' <-- X */
-    for (k = 0; k < 32 * r; k++)
-        le32enc(&B[4 * k], X[k]);
 }
 
+
 /* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
    scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
  */
-static void scrypt_1024_1_1_256_sp1(const char* input, char* output, char* scratchpad)
+static void scrypt_1024_1_1_256_sp1(const uint32_t* input, uint32_t* output, uint8_t* scratchpad)
 {
-    uint8_t * B;
+    uint32_t tstate[8], ostate[8];
+    uint32_t * B;
     uint32_t * V;
-    uint32_t * XY;
-    uint32_t i;
-
-    const uint32_t N = 1024;
-    const uint32_t r = 1;
-    const uint32_t p = 1;
 
-    B = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-    XY = (uint32_t *)(B + (128 * r * p));
-    V = (uint32_t *)(B + (128 * r * p) + (256 * r + 64));
+    B = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+    V = (uint32_t *)(B + 32);
 
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input, 80, (const uint8_t*)input, 80, 1, B, p * 128 * r);
+    PBKDF2_SHA256_80_128_init(input, tstate, ostate);
+    PBKDF2_SHA256_80_128(tstate, ostate, input, B);
 
 #ifdef HAVE_SCRYPT_SIMD_HELPERS
-    scrypt_simd_core1(B, XY);
+    scrypt_simd_core1(B, V);
 #else
-    /* 2: for i = 0 to p - 1 do */
-    for (i = 0; i < p; i++) {
-        /* 3: B_i <-- MF(B_i, N) */
-        smix(&B[i * 128 * r], r, N, V, XY);
-    }
+    scrypt_core1(B, V);
 #endif
 
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input, 80, B, p * 128 * r, 1, (uint8_t*)output, 32);
+    PBKDF2_SHA256_80_128_32(tstate, ostate, input, B, output);
 }
 
-int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+int scanhash_scrypt1(int thr_id, unsigned char *pdata, uint8_t *scratchbuf,
     const unsigned char *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
 {
-    unsigned char data[80];
-    unsigned char tmp_hash[32];
-    uint32_t *nonce = (uint32_t *)(data + 64 + 12);
+    uint32_t data[20];
+    uint32_t tmp_hash[32];
+    uint32_t *nonce = (uint32_t *)(data + 19);
     uint32_t n = 0;
     uint32_t Htarg = le32dec(ptarget + 28);
     int i;
@@ -266,14 +172,14 @@ int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf
     work_restart[thr_id].restart = 0;
 
     for (i = 0; i < 80/4; i++)
-        ((uint32_t *)data)[i] = swab32(((uint32_t *)pdata)[i]);
+        data[i] = be32dec(pdata + i * 4);
 
     while(1) {
         n++;
-        le32enc(nonce, n);
+        *nonce = n;
         scrypt_1024_1_1_256_sp1(data, tmp_hash, scratchbuf);
 
-        if (le32dec(tmp_hash+28) <= Htarg) {
+        if (tmp_hash[7] <= Htarg) {
            	be32enc(pdata + 64 + 12, n);
            	*hashes_done = n;
            	return true;
@@ -290,46 +196,41 @@ int scanhash_scrypt1(int thr_id, unsigned char *pdata, unsigned char *scratchbuf
 
 #ifdef HAVE_SCRYPT_SIMD_HELPERS
 static void
-scrypt_1024_1_1_256_sp2(const unsigned char * input1,
-                        unsigned char * output1,
-                        const unsigned char * input2,
-                        unsigned char * output2,
-                        unsigned char * scratchpad)
+scrypt_1024_1_1_256_sp2(const uint32_t * input1,
+                        uint32_t * output1,
+                        const uint32_t * input2,
+                        uint32_t * output2,
+                        uint8_t * scratchpad)
 {
-    uint8_t * B1, * B2;
-    uint8_t * V;
-
-    const uint32_t N = 1024;
-    const uint32_t r = 1;
-    const uint32_t p = 1;
+    uint32_t tstate1[8], tstate2[8], ostate1[8], ostate2[8];
+    uint32_t * B1, * B2;
+    uint32_t * V;
 
-    B1 = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-    B2 = B1 + 128;
-    V = B2 + 128;
+    B1 = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+    B2 = B1 + 32;
+    V = B2 + 32;
 
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input1, 80, (const uint8_t*)input1, 80, 1, B1, p * 128 * r);
-    /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
-    PBKDF2_SHA256((const uint8_t*)input2, 80, (const uint8_t*)input2, 80, 1, B2, p * 128 * r);
+    PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1);
+    PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2);
+    PBKDF2_SHA256_80_128(tstate1, ostate1, input1, B1);
+    PBKDF2_SHA256_80_128(tstate2, ostate2, input2, B2);
 
     scrypt_simd_core2(B1, V);
 
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input1, 80, B1, p * 128 * r, 1, (uint8_t*)output1, 32);
-    /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
-    PBKDF2_SHA256((const uint8_t*)input2, 80, B2, p * 128 * r, 1, (uint8_t*)output2, 32);
+    PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, B1, output1);
+    PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, B2, output2);
 }
 
 int scanhash_scrypt2(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
     const unsigned char *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
 {
-    unsigned char data1[80];
-    unsigned char tmp_hash1[32];
-    unsigned char data2[80];
-    unsigned char tmp_hash2[32];
-    uint32_t *nonce1 = (uint32_t *)(data1 + 64 + 12);
-    uint32_t *nonce2 = (uint32_t *)(data2 + 64 + 12);
+    uint32_t data1[20];
+    uint32_t tmp_hash1[8];
+    uint32_t data2[20];
+    uint32_t tmp_hash2[8];
+    uint32_t *nonce1 = (uint32_t *)(data1 + 19);
+    uint32_t *nonce2 = (uint32_t *)(data2 + 19);
     uint32_t n = 0;
     uint32_t Htarg = le32dec(ptarget + 28);
     int i;
@@ -337,22 +238,22 @@ int scanhash_scrypt2(int thr_id, unsigned char *pdata, unsigned char *scratchbuf
     work_restart[thr_id].restart = 0;
 
     for (i = 0; i < 80/4; i++) {
-        ((uint32_t *)data1)[i] = swab32(((uint32_t *)pdata)[i]);
-        ((uint32_t *)data2)[i] = swab32(((uint32_t *)pdata)[i]);
+        ((uint32_t *)data1)[i] = be32dec(pdata + i * 4);
+        ((uint32_t *)data2)[i] = be32dec(pdata + i * 4);
     }
 
     while(1) {
-        le32enc(nonce1, n + 1);
-        le32enc(nonce2, n + 2);
+        *nonce1 = n + 1;
+        *nonce2 = n + 2;
         scrypt_1024_1_1_256_sp2(data1, tmp_hash1, data2, tmp_hash2, scratchbuf);
 
-        if (le32dec(tmp_hash1+28) <= Htarg) {
+        if (tmp_hash1[7] <= Htarg) {
            	be32enc(pdata + 64 + 12, n + 1);
            	*hashes_done = n + 1;
            	return true;
       	}
-        if (le32dec(tmp_hash2+28) <= Htarg && n + 2 <= max_nonce) {
+        if (tmp_hash2[7] <= Htarg && n + 2 <= max_nonce) {
            	be32enc(pdata + 64 + 12, n + 2);
           	*hashes_done = n + 2;
           	return true;
diff --git a/sha256-helpers.h b/sha256-helpers.h
index 9d17729..c54fd01 100644
--- a/sha256-helpers.h
+++ b/sha256-helpers.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright 2009 Colin Percival, 2011 ArtForz
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -34,8 +34,18 @@
 #include <stdint.h>
 #include <string.h>
 
-static inline uint32_t
-be32dec(const void *pp)
+#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+static inline void
+byteswap_vec(uint32_t *dest, const uint32_t *src, uint32_t len)
+{
+    uint32_t i;
+
+    for (i = 0; i < len; i++)
+        dest[i] = byteswap(src[i]);
+}
+
+static inline uint32_t be32dec(const void *pp)
 {
     const uint8_t *p = (uint8_t const *)pp;
@@ -43,8 +53,7 @@ be32dec(const void *pp)
         ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
 }
 
-static inline void
-be32enc(void *pp, uint32_t x)
+static inline void be32enc(void *pp, uint32_t x)
 {
     uint8_t * p = (uint8_t *)pp;
@@ -54,8 +63,7 @@ be32enc(void *pp, uint32_t x)
     p[0] = (x >> 24) & 0xff;
 }
 
-static inline uint32_t
-le32dec(const void *pp)
+static inline uint32_t le32dec(const void *pp)
 {
     const uint8_t *p = (uint8_t const *)pp;
@@ -63,8 +71,7 @@ le32dec(const void *pp)
         ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
 }
 
-static inline void
-le32enc(void *pp, uint32_t x)
+static inline void le32enc(void *pp, uint32_t x)
 {
     uint8_t * p = (uint8_t *)pp;
@@ -74,44 +81,11 @@ le32enc(void *pp, uint32_t x)
     p[3] = (x >> 24) & 0xff;
 }
 
-
 typedef struct SHA256Context {
     uint32_t state[8];
-    uint32_t count[2];
-    unsigned char buf[64];
+    uint32_t buf[16];
 } SHA256_CTX;
 
-typedef struct HMAC_SHA256Context {
-    SHA256_CTX ictx;
-    SHA256_CTX octx;
-} HMAC_SHA256_CTX;
-
-/*
- * Encode a length len/4 vector of (uint32_t) into a length len vector of
- * (unsigned char) in big-endian form.  Assumes len is a multiple of 4.
- */
-static void
-be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
-{
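
Two recurring simplifications in the hunks above follow directly from keeping all state in native-endian uint32_t words. A small illustrative sketch (mine, not from the commit):

```c
#include <stdint.h>

/* Integerify: with r = 1, block B_{2r-1} starts at word 16 of X, and
 * "mod N" with N = 1024 keeps only the low 10 bits -- so the generic
 * "integerify(X, r) & (N - 1)" collapses to the diff's "X[16] & 1023". */
static uint32_t integerify_mod_1024(const uint32_t X[32])
{
    uint64_t v = ((uint64_t)X[17] << 32) | X[16];
    return (uint32_t)(v & (1024 - 1));  /* == X[16] & 1023 */
}

/* Target check: once the hash stays in 32-bit words, the last word of
 * the 256-bit result is simply element 7, so the byte-wise
 * "le32dec(tmp_hash + 28)" becomes the plain read "tmp_hash[7]". */
static int hash_meets_target(const uint32_t tmp_hash[8], uint32_t Htarg)
{
    return tmp_hash[7] <= Htarg;
}
```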