aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiarhei Siamashka <siarhei.siamashka@gmail.com>2012-02-21 00:03:44 +0200
committerSiarhei Siamashka <siarhei.siamashka@gmail.com>2012-02-21 00:03:44 +0200
commit3c51c7107674e3fb62734d6fe80405db5484ee5e (patch)
tree7d4484437ba4ea614fcf3cd06847dd12770d3dbb
parenta62cd79b85c69aea5b980f91d2c563b082b06af0 (diff)
Initial partial conversion to SPU assembly
Only the first loop is converted. The performance is increased up to ~5.97 khash/sec per SPE core. TODO: try to update it to use better data layout and more unrolling from shakti's variant.
-rw-r--r--Makefile.am7
-rw-r--r--scrypt-cell-spu-asm.S1143
-rw-r--r--scrypt-cell-spu.c142
3 files changed, 1258 insertions, 34 deletions
diff --git a/Makefile.am b/Makefile.am
index 47affe0..b900312 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,10 +22,11 @@ minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@
if HAVE_CELL_SPU
-scrypt-cell-spu.o: scrypt-cell-spu.c sha256-helpers.h \
- scrypt-simd-helpers.h scrypt-cell-spu.h
+scrypt-cell-spu.o: scrypt-cell-spu.c scrypt-cell-spu-asm.S \
+ sha256-helpers.h scrypt-simd-helpers.h \
+ scrypt-cell-spu.h
$(SPU_ELF_GCC) -O3 -fstrict-aliasing -Wall -Wstrict-aliasing \
- -o scrypt-cell-spu.elf scrypt-cell-spu.c
+ -o scrypt-cell-spu.elf scrypt-cell-spu.c scrypt-cell-spu-asm.S
$(EMBEDSPU) scrypt_spu scrypt-cell-spu.elf scrypt-cell-spu.o
minerd_LDADD += scrypt-cell-spu.o @SPE2_LIBS@
diff --git a/scrypt-cell-spu-asm.S b/scrypt-cell-spu-asm.S
new file mode 100644
index 0000000..b095a56
--- /dev/null
+++ b/scrypt-cell-spu-asm.S
@@ -0,0 +1,1143 @@
+/*-
+ * Copyright 2012 Siarhei Siamashka
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+.text
+.global scrypt_spu_loop1_asm
+
+/*****************************************************************************/
+
+.balign 16
+shufmask3012:
+ .byte 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+shufmask2301:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+shufmask1230:
+ .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3
+
+/*****************************************************************************/
+
+.balign 16
+registers_save_area:
+.rept (128 - 80) * 16
+ .byte 0
+.endr
+
+.macro save_regs start_reg=80, end_reg=127
+ stqr $\start_reg, registers_save_area + (\start_reg - 80) * 16
+.if \start_reg < \end_reg
+ save_regs "(\start_reg + 1)", \end_reg
+.endif
+.endm
+
+.macro restore_regs start_reg=80, end_reg=127
+ lqr $\start_reg, registers_save_area + (\start_reg - 80) * 16
+.if \start_reg < \end_reg
+ restore_regs "(\start_reg + 1)", \end_reg
+.endif
+.endm
+
+/*****************************************************************************/
+
+#define MFC_PUTL_CMD 36
+#define MFC_TAG_UPDATE_ALL 2
+
+/*
+ * dual-issue for all shufb instructions
+ */
+.macro salsa20_8_step X0s, X1s, X2s, X3s, /* 16 regs with data */ \
+ Y0s, Y1s, Y2s, Y3s, \
+ Z0s, Z1s, Z2s, Z3s, \
+ W0s, W1s, W2s, W3s, \
+ X0, X1, X2, X3, /* 16 regs with data */ \
+ Y0, Y1, Y2, Y3, \
+ Z0, Z1, Z2, Z3, \
+ W0, W1, W2, W3, \
+ TX, TY, TZ, TW, /* 16 temporary regs */ \
+ X1t, X2t, X3t, \
+ Y1t, Y2t, Y3t, \
+ Z1t, Z2t, Z3t, \
+ W1t, W2t, W3t, \
+ MASK3012, /* shuffle constants */ \
+ MASK2301, \
+ MASK1230
+
+ a \TX, \X0s, \X3s
+ a \TY, \Y0s, \Y3s
+ a \TZ, \Z0s, \Z3s
+ a \TW, \W0s, \W3s
+ roti \TX, \TX, 7
+ roti \TY, \TY, 7
+ roti \TZ, \TZ, 7
+ roti \TW, \TW, 7
+ xor \X1, \X1s, \TX
+ xor \Y1, \Y1s, \TY
+ xor \Z1, \Z1s, \TZ
+ xor \W1, \W1s, \TW
+ a \TX, \X1, \X0s
+ shufb \X1t, \X1, \X1, \MASK3012
+ a \TY, \Y1, \Y0s
+ shufb \Y1t, \Y1, \Y1, \MASK3012
+ a \TZ, \Z1, \Z0s
+ shufb \Z1t, \Z1, \Z1, \MASK3012
+ a \TW, \W1, \W0s
+ shufb \W1t, \W1, \W1, \MASK3012
+ roti \TX, \TX, 9
+ roti \TY, \TY, 9
+ roti \TZ, \TZ, 9
+ roti \TW, \TW, 9
+ xor \X2, \X2s, \TX
+ xor \Y2, \Y2s, \TY
+ xor \Z2, \Z2s, \TZ
+ xor \W2, \W2s, \TW
+ a \TX, \X2, \X1
+ shufb \X2t, \X2, \X2, \MASK2301
+ a \TY, \Y2, \Y1
+ shufb \Y2t, \Y2, \Y2, \MASK2301
+ a \TZ, \Z2, \Z1
+ shufb \Z2t, \Z2, \Z2, \MASK2301
+ a \TW, \W2, \W1
+ shufb \W2t, \W2, \W2, \MASK2301
+ roti \TX, \TX, 13
+ roti \TY, \TY, 13
+ roti \TZ, \TZ, 13
+ roti \TW, \TW, 13
+ xor \X3, \X3s, \TX
+ xor \Y3, \Y3s, \TY
+ xor \Z3, \Z3s, \TZ
+ xor \W3, \W3s, \TW
+ a \TX, \X3, \X2
+ shufb \X3t, \X3, \X3, \MASK1230
+ a \TY, \Y3, \Y2
+ shufb \Y3t, \Y3, \Y3, \MASK1230
+ a \TZ, \Z3, \Z2
+ shufb \Z3t, \Z3, \Z3, \MASK1230
+ a \TW, \W3, \W2
+ shufb \W3t, \W3, \W3, \MASK1230
+ roti \TX, \TX, 18
+ roti \TY, \TY, 18
+ roti \TZ, \TZ, 18
+ roti \TW, \TW, 18
+ xor \X0, \X0s, \TX
+ xor \Y0, \Y0s, \TY
+ xor \Z0, \Z0s, \TZ
+ xor \W0, \W0s, \TW
+
+ a \TX, \X0, \X1t
+ a \TY, \Y0, \Y1t
+ a \TZ, \Z0, \Z1t
+ a \TW, \W0, \W1t
+ roti \TX, \TX, 7
+ roti \TY, \TY, 7
+ roti \TZ, \TZ, 7
+ roti \TW, \TW, 7
+ xor \X3t, \X3t, \TX
+ xor \Y3t, \Y3t, \TY
+ xor \Z3t, \Z3t, \TZ
+ xor \W3t, \W3t, \TW
+ a \TX, \X3t, \X0
+ shufb \X3, \X3t, \X3t, \MASK3012
+ a \TY, \Y3t, \Y0
+ shufb \Y3, \Y3t, \Y3t, \MASK3012
+ a \TZ, \Z3t, \Z0
+ shufb \Z3, \Z3t, \Z3t, \MASK3012
+ a \TW, \W3t, \W0
+ shufb \W3, \W3t, \W3t, \MASK3012
+ roti \TX, \TX, 9
+ roti \TY, \TY, 9
+ roti \TZ, \TZ, 9
+ roti \TW, \TW, 9
+ xor \X2t, \X2t, \TX
+ xor \Y2t, \Y2t, \TY
+ xor \Z2t, \Z2t, \TZ
+ xor \W2t, \W2t, \TW
+ a \TX, \X2t, \X3t
+ shufb \X2, \X2t, \X2t, \MASK2301
+ a \TY, \Y2t, \Y3t
+ shufb \Y2, \Y2t, \Y2t, \MASK2301
+ a \TZ, \Z2t, \Z3t
+ shufb \Z2, \Z2t, \Z2t, \MASK2301
+ a \TW, \W2t, \W3t
+ shufb \W2, \W2t, \W2t, \MASK2301
+ roti \TX, \TX, 13
+ roti \TY, \TY, 13
+ roti \TZ, \TZ, 13
+ roti \TW, \TW, 13
+ xor \X1t, \X1t, \TX
+ xor \Y1t, \Y1t, \TY
+ xor \Z1t, \Z1t, \TZ
+ xor \W1t, \W1t, \TW
+ a \TX, \X1t, \X2t
+ shufb \X1, \X1t, \X1t, \MASK1230
+ a \TY, \Y1t, \Y2t
+ shufb \Y1, \Y1t, \Y1t, \MASK1230
+ a \TZ, \Z1t, \Z2t
+ shufb \Z1, \Z1t, \Z1t, \MASK1230
+ a \TW, \W1t, \W2t
+ shufb \W1, \W1t, \W1t, \MASK1230
+ roti \TX, \TX, 18
+ roti \TY, \TY, 18
+ roti \TZ, \TZ, 18
+ roti \TW, \TW, 18
+ xor \X0, \X0, \TX
+ xor \Y0, \Y0, \TY
+ xor \Z0, \Z0, \TZ
+ xor \W0, \W0, \TW
+.endm
+
+.macro salsa20_8_xor4d X0l, X1l, X2l, X3l, \
+ Y0l, Y1l, Y2l, Y3l, \
+ Z0l, Z1l, Z2l, Z3l, \
+ W0l, W1l, W2l, W3l, \
+ X0r, X1r, X2r, X3r, \
+ Y0r, Y1r, Y2r, Y3r, \
+ Z0r, Z1r, Z2r, Z3r, \
+ W0r, W1r, W2r, W3r, \
+ X0c, X1c, X2c, X3c, \
+ Y0c, Y1c, Y2c, Y3c, \
+ Z0c, Z1c, Z2c, Z3c, \
+ W0c, W1c, W2c, W3c, \
+ TX, TY, TZ, TW, \
+ X1t, X2t, X3t, \
+ Y1t, Y2t, Y3t, \
+ Z1t, Z2t, Z3t, \
+ W1t, W2t, W3t, \
+ MASK3012, \
+ MASK2301, \
+ MASK1230
+
+ xor \X0c, \X0l, \X0r
+ xor \X1c, \X1l, \X1r
+ xor \X2c, \X2l, \X2r
+ xor \X3c, \X3l, \X3r
+ xor \Y0c, \Y0l, \Y0r
+ xor \Y1c, \Y1l, \Y1r
+ xor \Y2c, \Y2l, \Y2r
+ xor \Y3c, \Y3l, \Y3r
+ xor \Z0c, \Z0l, \Z0r
+ xor \Z1c, \Z1l, \Z1r
+ xor \Z2c, \Z2l, \Z2r
+ xor \Z3c, \Z3l, \Z3r
+ xor \W0c, \W0l, \W0r
+ xor \W1c, \W1l, \W1r
+ xor \W2c, \W2l, \W2r
+ xor \W3c, \W3l, \W3r
+
+.balign 8
+ salsa20_8_step \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \X0l, \X1l, \X2l, \X3l, \
+ \Y0l, \Y1l, \Y2l, \Y3l, \
+ \Z0l, \Z1l, \Z2l, \Z3l, \
+ \W0l, \W1l, \W2l, \W3l, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+
+.balign 8
+.rept 3
+ salsa20_8_step \
+ \X0l, \X1l, \X2l, \X3l, \
+ \Y0l, \Y1l, \Y2l, \Y3l, \
+ \Z0l, \Z1l, \Z2l, \Z3l, \
+ \W0l, \W1l, \W2l, \W3l, \
+ \X0l, \X1l, \X2l, \X3l, \
+ \Y0l, \Y1l, \Y2l, \Y3l, \
+ \Z0l, \Z1l, \Z2l, \Z3l, \
+ \W0l, \W1l, \W2l, \W3l, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+.endr
+
+ a \X0l, \X0l, \X0c
+ a \X1l, \X1l, \X1c
+ a \X2l, \X2l, \X2c
+ a \X3l, \X3l, \X3c
+ a \Y0l, \Y0l, \Y0c
+ a \Y1l, \Y1l, \Y1c
+ a \Y2l, \Y2l, \Y2c
+ a \Y3l, \Y3l, \Y3c
+ a \Z0l, \Z0l, \Z0c
+ a \Z1l, \Z1l, \Z1c
+ a \Z2l, \Z2l, \Z2c
+ a \Z3l, \Z3l, \Z3c
+ a \W0l, \W0l, \W0c
+ a \W1l, \W1l, \W1c
+ a \W2l, \W2l, \W2c
+ a \W3l, \W3l, \W3c
+
+ xor \X0c, \X0l, \X0r
+ xor \X1c, \X1l, \X1r
+ xor \X2c, \X2l, \X2r
+ xor \X3c, \X3l, \X3r
+ xor \Y0c, \Y0l, \Y0r
+ xor \Y1c, \Y1l, \Y1r
+ xor \Y2c, \Y2l, \Y2r
+ xor \Y3c, \Y3l, \Y3r
+ xor \Z0c, \Z0l, \Z0r
+ xor \Z1c, \Z1l, \Z1r
+ xor \Z2c, \Z2l, \Z2r
+ xor \Z3c, \Z3l, \Z3r
+ xor \W0c, \W0l, \W0r
+ xor \W1c, \W1l, \W1r
+ xor \W2c, \W2l, \W2r
+ xor \W3c, \W3l, \W3r
+
+.balign 8
+ salsa20_8_step \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \X0r, \X1r, \X2r, \X3r, \
+ \Y0r, \Y1r, \Y2r, \Y3r, \
+ \Z0r, \Z1r, \Z2r, \Z3r, \
+ \W0r, \W1r, \W2r, \W3r, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+
+.balign 8
+.rept 3
+ salsa20_8_step \
+ \X0r, \X1r, \X2r, \X3r, \
+ \Y0r, \Y1r, \Y2r, \Y3r, \
+ \Z0r, \Z1r, \Z2r, \Z3r, \
+ \W0r, \W1r, \W2r, \W3r, \
+ \X0r, \X1r, \X2r, \X3r, \
+ \Y0r, \Y1r, \Y2r, \Y3r, \
+ \Z0r, \Z1r, \Z2r, \Z3r, \
+ \W0r, \W1r, \W2r, \W3r, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+.endr
+
+ a \X0r, \X0r, \X0c
+ a \X1r, \X1r, \X1c
+ a \X2r, \X2r, \X2c
+ a \X3r, \X3r, \X3c
+ a \Y0r, \Y0r, \Y0c
+ a \Y1r, \Y1r, \Y1c
+ a \Y2r, \Y2r, \Y2c
+ a \Y3r, \Y3r, \Y3c
+ a \Z0r, \Z0r, \Z0c
+ a \Z1r, \Z1r, \Z1c
+ a \Z2r, \Z2r, \Z2c
+ a \Z3r, \Z3r, \Z3c
+ a \W0r, \W0r, \W0c
+ a \W1r, \W1r, \W1c
+ a \W2r, \W2r, \W2c
+ a \W3r, \W3r, \W3c
+.endm
+
+/*****************************************************************************/
+
+.macro scrypt_spu_loop1 data, dma_vect_list, dma_vect_step, scratch_eahi \
+ tag1, tag_mask1, tag2, tag_mask2, \
+ idx, dma_vect_list_size, mfc_putl_cmd, dma_vect0, dma_vect1, dma_vect2, dma_vect3, \
+ dummy, mfc_tag_update_all, data_b, dma_vect_list_b, \
+ aX0l, aX1l, aX2l, aX3l, \
+ aY0l, aY1l, aY2l, aY3l, \
+ aZ0l, aZ1l, aZ2l, aZ3l, \
+ aW0l, aW1l, aW2l, aW3l, \
+ aX0r, aX1r, aX2r, aX3r, \
+ aY0r, aY1r, aY2r, aY3r, \
+ aZ0r, aZ1r, aZ2r, aZ3r, \
+ aW0r, aW1r, aW2r, aW3r, \
+ bX0l, bX1l, bX2l, bX3l, \
+ bY0l, bY1l, bY2l, bY3l, \
+ bZ0l, bZ1l, bZ2l, bZ3l, \
+ bW0l, bW1l, bW2l, bW3l, \
+ bX0r, bX1r, bX2r, bX3r, \
+ bY0r, bY1r, bY2r, bY3r, \
+ bZ0r, bZ1r, bZ2r, bZ3r, \
+ bW0r, bW1r, bW2r, bW3r, \
+ X0c, X1c, X2c, X3c, \
+ Y0c, Y1c, Y2c, Y3c, \
+ Z0c, Z1c, Z2c, Z3c, \
+ W0c, W1c, W2c, W3c, \
+ TX, TY, TZ, TW, \
+ X1t, X2t, X3t, \
+ Y1t, Y2t, Y3t, \
+ Z1t, Z2t, Z3t, \
+ W1t, W2t, W3t, \
+ MASK3012, \
+ MASK2301, \
+ MASK1230
+ /* load shuffle masks */
+ il \idx, 1023
+ il \dma_vect_list_size, 32
+ il \mfc_putl_cmd, MFC_PUTL_CMD
+ il \mfc_tag_update_all, MFC_TAG_UPDATE_ALL
+ ai \data_b, \data, 2 * 8 * 16
+ ai \data_b, \data_b, 2 * 8 * 16
+ ai \dma_vect_list_b, \dma_vect_list, 32
+ lqr \MASK3012, shufmask3012
+ lqr \MASK2301, shufmask2301
+ lqr \MASK1230, shufmask1230
+
+ lqd \dma_vect0, 0 * 16(\dma_vect_list)
+ lqd \dma_vect1, 1 * 16(\dma_vect_list)
+ lqd \dma_vect2, 2 * 16(\dma_vect_list)
+ lqd \dma_vect3, 3 * 16(\dma_vect_list)
+
+ lqd \aX0l, 0 * 128 + 0 * 16(\data)
+ lqd \aX1l, 0 * 128 + 1 * 16(\data)
+ lqd \aX2l, 0 * 128 + 2 * 16(\data)
+ lqd \aX3l, 0 * 128 + 3 * 16(\data)
+ lqd \aX0r, 0 * 128 + 4 * 16(\data)
+ lqd \aX1r, 0 * 128 + 5 * 16(\data)
+ lqd \aX2r, 0 * 128 + 6 * 16(\data)
+ lqd \aX3r, 0 * 128 + 7 * 16(\data)
+
+ lqd \aY0l, 1 * 128 + 0 * 16(\data)
+ lqd \aY1l, 1 * 128 + 1 * 16(\data)
+ lqd \aY2l, 1 * 128 + 2 * 16(\data)
+ lqd \aY3l, 1 * 128 + 3 * 16(\data)
+ lqd \aY0r, 1 * 128 + 4 * 16(\data)
+ lqd \aY1r, 1 * 128 + 5 * 16(\data)
+ lqd \aY2r, 1 * 128 + 6 * 16(\data)
+ lqd \aY3r, 1 * 128 + 7 * 16(\data)
+
+ lqd \aZ0l, 2 * 128 + 0 * 16(\data)
+ lqd \aZ1l, 2 * 128 + 1 * 16(\data)
+ lqd \aZ2l, 2 * 128 + 2 * 16(\data)
+ lqd \aZ3l, 2 * 128 + 3 * 16(\data)
+ lqd \aZ0r, 2 * 128 + 4 * 16(\data)
+ lqd \aZ1r, 2 * 128 + 5 * 16(\data)
+ lqd \aZ2r, 2 * 128 + 6 * 16(\data)
+ lqd \aZ3r, 2 * 128 + 7 * 16(\data)
+
+ lqd \aW0l, 3 * 128 + 0 * 16(\data)
+ lqd \aW1l, 3 * 128 + 1 * 16(\data)
+ lqd \aW2l, 3 * 128 + 2 * 16(\data)
+ lqd \aW3l, 3 * 128 + 3 * 16(\data)
+ lqd \aW0r, 3 * 128 + 4 * 16(\data)
+ lqd \aW1r, 3 * 128 + 5 * 16(\data)
+ lqd \aW2r, 3 * 128 + 6 * 16(\data)
+ lqd \aW3r, 3 * 128 + 7 * 16(\data)
+
+ lqd \bX0l, 4 * 128 + 0 * 16(\data)
+ lqd \bX1l, 4 * 128 + 1 * 16(\data)
+ lqd \bX2l, 4 * 128 + 2 * 16(\data)
+ lqd \bX3l, 4 * 128 + 3 * 16(\data)
+ lqd \bX0r, 4 * 128 + 4 * 16(\data)
+ lqd \bX1r, 4 * 128 + 5 * 16(\data)
+ lqd \bX2r, 4 * 128 + 6 * 16(\data)
+ lqd \bX3r, 4 * 128 + 7 * 16(\data)
+
+ lqd \bY0l, 5 * 128 + 0 * 16(\data)
+ lqd \bY1l, 5 * 128 + 1 * 16(\data)
+ lqd \bY2l, 5 * 128 + 2 * 16(\data)
+ lqd \bY3l, 5 * 128 + 3 * 16(\data)
+ lqd \bY0r, 5 * 128 + 4 * 16(\data)
+ lqd \bY1r, 5 * 128 + 5 * 16(\data)
+ lqd \bY2r, 5 * 128 + 6 * 16(\data)
+ lqd \bY3r, 5 * 128 + 7 * 16(\data)
+
+ lqd \bZ0l, 6 * 128 + 0 * 16(\data)
+ lqd \bZ1l, 6 * 128 + 1 * 16(\data)
+ lqd \bZ2l, 6 * 128 + 2 * 16(\data)
+ lqd \bZ3l, 6 * 128 + 3 * 16(\data)
+ lqd \bZ0r, 6 * 128 + 4 * 16(\data)
+ lqd \bZ1r, 6 * 128 + 5 * 16(\data)
+ lqd \bZ2r, 6 * 128 + 6 * 16(\data)
+ lqd \bZ3r, 6 * 128 + 7 * 16(\data)
+
+ lqd \bW0l, 7 * 128 + 0 * 16(\data)
+ lqd \bW1l, 7 * 128 + 1 * 16(\data)
+ lqd \bW2l, 7 * 128 + 2 * 16(\data)
+ lqd \bW3l, 7 * 128 + 3 * 16(\data)
+ lqd \bW0r, 7 * 128 + 4 * 16(\data)
+ lqd \bW1r, 7 * 128 + 5 * 16(\data)
+ lqd \bW2r, 7 * 128 + 6 * 16(\data)
+ lqd \bW3r, 7 * 128 + 7 * 16(\data)
+
+ dsync
+ wrch $ch16, \data // local storage address
+ wrch $ch17, \scratch_eahi // EAH
+ wrch $ch18, \dma_vect_list // list address
+ wrch $ch19, \dma_vect_list_size // list size
+ wrch $ch20, \tag1 // tag id
+ wrch $ch21, \mfc_putl_cmd // MFC_PUTL_CMD
+0:
+/*
+ salsa20_8_xor4d \aX0l, \aX1l, \aX2l, \aX3l, \
+ \aY0l, \aY1l, \aY2l, \aY3l, \
+ \aZ0l, \aZ1l, \aZ2l, \aZ3l, \
+ \aW0l, \aW1l, \aW2l, \aW3l, \
+ \aX0r, \aX1r, \aX2r, \aX3r, \
+ \aY0r, \aY1r, \aY2r, \aY3r, \
+ \aZ0r, \aZ1r, \aZ2r, \aZ3r, \
+ \aW0r, \aW1r, \aW2r, \aW3r, \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+*/
+.balign 8
+ a \dma_vect2, \dma_vect2, \dma_vect_step
+ wrch $ch22, \tag_mask2 // tag mask
+ a \dma_vect3, \dma_vect3, \dma_vect_step
+ wrch $ch23, \mfc_tag_update_all // MFC_TAG_UPDATE_ALL
+ xor \X0c, \aX0l, \aX0r
+ xor \X1c, \aX1l, \aX1r
+ xor \X2c, \aX2l, \aX2r
+ xor \X3c, \aX3l, \aX3r
+ xor \Y0c, \aY0l, \aY0r
+ rdch \dummy, $ch24 // read to dummy
+ xor \Y1c, \aY1l, \aY1r
+ stqd \dma_vect2, 2 * 16(\dma_vect_list)
+ xor \Y2c, \aY2l, \aY2r
+ stqd \dma_vect3, 3 * 16(\dma_vect_list)
+ xor \Y3c, \aY3l, \aY3r
+ stqd \bX0l, 4 * 128 + 0 * 16(\data)
+ xor \Z0c, \aZ0l, \aZ0r
+ stqd \bX1l, 4 * 128 + 1 * 16(\data)
+ xor \Z1c, \aZ1l, \aZ1r
+ stqd \bX2l, 4 * 128 + 2 * 16(\data)
+ xor \Z2c, \aZ2l, \aZ2r
+ stqd \bX3l, 4 * 128 + 3 * 16(\data)
+ xor \Z3c, \aZ3l, \aZ3r
+ stqd \bX0r, 4 * 128 + 4 * 16(\data)
+ xor \W0c, \aW0l, \aW0r
+ stqd \bX1r, 4 * 128 + 5 * 16(\data)
+ xor \W1c, \aW1l, \aW1r
+ stqd \bX2r, 4 * 128 + 6 * 16(\data)
+ xor \W2c, \aW2l, \aW2r
+ stqd \bX3r, 4 * 128 + 7 * 16(\data)
+ xor \W3c, \aW3l, \aW3r
+
+.balign 8
+ salsa20_8_step \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \aX0l, \aX1l, \aX2l, \aX3l, \
+ \aY0l, \aY1l, \aY2l, \aY3l, \
+ \aZ0l, \aZ1l, \aZ2l, \aZ3l, \
+ \aW0l, \aW1l, \aW2l, \aW3l, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+
+.balign 8
+.rept 3
+ salsa20_8_step \
+ \aX0l, \aX1l, \aX2l, \aX3l, \
+ \aY0l, \aY1l, \aY2l, \aY3l, \
+ \aZ0l, \aZ1l, \aZ2l, \aZ3l, \
+ \aW0l, \aW1l, \aW2l, \aW3l, \
+ \aX0l, \aX1l, \aX2l, \aX3l, \
+ \aY0l, \aY1l, \aY2l, \aY3l, \
+ \aZ0l, \aZ1l, \aZ2l, \aZ3l, \
+ \aW0l, \aW1l, \aW2l, \aW3l, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+.endr
+
+.balign 8
+ a \aX0l, \aX0l, \X0c
+ stqd \bY0l, 5 * 128 + 0 * 16(\data)
+ a \aX1l, \aX1l, \X1c
+ stqd \bY1l, 5 * 128 + 1 * 16(\data)
+ a \aX2l, \aX2l, \X2c
+ stqd \bY2l, 5 * 128 + 2 * 16(\data)
+ a \aX3l, \aX3l, \X3c
+ stqd \bY3l, 5 * 128 + 3 * 16(\data)
+ a \aY0l, \aY0l, \Y0c
+ stqd \bY0r, 5 * 128 + 4 * 16(\data)
+ a \aY1l, \aY1l, \Y1c
+ stqd \bY1r, 5 * 128 + 5 * 16(\data)
+ a \aY2l, \aY2l, \Y2c
+ stqd \bY2r, 5 * 128 + 6 * 16(\data)
+ a \aY3l, \aY3l, \Y3c
+ stqd \bY3r, 5 * 128 + 7 * 16(\data)
+ a \aZ0l, \aZ0l, \Z0c
+ a \aZ1l, \aZ1l, \Z1c
+ a \aZ2l, \aZ2l, \Z2c
+ a \aZ3l, \aZ3l, \Z3c
+ a \aW0l, \aW0l, \W0c
+ stqd \bZ0l, 6 * 128 + 0 * 16(\data)
+ a \aW1l, \aW1l, \W1c
+ stqd \bZ1l, 6 * 128 + 1 * 16(\data)
+ a \aW2l, \aW2l, \W2c
+ stqd \bZ2l, 6 * 128 + 2 * 16(\data)
+ a \aW3l, \aW3l, \W3c
+ stqd \bZ3l, 6 * 128 + 3 * 16(\data)
+ xor \X0c, \aX0l, \aX0r
+ stqd \bZ0r, 6 * 128 + 4 * 16(\data)
+ xor \X1c, \aX1l, \aX1r
+ stqd \bZ1r, 6 * 128 + 5 * 16(\data)
+ xor \X2c, \aX2l, \aX2r
+ stqd \bZ2r, 6 * 128 + 6 * 16(\data)
+ xor \X3c, \aX3l, \aX3r
+ stqd \bZ3r, 6 * 128 + 7 * 16(\data)
+ xor \Y0c, \aY0l, \aY0r
+ xor \Y1c, \aY1l, \aY1r
+ xor \Y2c, \aY2l, \aY2r
+ xor \Y3c, \aY3l, \aY3r
+ xor \Z0c, \aZ0l, \aZ0r
+ stqd \bW0l, 7 * 128 + 0 * 16(\data)
+ xor \Z1c, \aZ1l, \aZ1r
+ stqd \bW1l, 7 * 128 + 1 * 16(\data)
+ xor \Z2c, \aZ2l, \aZ2r
+ stqd \bW2l, 7 * 128 + 2 * 16(\data)
+ xor \Z3c, \aZ3l, \aZ3r
+ stqd \bW3l, 7 * 128 + 3 * 16(\data)
+ xor \W0c, \aW0l, \aW0r
+ stqd \bW0r, 7 * 128 + 4 * 16(\data)
+ xor \W1c, \aW1l, \aW1r
+ stqd \bW1r, 7 * 128 + 5 * 16(\data)
+ xor \W2c, \aW2l, \aW2r
+ stqd \bW2r, 7 * 128 + 6 * 16(\data)
+ xor \W3c, \aW3l, \aW3r
+ stqd \bW3r, 7 * 128 + 7 * 16(\data)
+
+.balign 8
+ salsa20_8_step \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \aX0r, \aX1r, \aX2r, \aX3r, \
+ \aY0r, \aY1r, \aY2r, \aY3r, \
+ \aZ0r, \aZ1r, \aZ2r, \aZ3r, \
+ \aW0r, \aW1r, \aW2r, \aW3r, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+
+.balign 8
+.rept 3
+ salsa20_8_step \
+ \aX0r, \aX1r, \aX2r, \aX3r, \
+ \aY0r, \aY1r, \aY2r, \aY3r, \
+ \aZ0r, \aZ1r, \aZ2r, \aZ3r, \
+ \aW0r, \aW1r, \aW2r, \aW3r, \
+ \aX0r, \aX1r, \aX2r, \aX3r, \
+ \aY0r, \aY1r, \aY2r, \aY3r, \
+ \aZ0r, \aZ1r, \aZ2r, \aZ3r, \
+ \aW0r, \aW1r, \aW2r, \aW3r, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+.endr
+
+.balign 8
+ a \aX0r, \aX0r, \X0c
+ dsync
+ a \aX1r, \aX1r, \X1c
+ wrch $ch16, \data_b // local storage address
+ a \aX2r, \aX2r, \X2c
+ wrch $ch17, \scratch_eahi // EAH
+ a \aX3r, \aX3r, \X3c
+ wrch $ch18, \dma_vect_list_b // list address
+ a \aY0r, \aY0r, \Y0c
+ wrch $ch19, \dma_vect_list_size // list size
+ a \aY1r, \aY1r, \Y1c
+ wrch $ch20, \tag2 // tag id
+ a \aY2r, \aY2r, \Y2c
+ wrch $ch21, \mfc_putl_cmd // MFC_PUTL_CMD
+ a \aY3r, \aY3r, \Y3c
+ a \aZ0r, \aZ0r, \Z0c
+ a \aZ1r, \aZ1r, \Z1c
+ a \aZ2r, \aZ2r, \Z2c
+ a \aZ3r, \aZ3r, \Z3c
+ a \aW0r, \aW0r, \W0c
+ a \aW1r, \aW1r, \W1c
+ a \aW2r, \aW2r, \W2c
+ a \aW3r, \aW3r, \W3c
+/*
+ salsa20_8_xor4d \bX0l, \bX1l, \bX2l, \bX3l, \
+ \bY0l, \bY1l, \bY2l, \bY3l, \
+ \bZ0l, \bZ1l, \bZ2l, \bZ3l, \
+ \bW0l, \bW1l, \bW2l, \bW3l, \
+ \bX0r, \bX1r, \bX2r, \bX3r, \
+ \bY0r, \bY1r, \bY2r, \bY3r, \
+ \bZ0r, \bZ1r, \bZ2r, \bZ3r, \
+ \bW0r, \bW1r, \bW2r, \bW3r, \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+*/
+ a \dma_vect0, \dma_vect0, \dma_vect_step
+ wrch $ch22, \tag_mask1 // tag mask
+ a \dma_vect1, \dma_vect1, \dma_vect_step
+ wrch $ch23, \mfc_tag_update_all // MFC_TAG_UPDATE_ALL
+ xor \X0c, \bX0l, \bX0r
+ xor \X1c, \bX1l, \bX1r
+ xor \X2c, \bX2l, \bX2r
+ xor \X3c, \bX3l, \bX3r
+ xor \Y0c, \bY0l, \bY0r
+ rdch \dummy, $ch24 // read to dummy
+ xor \Y1c, \bY1l, \bY1r
+ stqd \dma_vect0, 0 * 16(\dma_vect_list)
+ xor \Y2c, \bY2l, \bY2r
+ stqd \dma_vect1, 1 * 16(\dma_vect_list)
+ xor \Y3c, \bY3l, \bY3r
+ stqd \aX0l, 0 * 128 + 0 * 16(\data)
+ xor \Z0c, \bZ0l, \bZ0r
+ stqd \aX1l, 0 * 128 + 1 * 16(\data)
+ xor \Z1c, \bZ1l, \bZ1r
+ stqd \aX2l, 0 * 128 + 2 * 16(\data)
+ xor \Z2c, \bZ2l, \bZ2r
+ stqd \aX3l, 0 * 128 + 3 * 16(\data)
+ xor \Z3c, \bZ3l, \bZ3r
+ stqd \aX0r, 0 * 128 + 4 * 16(\data)
+ xor \W0c, \bW0l, \bW0r
+ stqd \aX1r, 0 * 128 + 5 * 16(\data)
+ xor \W1c, \bW1l, \bW1r
+ stqd \aX2r, 0 * 128 + 6 * 16(\data)
+ xor \W2c, \bW2l, \bW2r
+ stqd \aX3r, 0 * 128 + 7 * 16(\data)
+ xor \W3c, \bW3l, \bW3r
+
+.balign 8
+ salsa20_8_step \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \bX0l, \bX1l, \bX2l, \bX3l, \
+ \bY0l, \bY1l, \bY2l, \bY3l, \
+ \bZ0l, \bZ1l, \bZ2l, \bZ3l, \
+ \bW0l, \bW1l, \bW2l, \bW3l, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+
+.balign 8
+.rept 3
+ salsa20_8_step \
+ \bX0l, \bX1l, \bX2l, \bX3l, \
+ \bY0l, \bY1l, \bY2l, \bY3l, \
+ \bZ0l, \bZ1l, \bZ2l, \bZ3l, \
+ \bW0l, \bW1l, \bW2l, \bW3l, \
+ \bX0l, \bX1l, \bX2l, \bX3l, \
+ \bY0l, \bY1l, \bY2l, \bY3l, \
+ \bZ0l, \bZ1l, \bZ2l, \bZ3l, \
+ \bW0l, \bW1l, \bW2l, \bW3l, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+.endr
+
+.balign 8
+ a \bX0l, \bX0l, \X0c
+ stqd \aY0l, 1 * 128 + 0 * 16(\data)
+ a \bX1l, \bX1l, \X1c
+ stqd \aY1l, 1 * 128 + 1 * 16(\data)
+ a \bX2l, \bX2l, \X2c
+ stqd \aY2l, 1 * 128 + 2 * 16(\data)
+ a \bX3l, \bX3l, \X3c
+ stqd \aY3l, 1 * 128 + 3 * 16(\data)
+ a \bY0l, \bY0l, \Y0c
+ stqd \aY0r, 1 * 128 + 4 * 16(\data)
+ a \bY1l, \bY1l, \Y1c
+ stqd \aY1r, 1 * 128 + 5 * 16(\data)
+ a \bY2l, \bY2l, \Y2c
+ stqd \aY2r, 1 * 128 + 6 * 16(\data)
+ a \bY3l, \bY3l, \Y3c
+ stqd \aY3r, 1 * 128 + 7 * 16(\data)
+ a \bZ0l, \bZ0l, \Z0c
+ a \bZ1l, \bZ1l, \Z1c
+ a \bZ2l, \bZ2l, \Z2c
+ a \bZ3l, \bZ3l, \Z3c
+ a \bW0l, \bW0l, \W0c
+ stqd \aZ0l, 2 * 128 + 0 * 16(\data)
+ a \bW1l, \bW1l, \W1c
+ stqd \aZ1l, 2 * 128 + 1 * 16(\data)
+ a \bW2l, \bW2l, \W2c
+ stqd \aZ2l, 2 * 128 + 2 * 16(\data)
+ a \bW3l, \bW3l, \W3c
+ stqd \aZ3l, 2 * 128 + 3 * 16(\data)
+ xor \X0c, \bX0l, \bX0r
+ stqd \aZ0r, 2 * 128 + 4 * 16(\data)
+ xor \X1c, \bX1l, \bX1r
+ stqd \aZ1r, 2 * 128 + 5 * 16(\data)
+ xor \X2c, \bX2l, \bX2r
+ stqd \aZ2r, 2 * 128 + 6 * 16(\data)
+ xor \X3c, \bX3l, \bX3r
+ stqd \aZ3r, 2 * 128 + 7 * 16(\data)
+ xor \Y0c, \bY0l, \bY0r
+ xor \Y1c, \bY1l, \bY1r
+ xor \Y2c, \bY2l, \bY2r
+ xor \Y3c, \bY3l, \bY3r
+ xor \Z0c, \bZ0l, \bZ0r
+ stqd \aW0l, 3 * 128 + 0 * 16(\data)
+ xor \Z1c, \bZ1l, \bZ1r
+ stqd \aW1l, 3 * 128 + 1 * 16(\data)
+ xor \Z2c, \bZ2l, \bZ2r
+ stqd \aW2l, 3 * 128 + 2 * 16(\data)
+ xor \Z3c, \bZ3l, \bZ3r
+ stqd \aW3l, 3 * 128 + 3 * 16(\data)
+ xor \W0c, \bW0l, \bW0r
+ stqd \aW0r, 3 * 128 + 4 * 16(\data)
+ xor \W1c, \bW1l, \bW1r
+ stqd \aW1r, 3 * 128 + 5 * 16(\data)
+ xor \W2c, \bW2l, \bW2r
+ stqd \aW2r, 3 * 128 + 6 * 16(\data)
+ xor \W3c, \bW3l, \bW3r
+ stqd \aW3r, 3 * 128 + 7 * 16(\data)
+
+.balign 8
+ salsa20_8_step \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \bX0r, \bX1r, \bX2r, \bX3r, \
+ \bY0r, \bY1r, \bY2r, \bY3r, \
+ \bZ0r, \bZ1r, \bZ2r, \bZ3r, \
+ \bW0r, \bW1r, \bW2r, \bW3r, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+
+.balign 8
+.rept 3
+ salsa20_8_step \
+ \bX0r, \bX1r, \bX2r, \bX3r, \
+ \bY0r, \bY1r, \bY2r, \bY3r, \
+ \bZ0r, \bZ1r, \bZ2r, \bZ3r, \
+ \bW0r, \bW1r, \bW2r, \bW3r, \
+ \bX0r, \bX1r, \bX2r, \bX3r, \
+ \bY0r, \bY1r, \bY2r, \bY3r, \
+ \bZ0r, \bZ1r, \bZ2r, \bZ3r, \
+ \bW0r, \bW1r, \bW2r, \bW3r, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+.endr
+
+.balign 8
+ a \bX0r, \bX0r, \X0c
+ hbrr 1f, 0b
+ a \bX1r, \bX1r, \X1c
+ dsync
+ a \bX2r, \bX2r, \X2c
+ wrch $ch16, \data // local storage address
+ a \bX3r, \bX3r, \X3c
+ wrch $ch17, \scratch_eahi // EAH
+ a \bY0r, \bY0r, \Y0c
+ wrch $ch18, \dma_vect_list // list address
+ a \bY1r, \bY1r, \Y1c
+ wrch $ch19, \dma_vect_list_size // list size
+ a \bY2r, \bY2r, \Y2c
+ wrch $ch20, \tag1 // tag id
+ a \bY3r, \bY3r, \Y3c
+ wrch $ch21, \mfc_putl_cmd // MFC_PUTL_CMD
+ ai \idx, \idx, -1
+ a \bZ0r, \bZ0r, \Z0c
+ a \bZ1r, \bZ1r, \Z1c
+ a \bZ2r, \bZ2r, \Z2c
+ a \bZ3r, \bZ3r, \Z3c
+ a \bW0r, \bW0r, \W0c
+ a \bW1r, \bW1r, \W1c
+ a \bW2r, \bW2r, \W2c
+ a \bW3r, \bW3r, \W3c
+1:
+ brnz \idx, 0b
+
+ /* end of loop */
+
+ salsa20_8_xor4d \aX0l, \aX1l, \aX2l, \aX3l, \
+ \aY0l, \aY1l, \aY2l, \aY3l, \
+ \aZ0l, \aZ1l, \aZ2l, \aZ3l, \
+ \aW0l, \aW1l, \aW2l, \aW3l, \
+ \aX0r, \aX1r, \aX2r, \aX3r, \
+ \aY0r, \aY1r, \aY2r, \aY3r, \
+ \aZ0r, \aZ1r, \aZ2r, \aZ3r, \
+ \aW0r, \aW1r, \aW2r, \aW3r, \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+
+ wrch $ch22, \tag_mask2 // tag mask
+ wrch $ch23, \mfc_tag_update_all // MFC_TAG_UPDATE_ALL
+ rdch \dummy, $ch24 // read to dummy
+
+ a \dma_vect2, \dma_vect2, \dma_vect_step
+ a \dma_vect3, \dma_vect3, \dma_vect_step
+
+ stqd \dma_vect2, 2 * 16(\dma_vect_list)
+ stqd \dma_vect3, 3 * 16(\dma_vect_list)
+
+ stqd \bX0l, 4 * 128 + 0 * 16(\data)
+ stqd \bX1l, 4 * 128 + 1 * 16(\data)
+ stqd \bX2l, 4 * 128 + 2 * 16(\data)
+ stqd \bX3l, 4 * 128 + 3 * 16(\data)
+ stqd \bX0r, 4 * 128 + 4 * 16(\data)
+ stqd \bX1r, 4 * 128 + 5 * 16(\data)
+ stqd \bX2r, 4 * 128 + 6 * 16(\data)
+ stqd \bX3r, 4 * 128 + 7 * 16(\data)
+
+ stqd \bY0l, 5 * 128 + 0 * 16(\data)
+ stqd \bY1l, 5 * 128 + 1 * 16(\data)
+ stqd \bY2l, 5 * 128 + 2 * 16(\data)
+ stqd \bY3l, 5 * 128 + 3 * 16(\data)
+ stqd \bY0r, 5 * 128 + 4 * 16(\data)
+ stqd \bY1r, 5 * 128 + 5 * 16(\data)
+ stqd \bY2r, 5 * 128 + 6 * 16(\data)
+ stqd \bY3r, 5 * 128 + 7 * 16(\data)
+
+ stqd \bZ0l, 6 * 128 + 0 * 16(\data)
+ stqd \bZ1l, 6 * 128 + 1 * 16(\data)
+ stqd \bZ2l, 6 * 128 + 2 * 16(\data)
+ stqd \bZ3l, 6 * 128 + 3 * 16(\data)
+ stqd \bZ0r, 6 * 128 + 4 * 16(\data)
+ stqd \bZ1r, 6 * 128 + 5 * 16(\data)
+ stqd \bZ2r, 6 * 128 + 6 * 16(\data)
+ stqd \bZ3r, 6 * 128 + 7 * 16(\data)
+
+ stqd \bW0l, 7 * 128 + 0 * 16(\data)
+ stqd \bW1l, 7 * 128 + 1 * 16(\data)
+ stqd \bW2l, 7 * 128 + 2 * 16(\data)
+ stqd \bW3l, 7 * 128 + 3 * 16(\data)
+ stqd \bW0r, 7 * 128 + 4 * 16(\data)
+ stqd \bW1r, 7 * 128 + 5 * 16(\data)
+ stqd \bW2r, 7 * 128 + 6 * 16(\data)
+ stqd \bW3r, 7 * 128 + 7 * 16(\data)
+
+ dsync
+ wrch $ch16, \data_b // local storage address
+ wrch $ch17, \scratch_eahi // EAH
+ wrch $ch18, \dma_vect_list_b // list address
+ wrch $ch19, \dma_vect_list_size // list size
+ wrch $ch20, \tag2 // tag id
+ wrch $ch21, \mfc_putl_cmd // MFC_PUTL_CMD
+
+ salsa20_8_xor4d \bX0l, \bX1l, \bX2l, \bX3l, \
+ \bY0l, \bY1l, \bY2l, \bY3l, \
+ \bZ0l, \bZ1l, \bZ2l, \bZ3l, \
+ \bW0l, \bW1l, \bW2l, \bW3l, \
+ \bX0r, \bX1r, \bX2r, \bX3r, \
+ \bY0r, \bY1r, \bY2r, \bY3r, \
+ \bZ0r, \bZ1r, \bZ2r, \bZ3r, \
+ \bW0r, \bW1r, \bW2r, \bW3r, \
+ \X0c, \X1c, \X2c, \X3c, \
+ \Y0c, \Y1c, \Y2c, \Y3c, \
+ \Z0c, \Z1c, \Z2c, \Z3c, \
+ \W0c, \W1c, \W2c, \W3c, \
+ \TX, \TY, \TZ, \TW, \
+ \X1t, \X2t, \X3t, \
+ \Y1t, \Y2t, \Y3t, \
+ \Z1t, \Z2t, \Z3t, \
+ \W1t, \W2t, \W3t, \
+ \MASK3012, \
+ \MASK2301, \
+ \MASK1230
+
+ wrch $ch22, \tag_mask1 // tag mask
+ wrch $ch23, \mfc_tag_update_all // MFC_TAG_UPDATE_ALL
+ rdch \dummy, $ch24 // read to dummy
+ wrch $ch22, \tag_mask2 // tag mask
+ wrch $ch23, \mfc_tag_update_all // MFC_TAG_UPDATE_ALL
+ rdch \dummy, $ch24 // read to dummy
+
+ stqd \aX0l, 0 * 128 + 0 * 16(\data)
+ stqd \aX1l, 0 * 128 + 1 * 16(\data)
+ stqd \aX2l, 0 * 128 + 2 * 16(\data)
+ stqd \aX3l, 0 * 128 + 3 * 16(\data)
+ stqd \aX0r, 0 * 128 + 4 * 16(\data)
+ stqd \aX1r, 0 * 128 + 5 * 16(\data)
+ stqd \aX2r, 0 * 128 + 6 * 16(\data)
+ stqd \aX3r, 0 * 128 + 7 * 16(\data)
+
+ stqd \aY0l, 1 * 128 + 0 * 16(\data)
+ stqd \aY1l, 1 * 128 + 1 * 16(\data)
+ stqd \aY2l, 1 * 128 + 2 * 16(\data)
+ stqd \aY3l, 1 * 128 + 3 * 16(\data)
+ stqd \aY0r, 1 * 128 + 4 * 16(\data)
+ stqd \aY1r, 1 * 128 + 5 * 16(\data)
+ stqd \aY2r, 1 * 128 + 6 * 16(\data)
+ stqd \aY3r, 1 * 128 + 7 * 16(\data)
+
+ stqd \aZ0l, 2 * 128 + 0 * 16(\data)
+ stqd \aZ1l, 2 * 128 + 1 * 16(\data)
+ stqd \aZ2l, 2 * 128 + 2 * 16(\data)
+ stqd \aZ3l, 2 * 128 + 3 * 16(\data)
+ stqd \aZ0r, 2 * 128 + 4 * 16(\data)
+ stqd \aZ1r, 2 * 128 + 5 * 16(\data)
+ stqd \aZ2r, 2 * 128 + 6 * 16(\data)
+ stqd \aZ3r, 2 * 128 + 7 * 16(\data)
+
+ stqd \aW0l, 3 * 128 + 0 * 16(\data)
+ stqd \aW1l, 3 * 128 + 1 * 16(\data)
+ stqd \aW2l, 3 * 128 + 2 * 16(\data)
+ stqd \aW3l, 3 * 128 + 3 * 16(\data)
+ stqd \aW0r, 3 * 128 + 4 * 16(\data)
+ stqd \aW1r, 3 * 128 + 5 * 16(\data)
+ stqd \aW2r, 3 * 128 + 6 * 16(\data)
+ stqd \aW3r, 3 * 128 + 7 * 16(\data)
+
+ stqd \bX0l, 4 * 128 + 0 * 16(\data)
+ stqd \bX1l, 4 * 128 + 1 * 16(\data)
+ stqd \bX2l, 4 * 128 + 2 * 16(\data)
+ stqd \bX3l, 4 * 128 + 3 * 16(\data)
+ stqd \bX0r, 4 * 128 + 4 * 16(\data)
+ stqd \bX1r, 4 * 128 + 5 * 16(\data)
+ stqd \bX2r, 4 * 128 + 6 * 16(\data)
+ stqd \bX3r, 4 * 128 + 7 * 16(\data)
+
+ stqd \bY0l, 5 * 128 + 0 * 16(\data)
+ stqd \bY1l, 5 * 128 + 1 * 16(\data)
+ stqd \bY2l, 5 * 128 + 2 * 16(\data)
+ stqd \bY3l, 5 * 128 + 3 * 16(\data)
+ stqd \bY0r, 5 * 128 + 4 * 16(\data)
+ stqd \bY1r, 5 * 128 + 5 * 16(\data)
+ stqd \bY2r, 5 * 128 + 6 * 16(\data)
+ stqd \bY3r, 5 * 128 + 7 * 16(\data)
+
+ stqd \bZ0l, 6 * 128 + 0 * 16(\data)
+ stqd \bZ1l, 6 * 128 + 1 * 16(\data)
+ stqd \bZ2l, 6 * 128 + 2 * 16(\data)
+ stqd \bZ3l, 6 * 128 + 3 * 16(\data)
+ stqd \bZ0r, 6 * 128 + 4 * 16(\data)
+ stqd \bZ1r, 6 * 128 + 5 * 16(\data)
+ stqd \bZ2r, 6 * 128 + 6 * 16(\data)
+ stqd \bZ3r, 6 * 128 + 7 * 16(\data)
+
+ stqd \bW0l, 7 * 128 + 0 * 16(\data)
+ stqd \bW1l, 7 * 128 + 1 * 16(\data)
+ stqd \bW2l, 7 * 128 + 2 * 16(\data)
+ stqd \bW3l, 7 * 128 + 3 * 16(\data)
+ stqd \bW0r, 7 * 128 + 4 * 16(\data)
+ stqd \bW1r, 7 * 128 + 5 * 16(\data)
+ stqd \bW2r, 7 * 128 + 6 * 16(\data)
+ stqd \bW3r, 7 * 128 + 7 * 16(\data)
+.endm
+
+ .align 3
+ .type scrypt_spu_loop1_asm, @function
+scrypt_spu_loop1_asm:
+ save_regs 80, 120
+ scrypt_spu_loop1 $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, \
+ $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, \
+ $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, \
+ $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, \
+ $48, $49, $50, $51, $52, $53, $54, $55, $56, $57, $58, \
+ $59, $60, $61, $62, $63, $64, $65, $66, $67, $68, $69, \
+ $70, $71, $72, $73, $74, $75, $76, $77, $78, $79, $80, \
+ $81, $82, $83, $84, $85, $86, $87, $88, $89, $90, $91, \
+ $92, $93, $94, $95, $96, $97, $98, $99, $100, $101, \
+ $102, $103, $104, $105, $106, $107, $108, $109, $110, \
+ $111, $112, $113, $114, $115, $116, $117, $118, $119, $120
+ restore_regs 80, 120
+ bi $lr
diff --git a/scrypt-cell-spu.c b/scrypt-cell-spu.c
index 50002f5..062522b 100644
--- a/scrypt-cell-spu.c
+++ b/scrypt-cell-spu.c
@@ -156,13 +156,96 @@ salsa20_8_xor4(uint32x4 * __restrict B, const uint32x4 * __restrict Bx,
E[3] += W3;
}
+void salsa20_8_xor4x_asm(uint32x4 * data);
+
+void scrypt_spu_loop1_asm(uint32x4 * data,
+ uint32x4 * dma_vect_list,
+ uint32x4 dma_vect_step,
+ uint64_t scratch_eahi,
+ int tag1,
+ int tag_mask1,
+ int tag2,
+ int tag_mask2);
+
+static __attribute__((always_inline)) void
+salsa20_8_xor4d(uint32x4 * data)
+{
+ salsa20_8_xor4(&data[0], &data[4], &data[8], &data[12],
+ &data[16], &data[20], &data[24], &data[28]);
+ salsa20_8_xor4(&data[4], &data[0], &data[12], &data[8],
+ &data[20], &data[16], &data[28], &data[24]);
+}
+
+static mfc_list_element_t dma_list[8] __attribute__((aligned(128)));
+
+static void scrypt_spu_loop1(uint32x4 * data,
+ uint32x4 * dma_vect_list,
+ uint32x4 dma_vect_step,
+ uint64_t scratch_eahi,
+ int tag1,
+ int tag_mask1,
+ int tag2,
+ int tag_mask2)
+{
+ static uint32x4 Z[8 * 8] __attribute__((aligned(128)));
+ int i;
+
+ blkcpy128(&Z[0 * 8], &data[0 * 8]);
+ blkcpy128(&Z[1 * 8], &data[1 * 8]);
+ blkcpy128(&Z[2 * 8], &data[2 * 8]);
+ blkcpy128(&Z[3 * 8], &data[3 * 8]);
+ spu_dsync();
+ spu_mfcdma64(&Z[0], scratch_eahi, (uint32_t)&dma_vect_list[0], 4 * 8, tag1, MFC_PUTL_CMD);
+ for (i = 0; i < 1023; i++) {
+ salsa20_8_xor4d(data);
+ spu_writech(MFC_WrTagMask, tag_mask2);
+ spu_mfcstat(MFC_TAG_UPDATE_ALL);
+ dma_vect_list[2] += dma_vect_step;
+ dma_vect_list[3] += dma_vect_step;
+ blkcpy128(&Z[4 * 8], &data[4 * 8]);
+ blkcpy128(&Z[5 * 8], &data[5 * 8]);
+ blkcpy128(&Z[6 * 8], &data[6 * 8]);
+ blkcpy128(&Z[7 * 8], &data[7 * 8]);
+ spu_dsync();
+ spu_mfcdma64(&Z[4 * 8], scratch_eahi, (uint32_t)&dma_vect_list[2], 4 * 8, tag2, MFC_PUTL_CMD);
+
+ salsa20_8_xor4d(data + 32);
+ spu_writech(MFC_WrTagMask, tag_mask1);
+ spu_mfcstat(MFC_TAG_UPDATE_ALL);
+ dma_vect_list[0] += dma_vect_step;
+ dma_vect_list[1] += dma_vect_step;
+ blkcpy128(&Z[0 * 8], &data[0 * 8]);
+ blkcpy128(&Z[1 * 8], &data[1 * 8]);
+ blkcpy128(&Z[2 * 8], &data[2 * 8]);
+ blkcpy128(&Z[3 * 8], &data[3 * 8]);
+ spu_dsync();
+ spu_mfcdma64(&Z[0], scratch_eahi, (uint32_t)&dma_vect_list[0], 4 * 8, tag1, MFC_PUTL_CMD);
+ }
+ salsa20_8_xor4d(data);
+ spu_writech(MFC_WrTagMask, tag_mask2);
+ spu_mfcstat(MFC_TAG_UPDATE_ALL);
+ dma_vect_list[2] += dma_vect_step;
+ dma_vect_list[3] += dma_vect_step;
+ blkcpy128(&Z[4 * 8], &data[4 * 8]);
+ blkcpy128(&Z[5 * 8], &data[5 * 8]);
+ blkcpy128(&Z[6 * 8], &data[6 * 8]);
+ blkcpy128(&Z[7 * 8], &data[7 * 8]);
+ spu_dsync();
+ spu_mfcdma64(&Z[4 * 8], scratch_eahi, (uint32_t)&dma_vect_list[2], 4 * 8, tag2, MFC_PUTL_CMD);
+ salsa20_8_xor4d(data + 32);
+
+ spu_writech(MFC_WrTagMask, tag_mask1 | tag_mask2);
+ spu_mfcstat(MFC_TAG_UPDATE_ALL);
+}
+
+/* Use assembly implementation */
+#define scrypt_spu_loop1 scrypt_spu_loop1_asm
+
static void
scrypt_spu_core8(uint32_t *databuf32, uint64_t scratch)
{
- static mfc_list_element_t dma_list[8] __attribute__((aligned(128)));
static XY X[8] __attribute__((aligned(128)));
static uint32x4 Y[8 * 8] __attribute__((aligned(128)));
- static uint32x4 Z[8 * 8] __attribute__((aligned(128)));
XY * XA = &X[0];
XY * XB = &X[1];
XY * XC = &X[2];
@@ -207,31 +290,30 @@ scrypt_spu_core8(uint32_t *databuf32, uint64_t scratch)
dma_list[i].size = 128;
/* 2: for i = 0 to N - 1 do */
- for (i = 0; i < 1024; i++) {
- blkcpy128(&Z[0 * 8], &XA->q[0]);
- blkcpy128(&Z[1 * 8], &XB->q[0]);
- blkcpy128(&Z[2 * 8], &XC->q[0]);
- blkcpy128(&Z[3 * 8], &XD->q[0]);
- blkcpy128(&Z[4 * 8], &XE->q[0]);
- blkcpy128(&Z[5 * 8], &XF->q[0]);
- blkcpy128(&Z[6 * 8], &XG->q[0]);
- blkcpy128(&Z[7 * 8], &XH->q[0]);
- dma_list[0].eal = mfc_ea2l(VA + i * 128);
- dma_list[1].eal = mfc_ea2l(VB + i * 128);
- dma_list[2].eal = mfc_ea2l(VC + i * 128);
- dma_list[3].eal = mfc_ea2l(VD + i * 128);
- dma_list[4].eal = mfc_ea2l(VE + i * 128);
- dma_list[5].eal = mfc_ea2l(VF + i * 128);
- dma_list[6].eal = mfc_ea2l(VG + i * 128);
- dma_list[7].eal = mfc_ea2l(VH + i * 128);
- mfc_putl(&Z[0], scratch, &dma_list[0], 8 * sizeof(mfc_list_element_t), tag1, 0, 0);
- salsa20_8_xor4(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4], &XC->q[0], &XC->q[4], &XD->q[0], &XD->q[4]);
- salsa20_8_xor4(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0], &XC->q[4], &XC->q[0], &XD->q[4], &XD->q[0]);
- salsa20_8_xor4(&XE->q[0], &XE->q[4], &XF->q[0], &XF->q[4], &XG->q[0], &XG->q[4], &XH->q[0], &XH->q[4]);
- salsa20_8_xor4(&XE->q[4], &XE->q[0], &XF->q[4], &XF->q[0], &XG->q[4], &XG->q[0], &XH->q[4], &XH->q[0]);
- mfc_write_tag_mask(tag_mask1);
- mfc_read_tag_status_all();
- }
+ do {
+ uint32x4 dma_vect_list[4] = {
+ {
+ 128, mfc_ea2l(scratch + 128 * 1024 * 0),
+ 128, mfc_ea2l(scratch + 128 * 1024 * 1)
+ },
+ {
+ 128, mfc_ea2l(scratch + 128 * 1024 * 2),
+ 128, mfc_ea2l(scratch + 128 * 1024 * 3)
+ },
+ {
+ 128, mfc_ea2l(scratch + 128 * 1024 * 4) - 128,
+ 128, mfc_ea2l(scratch + 128 * 1024 * 5) - 128
+ },
+ {
+ 128, mfc_ea2l(scratch + 128 * 1024 * 6) - 128,
+ 128, mfc_ea2l(scratch + 128 * 1024 * 7) - 128
+ }
+ };
+ uint32x4 dma_vect_step = { 0, 128, 0, 128 };
+ uint32_t scratch_eahi = mfc_ea2h(scratch);
+ scrypt_spu_loop1((uint32x4 *)XA, dma_vect_list, dma_vect_step, scratch_eahi,
+ tag1, tag_mask1, tag2, tag_mask2);
+ } while (0);
dma_list[0].eal = mfc_ea2l(VA + (XA->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
dma_list[1].eal = mfc_ea2l(VB + (XB->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
@@ -253,8 +335,7 @@ scrypt_spu_core8(uint32_t *databuf32, uint64_t scratch)
blkxor128(XB->q, &Y[1 * 8]);
blkxor128(XC->q, &Y[2 * 8]);
blkxor128(XD->q, &Y[3 * 8]);
- salsa20_8_xor4(&XA->q[0], &XA->q[4], &XB->q[0], &XB->q[4], &XC->q[0], &XC->q[4], &XD->q[0], &XD->q[4]);
- salsa20_8_xor4(&XA->q[4], &XA->q[0], &XB->q[4], &XB->q[0], &XC->q[4], &XC->q[0], &XD->q[4], &XD->q[0]);
+ salsa20_8_xor4d(&XA->q[0]);
dma_list[0].eal = mfc_ea2l(VA + (XA->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
dma_list[1].eal = mfc_ea2l(VB + (XB->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
@@ -268,8 +349,7 @@ scrypt_spu_core8(uint32_t *databuf32, uint64_t scratch)
blkxor128(XF->q, &Y[5 * 8]);
blkxor128(XG->q, &Y[6 * 8]);
blkxor128(XH->q, &Y[7 * 8]);
- salsa20_8_xor4(&XE->q[0], &XE->q[4], &XF->q[0], &XF->q[4], &XG->q[0], &XG->q[4], &XH->q[0], &XH->q[4]);
- salsa20_8_xor4(&XE->q[4], &XE->q[0], &XF->q[4], &XF->q[0], &XG->q[4], &XG->q[0], &XH->q[4], &XH->q[0]);
+ salsa20_8_xor4d(&XE->q[0]);
dma_list[4].eal = mfc_ea2l(VE + (XE->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */
dma_list[5].eal = mfc_ea2l(VF + (XF->w[16] & 1023) * 128); /* j <-- Integerify(X) mod N */