Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu: - XTS mode optimisation for twofish/cast6/camellia/aes on x86 - AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia - SSSE3/AVX/AVX2 optimisations for sha256/sha512 - Added driver for SAHARA2 crypto accelerator - Fix for GMAC when used in non-IPsec secnarios - Added generic CMAC implementation (including IPsec glue) - IP update for crypto/atmel - Support for more than one device in hwrng/timeriomem - Added Broadcom BCM2835 RNG driver - Misc fixes * git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits) crypto: caam - fix job ring cleanup code crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher crypto: tcrypt - add async cipher speed tests for blowfish crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2 crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86 crypto: aesni_intel - add more optimized XTS mode for x86-64 crypto: x86/camellia-aesni-avx - add more optimized XTS code crypto: cast6-avx: use new optimized XTS code crypto: x86/twofish-avx - use optimized XTS code crypto: x86 - add more optimized XTS-mode for serpent-avx xfrm: add rfc4494 AES-CMAC-96 support crypto: add CMAC support to CryptoAPI crypto: testmgr - add empty test vectors for null ciphers crypto: testmgr - add AES GMAC test vectors crypto: gcm - fix rfc4543 to handle async crypto correctly crypto: gcm - make GMAC work when dst and src are different hwrng: timeriomem - added devicetree hooks ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2013-05-02 14:53:12 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-05-02 14:53:12 -0700
commit: 797994f81a8b2bdca2eecffa415c1e7a89a4f961 (patch)
tree: 1383dc469c26ad37fdf960f682d9a48c782935c5 /arch
parent: c8d8566952fda026966784a62f324c8352f77430 (diff)
parent: 3862de1f6c442d53bd828d39f86d07d933a70605 (diff)
40 files changed, 10762 insertions, 240 deletions
diff --git a/arch/arm/mach-at91/at91sam9g45_devices.c b/arch/arm/mach-at91/at91sam9g45_devices.c
index 827c9f2a70f..f0bf68268ca 100644
--- a/arch/arm/mach-at91/at91sam9g45_devices.c
+++ b/arch/arm/mach-at91/at91sam9g45_devices.c
@@ -18,7 +18,7 @@
 #include <linux/platform_device.h>
 #include <linux/i2c-gpio.h>
 #include <linux/atmel-mci.h>
-#include <linux/platform_data/atmel-aes.h>
+#include <linux/platform_data/crypto-atmel.h>
 
 #include <linux/platform_data/at91_adc.h>
 
@@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {}
  * -------------------------------------------------------------------- */
 
 #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE)
-static struct aes_platform_data aes_data;
+static struct crypto_platform_data aes_data;
+static struct crypto_dma_data alt_atslave;
 static u64 aes_dmamask = DMA_BIT_MASK(32);
 
 static struct resource aes_resources[] = {
@@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = {
 static void __init at91_add_device_aes(void)
 {
 	struct at_dma_slave	*atslave;
-	struct aes_dma_data	*alt_atslave;
-
-	alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL);
 
 	/* DMA TX slave channel configuration */
-	atslave = &alt_atslave->txdata;
+	atslave = &alt_atslave.txdata;
 	atslave->dma_dev = &at_hdmac_device.dev;
 	atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE	| ATC_SRC_H2SEL_HW |
 						ATC_SRC_PER(AT_DMA_ID_AES_RX);
 
 	/* DMA RX slave channel configuration */
-	atslave = &alt_atslave->rxdata;
+	atslave = &alt_atslave.rxdata;
 	atslave->dma_dev = &at_hdmac_device.dev;
 	atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE	| ATC_DST_H2SEL_HW |
 						ATC_DST_PER(AT_DMA_ID_AES_TX);
 
-	aes_data.dma_slave = alt_atslave;
+	aes_data.dma_slave = &alt_atslave;
 	platform_device_register(&at91sam9g45_aes_device);
 }
 #else
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 63947a8f9f0..a3a0ed80f17 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,10 @@
 # Arch-specific CryptoAPI modules.
 #
 
+avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+					$(comma)4)$(comma)%ymm2,yes,no)
+
 obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
 
@@ -12,22 +16,37 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
-obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
-obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+
+# These modules require assembler to support AVX.
+ifeq ($(avx_supported),yes)
+	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
+						camellia-aesni-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+endif
+
+# These modules require assembler to support AVX2.
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
+endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
@@ -36,21 +55,35 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
-camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
-			       camellia_aesni_avx_glue.o
-cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
-cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
-serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
+
+ifeq ($(avx_supported),yes)
+	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+					camellia_aesni_avx_glue.o
+	cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+	cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
+	twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
+				twofish_avx_glue.o
+	serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
+				serpent_avx_glue.o
+endif
+
+ifeq ($(avx2_supported),yes)
+	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
+endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
-crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
+crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
+sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 04b797767b9..62fe22cd4cb 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -34,6 +34,10 @@
 
 #ifdef __x86_64__
 .data
+.align 16
+.Lgf128mul_x_ble_mask:
+	.octa 0x00000000000000010000000000000087
+
 POLY:   .octa 0xC2000000000000000000000000000001
 TWOONE: .octa 0x00000001000000000000000000000001
 
@@ -105,6 +109,8 @@ enc:        .octa 0x2
 #define CTR	%xmm11
 #define INC	%xmm12
 
+#define GF128MUL_MASK %xmm10
+
 #ifdef __x86_64__
 #define AREG	%rax
 #define KEYP	%rdi
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
 .Lctr_enc_just_ret:
 	ret
 ENDPROC(aesni_ctr_enc)
+
+/*
+ * _aesni_gf128mul_x_ble:		internal ABI
+ *	Multiply in GF(2^128) for XTS IVs
+ * input:
+ *	IV:	current IV
+ *	GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ *	IV:	next IV
+ * changed:
+ *	CTR:	== temporary value
+ */
+#define _aesni_gf128mul_x_ble() \
+	pshufd $0x13, IV, CTR; \
+	paddq IV, IV; \
+	psrad $31, CTR; \
+	pand GF128MUL_MASK, CTR; \
+	pxor CTR, IV;
+
+/*
+ * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *			 bool enc, u8 *iv)
+ */
+ENTRY(aesni_xts_crypt8)
+	cmpb $0, %cl
+	movl $0, %ecx
+	movl $240, %r10d
+	leaq _aesni_enc4, %r11
+	leaq _aesni_dec4, %rax
+	cmovel %r10d, %ecx
+	cmoveq %rax, %r11
+
+	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+	movups (IVP), IV
+
+	mov 480(KEYP), KLEN
+	addq %rcx, KEYP
+
+	movdqa IV, STATE1
+	pxor 0x00(INP), STATE1
+	movdqu IV, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x10(INP), STATE2
+	movdqu IV, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x20(INP), STATE3
+	movdqu IV, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x30(INP), STATE4
+	movdqu IV, 0x30(OUTP)
+
+	call *%r11
+
+	pxor 0x00(OUTP), STATE1
+	movdqu STATE1, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE1
+	pxor 0x40(INP), STATE1
+	movdqu IV, 0x40(OUTP)
+
+	pxor 0x10(OUTP), STATE2
+	movdqu STATE2, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x50(INP), STATE2
+	movdqu IV, 0x50(OUTP)
+
+	pxor 0x20(OUTP), STATE3
+	movdqu STATE3, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x60(INP), STATE3
+	movdqu IV, 0x60(OUTP)
+
+	pxor 0x30(OUTP), STATE4
+	movdqu STATE4, 0x30(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x70(INP), STATE4
+	movdqu IV, 0x70(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movups IV, (IVP)
+
+	call *%r11
+
+	pxor 0x40(OUTP), STATE1
+	movdqu STATE1, 0x40(OUTP)
+
+	pxor 0x50(OUTP), STATE2
+	movdqu STATE2, 0x50(OUTP)
+
+	pxor 0x60(OUTP), STATE3
+	movdqu STATE3, 0x60(OUTP)
+
+	pxor 0x70(OUTP), STATE4
+	movdqu STATE4, 0x70(OUTP)
+
+	ret
+ENDPROC(aesni_xts_crypt8)
+
 #endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index a0795da22c0..f80e668785c 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -39,6 +39,9 @@
 #include <crypto/internal/aead.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
+#ifdef CONFIG_X86_64
+#include <asm/crypto/glue_helper.h>
+#endif
 
 #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
 #define HAS_PCBC
@@ -102,6 +105,9 @@ void crypto_fpu_exit(void);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
+				 const u8 *in, bool enc, u8 *iv);
+
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
  * u8 *out, Ciphertext output. Encrypt in-place is allowed.
@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
 	aesni_enc(ctx, out, in);
 }
 
+#ifdef CONFIG_X86_64
+
+static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
+}
+
+static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
+}
+
+static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
+}
+
+static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
+}
+
+static const struct common_glue_ctx aesni_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx aesni_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
+	} }
+};
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+#else
+
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
+#endif
+
 #ifdef CONFIG_X86_64
 static int rfc4106_init(struct crypto_tfm *tfm)
 {
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S
new file mode 100644
index 00000000000..784452e0d05
--- /dev/null
+++ b/arch/x86/crypto/blowfish-avx2-asm_64.S
@@ -0,0 +1,449 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Blowfish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "blowfish-avx2-asm_64.S"
+
+.data
+.align 32
+
+.Lprefetch_mask:
+.long 0*64
+.long 1*64
+.long 2*64
+.long 3*64
+.long 4*64
+.long 5*64
+.long 6*64
+.long 7*64
+
+.Lbswap32_mask:
+.long 0x00010203
+.long 0x04050607
+.long 0x08090a0b
+.long 0x0c0d0e0f
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask:
+	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.text
+/* structure of crypto context */
+#define p	0
+#define s0	((16 + 2) * 4)
+#define s1	((16 + 2 + (1 * 256)) * 4)
+#define s2	((16 + 2 + (2 * 256)) * 4)
+#define s3	((16 + 2 + (3 * 256)) * 4)
+
+/* register macros */
+#define CTX	%rdi
+#define RIO	 %rdx
+
+#define RS0	%rax
+#define RS1	%r8
+#define RS2	%r9
+#define RS3	%r10
+
+#define RLOOP	%r11
+#define RLOOPd	%r11d
+
+#define RXr0	%ymm8
+#define RXr1	%ymm9
+#define RXr2	%ymm10
+#define RXr3	%ymm11
+#define RXl0	%ymm12
+#define RXl1	%ymm13
+#define RXl2	%ymm14
+#define RXl3	%ymm15
+
+/* temp regs */
+#define RT0	%ymm0
+#define RT0x	%xmm0
+#define RT1	%ymm1
+#define RT1x	%xmm1
+#define RIDX0	%ymm2
+#define RIDX1	%ymm3
+#define RIDX1x	%xmm3
+#define RIDX2	%ymm4
+#define RIDX3	%ymm5
+
+/* vpgatherdd mask and '-1' */
+#define RNOT	%ymm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE	%ymm7
+
+/***********************************************************************
+ * 32-way AVX2 blowfish
+ ***********************************************************************/
+#define F(xl, xr) \
+	vpsrld $24, xl, RIDX0; \
+	vpsrld $16, xl, RIDX1; \
+	vpsrld $8, xl, RIDX2; \
+	vpand RBYTE, RIDX1, RIDX1; \
+	vpand RBYTE, RIDX2, RIDX2; \
+	vpand RBYTE, xl, RIDX3; \
+	\
+	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpcmpeqd RIDX0, RIDX0, RIDX0; \
+	\
+	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
+	vpcmpeqd RIDX1, RIDX1, RIDX1; \
+	vpaddd RT0, RT1, RT0; \
+	\
+	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
+	vpxor RT0, RT1, RT0; \
+	\
+	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpaddd RT0, RT1, RT0; \
+	\
+	vpxor RT0, xr, xr;
+
+#define add_roundkey(xl, nmem) \
+	vpbroadcastd nmem, RT0; \
+	vpxor RT0, xl ## 0, xl ## 0; \
+	vpxor RT0, xl ## 1, xl ## 1; \
+	vpxor RT0, xl ## 2, xl ## 2; \
+	vpxor RT0, xl ## 3, xl ## 3;
+
+#define round_enc() \
+	add_roundkey(RXr, p(CTX,RLOOP,4)); \
+	F(RXl0, RXr0); \
+	F(RXl1, RXr1); \
+	F(RXl2, RXr2); \
+	F(RXl3, RXr3); \
+	\
+	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
+	F(RXr0, RXl0); \
+	F(RXr1, RXl1); \
+	F(RXr2, RXl2); \
+	F(RXr3, RXl3);
+
+#define round_dec() \
+	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
+	F(RXl0, RXr0); \
+	F(RXl1, RXr1); \
+	F(RXl2, RXr2); \
+	F(RXl3, RXr3); \
+	\
+	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
+	F(RXr0, RXl0); \
+	F(RXr1, RXl1); \
+	F(RXr2, RXl2); \
+	F(RXr3, RXl3);
+
+#define init_round_constants() \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	leaq s0(CTX), RS0; \
+	leaq s1(CTX), RS1; \
+	leaq s2(CTX), RS2; \
+	leaq s3(CTX), RS3; \
+	vpsrld $24, RNOT, RBYTE;
+
+#define transpose_2x2(x0, x1, t0) \
+	vpunpckldq x0, x1, t0; \
+	vpunpckhdq x0, x1, x1; \
+	\
+	vpunpcklqdq t0, x1, x0; \
+	vpunpckhqdq t0, x1, x1;
+
+#define read_block(xl, xr) \
+	vbroadcasti128 .Lbswap32_mask, RT1; \
+	\
+	vpshufb RT1, xl ## 0, xl ## 0; \
+	vpshufb RT1, xr ## 0, xr ## 0; \
+	vpshufb RT1, xl ## 1, xl ## 1; \
+	vpshufb RT1, xr ## 1, xr ## 1; \
+	vpshufb RT1, xl ## 2, xl ## 2; \
+	vpshufb RT1, xr ## 2, xr ## 2; \
+	vpshufb RT1, xl ## 3, xl ## 3; \
+	vpshufb RT1, xr ## 3, xr ## 3; \
+	\
+	transpose_2x2(xl ## 0, xr ## 0, RT0); \
+	transpose_2x2(xl ## 1, xr ## 1, RT0); \
+	transpose_2x2(xl ## 2, xr ## 2, RT0); \
+	transpose_2x2(xl ## 3, xr ## 3, RT0);
+
+#define write_block(xl, xr) \
+	vbroadcasti128 .Lbswap32_mask, RT1; \
+	\
+	transpose_2x2(xl ## 0, xr ## 0, RT0); \
+	transpose_2x2(xl ## 1, xr ## 1, RT0); \
+	transpose_2x2(xl ## 2, xr ## 2, RT0); \
+	transpose_2x2(xl ## 3, xr ## 3, RT0); \
+	\
+	vpshufb RT1, xl ## 0, xl ## 0; \
+	vpshufb RT1, xr ## 0, xr ## 0; \
+	vpshufb RT1, xl ## 1, xl ## 1; \
+	vpshufb RT1, xr ## 1, xr ## 1; \
+	vpshufb RT1, xl ## 2, xl ## 2; \
+	vpshufb RT1, xr ## 2, xr ## 2; \
+	vpshufb RT1, xl ## 3, xl ## 3; \
+	vpshufb RT1, xr ## 3, xr ## 3;
+
+.align 8
+__blowfish_enc_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RXl0..4, RXr0..4: plaintext
+	 * output:
+	 *	RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
+	 */
+	init_round_constants();
+
+	read_block(RXl, RXr);
+
+	movl $1, RLOOPd;
+	add_roundkey(RXl, p+4*(0)(CTX));
+
+.align 4
+.L__enc_loop:
+	round_enc();
+
+	leal 2(RLOOPd), RLOOPd;
+	cmpl $17, RLOOPd;
+	jne .L__enc_loop;
+
+	add_roundkey(RXr, p+4*(17)(CTX));
+
+	write_block(RXl, RXr);
+
+	ret;
+ENDPROC(__blowfish_enc_blk32)
+
+.align 8
+__blowfish_dec_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RXl0..4, RXr0..4: ciphertext
+	 * output:
+	 *	RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
+	 */
+	init_round_constants();
+
+	read_block(RXl, RXr);
+
+	movl $14, RLOOPd;
+	add_roundkey(RXl, p+4*(17)(CTX));
+
+.align 4
+.L__dec_loop:
+	round_dec();
+
+	addl $-2, RLOOPd;
+	jns .L__dec_loop;
+
+	add_roundkey(RXr, p+4*(0)(CTX));
+
+	write_block(RXl, RXr);
+
+	ret;
+ENDPROC(__blowfish_dec_blk32)
+
+ENTRY(blowfish_ecb_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_enc_blk32;
+
+	vmovdqu RXr0, 0*32(%rsi);
+	vmovdqu RXl0, 1*32(%rsi);
+	vmovdqu RXr1, 2*32(%rsi);
+	vmovdqu RXl1, 3*32(%rsi);
+	vmovdqu RXr2, 4*32(%rsi);
+	vmovdqu RXl2, 5*32(%rsi);
+	vmovdqu RXr3, 6*32(%rsi);
+	vmovdqu RXl3, 7*32(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ecb_enc_32way)
+
+ENTRY(blowfish_ecb_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_dec_blk32;
+
+	vmovdqu RXr0, 0*32(%rsi);
+	vmovdqu RXl0, 1*32(%rsi);
+	vmovdqu RXr1, 2*32(%rsi);
+	vmovdqu RXl1, 3*32(%rsi);
+	vmovdqu RXr2, 4*32(%rsi);
+	vmovdqu RXl2, 5*32(%rsi);
+	vmovdqu RXr3, 6*32(%rsi);
+	vmovdqu RXl3, 7*32(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ecb_dec_32way)
+
+ENTRY(blowfish_cbc_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_dec_blk32;
+
+	/* xor with src */
+	vmovq (%rdx), RT0x;
+	vpshufd $0x4f, RT0x, RT0x;
+	vinserti128 $1, 8(%rdx), RT0, RT0;
+	vpxor RT0, RXr0, RXr0;
+	vpxor 0*32+24(%rdx), RXl0, RXl0;
+	vpxor 1*32+24(%rdx), RXr1, RXr1;
+	vpxor 2*32+24(%rdx), RXl1, RXl1;
+	vpxor 3*32+24(%rdx), RXr2, RXr2;
+	vpxor 4*32+24(%rdx), RXl2, RXl2;
+	vpxor 5*32+24(%rdx), RXr3, RXr3;
+	vpxor 6*32+24(%rdx), RXl3, RXl3;
+
+	vmovdqu RXr0, (0*32)(%rsi);
+	vmovdqu RXl0, (1*32)(%rsi);
+	vmovdqu RXr1, (2*32)(%rsi);
+	vmovdqu RXl1, (3*32)(%rsi);
+	vmovdqu RXr2, (4*32)(%rsi);
+	vmovdqu RXl2, (5*32)(%rsi);
+	vmovdqu RXr3, (6*32)(%rsi);
+	vmovdqu RXl3, (7*32)(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_cbc_dec_32way)
+
+ENTRY(blowfish_ctr_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+
+	vzeroupper;
+
+	vpcmpeqd RT0, RT0, RT0;
+	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
+
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-02 14:53:12 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-02 14:53:12 -0700
commit	797994f81a8b2bdca2eecffa415c1e7a89a4f961 (patch)
tree	1383dc469c26ad37fdf960f682d9a48c782935c5 /arch
parent	c8d8566952fda026966784a62f324c8352f77430 (diff)
parent	3862de1f6c442d53bd828d39f86d07d933a70605 (diff)