diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-02 14:53:12 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-02 14:53:12 -0700 |
commit | 797994f81a8b2bdca2eecffa415c1e7a89a4f961 (patch) | |
tree | 1383dc469c26ad37fdf960f682d9a48c782935c5 /arch | |
parent | c8d8566952fda026966784a62f324c8352f77430 (diff) | |
parent | 3862de1f6c442d53bd828d39f86d07d933a70605 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
- XTS mode optimisation for twofish/cast6/camellia/aes on x86
- AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia
- SSSE3/AVX/AVX2 optimisations for sha256/sha512
- Added driver for SAHARA2 crypto accelerator
- Fix for GMAC when used in non-IPsec secnarios
- Added generic CMAC implementation (including IPsec glue)
- IP update for crypto/atmel
- Support for more than one device in hwrng/timeriomem
- Added Broadcom BCM2835 RNG driver
- Misc fixes
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits)
crypto: caam - fix job ring cleanup code
crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher
crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher
crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher
crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
crypto: tcrypt - add async cipher speed tests for blowfish
crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2
crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86
crypto: aesni_intel - add more optimized XTS mode for x86-64
crypto: x86/camellia-aesni-avx - add more optimized XTS code
crypto: cast6-avx: use new optimized XTS code
crypto: x86/twofish-avx - use optimized XTS code
crypto: x86 - add more optimized XTS-mode for serpent-avx
xfrm: add rfc4494 AES-CMAC-96 support
crypto: add CMAC support to CryptoAPI
crypto: testmgr - add empty test vectors for null ciphers
crypto: testmgr - add AES GMAC test vectors
crypto: gcm - fix rfc4543 to handle async crypto correctly
crypto: gcm - make GMAC work when dst and src are different
hwrng: timeriomem - added devicetree hooks
...
Diffstat (limited to 'arch')
40 files changed, 10762 insertions, 240 deletions
diff --git a/arch/arm/mach-at91/at91sam9g45_devices.c b/arch/arm/mach-at91/at91sam9g45_devices.c index 827c9f2a70f..f0bf68268ca 100644 --- a/arch/arm/mach-at91/at91sam9g45_devices.c +++ b/arch/arm/mach-at91/at91sam9g45_devices.c @@ -18,7 +18,7 @@ #include <linux/platform_device.h> #include <linux/i2c-gpio.h> #include <linux/atmel-mci.h> -#include <linux/platform_data/atmel-aes.h> +#include <linux/platform_data/crypto-atmel.h> #include <linux/platform_data/at91_adc.h> @@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {} * -------------------------------------------------------------------- */ #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE) -static struct aes_platform_data aes_data; +static struct crypto_platform_data aes_data; +static struct crypto_dma_data alt_atslave; static u64 aes_dmamask = DMA_BIT_MASK(32); static struct resource aes_resources[] = { @@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = { static void __init at91_add_device_aes(void) { struct at_dma_slave *atslave; - struct aes_dma_data *alt_atslave; - - alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL); /* DMA TX slave channel configuration */ - atslave = &alt_atslave->txdata; + atslave = &alt_atslave.txdata; atslave->dma_dev = &at_hdmac_device.dev; atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW | ATC_SRC_PER(AT_DMA_ID_AES_RX); /* DMA RX slave channel configuration */ - atslave = &alt_atslave->rxdata; + atslave = &alt_atslave.rxdata; atslave->dma_dev = &at_hdmac_device.dev; atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW | ATC_DST_PER(AT_DMA_ID_AES_TX); - aes_data.dma_slave = alt_atslave; + aes_data.dma_slave = &alt_atslave; platform_device_register(&at91sam9g45_aes_device); } #else diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 63947a8f9f0..a3a0ed80f17 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,6 +2,10 @@ # Arch-specific CryptoAPI modules. # +avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) +avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ + $(comma)4)$(comma)%ymm2,yes,no) + obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -12,22 +16,37 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o -obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o -obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o -obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o -obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o +obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o +obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o + +# These modules require assembler to support AVX. +ifeq ($(avx_supported),yes) + obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ + camellia-aesni-avx-x86_64.o + obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o + obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o + obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o + obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o +endif + +# These modules require assembler to support AVX2. +ifeq ($(avx2_supported),yes) + obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o + obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o + obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o + obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o +endif aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o @@ -36,21 +55,35 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o -camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ - camellia_aesni_avx_glue.o -cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o -cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o -serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o + +ifeq ($(avx_supported),yes) + camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ + camellia_aesni_avx_glue.o + cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o + cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o + twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \ + twofish_avx_glue.o + serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \ + serpent_avx_glue.o +endif + +ifeq ($(avx2_supported),yes) + blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o + camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o + serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o + twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o +endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o crc32c-intel-y := crc32c-intel_glue.o -crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o +crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o +sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o +sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 04b797767b9..62fe22cd4cb 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -34,6 +34,10 @@ #ifdef __x86_64__ .data +.align 16 +.Lgf128mul_x_ble_mask: + .octa 0x00000000000000010000000000000087 + POLY: .octa 0xC2000000000000000000000000000001 TWOONE: .octa 0x00000001000000000000000000000001 @@ -105,6 +109,8 @@ enc: .octa 0x2 #define CTR %xmm11 #define INC %xmm12 +#define GF128MUL_MASK %xmm10 + #ifdef __x86_64__ #define AREG %rax #define KEYP %rdi @@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc) .Lctr_enc_just_ret: ret ENDPROC(aesni_ctr_enc) + +/* + * _aesni_gf128mul_x_ble: internal ABI + * Multiply in GF(2^128) for XTS IVs + * input: + * IV: current IV + * GF128MUL_MASK == mask with 0x87 and 0x01 + * output: + * IV: next IV + * changed: + * CTR: == temporary value + */ +#define _aesni_gf128mul_x_ble() \ + pshufd $0x13, IV, CTR; \ + paddq IV, IV; \ + psrad $31, CTR; \ + pand GF128MUL_MASK, CTR; \ + pxor CTR, IV; + +/* + * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * bool enc, u8 *iv) + */ +ENTRY(aesni_xts_crypt8) + cmpb $0, %cl + movl $0, %ecx + movl $240, %r10d + leaq _aesni_enc4, %r11 + leaq _aesni_dec4, %rax + cmovel %r10d, %ecx + cmoveq %rax, %r11 + + movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK + movups (IVP), IV + + mov 480(KEYP), KLEN + addq %rcx, KEYP + + movdqa IV, STATE1 + pxor 0x00(INP), STATE1 + movdqu IV, 0x00(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE2 + pxor 0x10(INP), STATE2 + movdqu IV, 0x10(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + pxor 0x20(INP), STATE3 + movdqu IV, 0x20(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE4 + pxor 0x30(INP), STATE4 + movdqu IV, 0x30(OUTP) + + call *%r11 + + pxor 0x00(OUTP), STATE1 + movdqu STATE1, 0x00(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE1 + pxor 0x40(INP), STATE1 + movdqu IV, 0x40(OUTP) + + pxor 0x10(OUTP), STATE2 + movdqu STATE2, 0x10(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE2 + pxor 0x50(INP), STATE2 + movdqu IV, 0x50(OUTP) + + pxor 0x20(OUTP), STATE3 + movdqu STATE3, 0x20(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + pxor 0x60(INP), STATE3 + movdqu IV, 0x60(OUTP) + + pxor 0x30(OUTP), STATE4 + movdqu STATE4, 0x30(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE4 + pxor 0x70(INP), STATE4 + movdqu IV, 0x70(OUTP) + + _aesni_gf128mul_x_ble() + movups IV, (IVP) + + call *%r11 + + pxor 0x40(OUTP), STATE1 + movdqu STATE1, 0x40(OUTP) + + pxor 0x50(OUTP), STATE2 + movdqu STATE2, 0x50(OUTP) + + pxor 0x60(OUTP), STATE3 + movdqu STATE3, 0x60(OUTP) + + pxor 0x70(OUTP), STATE4 + movdqu STATE4, 0x70(OUTP) + + ret +ENDPROC(aesni_xts_crypt8) + #endif diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index a0795da22c0..f80e668785c 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -39,6 +39,9 @@ #include <crypto/internal/aead.h> #include <linux/workqueue.h> #include <linux/spinlock.h> +#ifdef CONFIG_X86_64 +#include <asm/crypto/glue_helper.h> +#endif #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) #define HAS_PCBC @@ -102,6 +105,9 @@ void crypto_fpu_exit(void); asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, bool enc, u8 *iv); + /* asmlinkage void aesni_gcm_enc() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. * u8 *out, Ciphertext output. Encrypt in-place is allowed. @@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) aesni_enc(ctx, out, in); } +#ifdef CONFIG_X86_64 + +static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); +} + +static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec)); +} + +static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv); +} + +static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv); +} + +static const struct common_glue_ctx aesni_enc_xts = { + .num_funcs = 2, + .fpu_blocks_limit = 1, + + .funcs = { { + .num_blocks = 8, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) } + } } +}; + +static const struct common_glue_ctx aesni_dec_xts = { + .num_funcs = 2, + .fpu_blocks_limit = 1, + + .funcs = { { + .num_blocks = 8, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) } + } } +}; + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); +} + +#else + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } +#endif + #ifdef CONFIG_X86_64 static int rfc4106_init(struct crypto_tfm *tfm) { diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S new file mode 100644 index 00000000000..784452e0d05 --- /dev/null +++ b/arch/x86/crypto/blowfish-avx2-asm_64.S @@ -0,0 +1,449 @@ +/* + * x86_64/AVX2 assembler optimized version of Blowfish + * + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include <linux/linkage.h> + +.file "blowfish-avx2-asm_64.S" + +.data +.align 32 + +.Lprefetch_mask: +.long 0*64 +.long 1*64 +.long 2*64 +.long 3*64 +.long 4*64 +.long 5*64 +.long 6*64 +.long 7*64 + +.Lbswap32_mask: +.long 0x00010203 +.long 0x04050607 +.long 0x08090a0b +.long 0x0c0d0e0f + +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lbswap_iv_mask: + .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 + +.text +/* structure of crypto context */ +#define p 0 +#define s0 ((16 + 2) * 4) +#define s1 ((16 + 2 + (1 * 256)) * 4) +#define s2 ((16 + 2 + (2 * 256)) * 4) +#define s3 ((16 + 2 + (3 * 256)) * 4) + +/* register macros */ +#define CTX %rdi +#define RIO %rdx + +#define RS0 %rax +#define RS1 %r8 +#define RS2 %r9 +#define RS3 %r10 + +#define RLOOP %r11 +#define RLOOPd %r11d + +#define RXr0 %ymm8 +#define RXr1 %ymm9 +#define RXr2 %ymm10 +#define RXr3 %ymm11 +#define RXl0 %ymm12 +#define RXl1 %ymm13 +#define RXl2 %ymm14 +#define RXl3 %ymm15 + +/* temp regs */ +#define RT0 %ymm0 +#define RT0x %xmm0 +#define RT1 %ymm1 +#define RT1x %xmm1 +#define RIDX0 %ymm2 +#define RIDX1 %ymm3 +#define RIDX1x %xmm3 +#define RIDX2 %ymm4 +#define RIDX3 %ymm5 + +/* vpgatherdd mask and '-1' */ +#define RNOT %ymm6 + +/* byte mask, (-1 >> 24) */ +#define RBYTE %ymm7 + +/*********************************************************************** + * 32-way AVX2 blowfish + ***********************************************************************/ +#define F(xl, xr) \ + vpsrld $24, xl, RIDX0; \ + vpsrld $16, xl, RIDX1; \ + vpsrld $8, xl, RIDX2; \ + vpand RBYTE, RIDX1, RIDX1; \ + vpand RBYTE, RIDX2, RIDX2; \ + vpand RBYTE, xl, RIDX3; \ + \ + vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \ + vpcmpeqd RNOT, RNOT, RNOT; \ + vpcmpeqd RIDX0, RIDX0, RIDX0; \ + \ + vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \ + vpcmpeqd RIDX1, RIDX1, RIDX1; \ + vpaddd RT0, RT1, RT0; \ + \ + vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \ + vpxor RT0, RT1, RT0; \ + \ + vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \ + vpcmpeqd RNOT, RNOT, RNOT; \ + vpaddd RT0, RT1, RT0; \ + \ + vpxor RT0, xr, xr; + +#define add_roundkey(xl, nmem) \ + vpbroadcastd nmem, RT0; \ + vpxor RT0, xl ## 0, xl ## 0; \ + vpxor RT0, xl ## 1, xl ## 1; \ + vpxor RT0, xl ## 2, xl ## 2; \ + vpxor RT0, xl ## 3, xl ## 3; + +#define round_enc() \ + add_roundkey(RXr, p(CTX,RLOOP,4)); \ + F(RXl0, RXr0); \ + F(RXl1, RXr1); \ + F(RXl2, RXr2); \ + F(RXl3, RXr3); \ + \ + add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ + F(RXr0, RXl0); \ + F(RXr1, RXl1); \ + F(RXr2, RXl2); \ + F(RXr3, RXl3); + +#define round_dec() \ + add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \ + F(RXl0, RXr0); \ + F(RXl1, RXr1); \ + F(RXl2, RXr2); \ + F(RXl3, RXr3); \ + \ + add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ + F(RXr0, RXl0); \ + F(RXr1, RXl1); \ + F(RXr2, RXl2); \ + F(RXr3, RXl3); + +#define init_round_constants() \ + vpcmpeqd RNOT, RNOT, RNOT; \ + leaq s0(CTX), RS0; \ + leaq s1(CTX), RS1; \ + leaq s2(CTX), RS2; \ + leaq s3(CTX), RS3; \ + vpsrld $24, RNOT, RBYTE; + +#define transpose_2x2(x0, x1, t0) \ + vpunpckldq x0, x1, t0; \ + vpunpckhdq x0, x1, x1; \ + \ + vpunpcklqdq t0, x1, x0; \ + vpunpckhqdq t0, x1, x1; + +#define read_block(xl, xr) \ + vbroadcasti128 .Lbswap32_mask, RT1; \ + \ + vpshufb RT1, xl ## 0, xl ## 0; \ + vpshufb RT1, xr ## 0, xr ## 0; \ + vpshufb RT1, xl ## 1, xl ## 1; \ + vpshufb RT1, xr ## 1, xr ## 1; \ + vpshufb RT1, xl ## 2, xl ## 2; \ + vpshufb RT1, xr ## 2, xr ## 2; \ + vpshufb RT1, xl ## 3, xl ## 3; \ + vpshufb RT1, xr ## 3, xr ## 3; \ + \ + transpose_2x2(xl ## 0, xr ## 0, RT0); \ + transpose_2x2(xl ## 1, xr ## 1, RT0); \ + transpose_2x2(xl ## 2, xr ## 2, RT0); \ + transpose_2x2(xl ## 3, xr ## 3, RT0); + +#define write_block(xl, xr) \ + vbroadcasti128 .Lbswap32_mask, RT1; \ + \ + transpose_2x2(xl ## 0, xr ## 0, RT0); \ + transpose_2x2(xl ## 1, xr ## 1, RT0); \ + transpose_2x2(xl ## 2, xr ## 2, RT0); \ + transpose_2x2(xl ## 3, xr ## 3, RT0); \ + \ + vpshufb RT1, xl ## 0, xl ## 0; \ + vpshufb RT1, xr ## 0, xr ## 0; \ + vpshufb RT1, xl ## 1, xl ## 1; \ + vpshufb RT1, xr ## 1, xr ## 1; \ + vpshufb RT1, xl ## 2, xl ## 2; \ + vpshufb RT1, xr ## 2, xr ## 2; \ + vpshufb RT1, xl ## 3, xl ## 3; \ + vpshufb RT1, xr ## 3, xr ## 3; + +.align 8 +__blowfish_enc_blk32: + /* input: + * %rdi: ctx, CTX + * RXl0..4, RXr0..4: plaintext + * output: + * RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped) + */ + init_round_constants(); + + read_block(RXl, RXr); + + movl $1, RLOOPd; + add_roundkey(RXl, p+4*(0)(CTX)); + +.align 4 +.L__enc_loop: + round_enc(); + + leal 2(RLOOPd), RLOOPd; + cmpl $17, RLOOPd; + jne .L__enc_loop; + + add_roundkey(RXr, p+4*(17)(CTX)); + + write_block(RXl, RXr); + + ret; +ENDPROC(__blowfish_enc_blk32) + +.align 8 +__blowfish_dec_blk32: + /* input: + * %rdi: ctx, CTX + * RXl0..4, RXr0..4: ciphertext + * output: + * RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped) + */ + init_round_constants(); + + read_block(RXl, RXr); + + movl $14, RLOOPd; + add_roundkey(RXl, p+4*(17)(CTX)); + +.align 4 +.L__dec_loop: + round_dec(); + + addl $-2, RLOOPd; + jns .L__dec_loop; + + add_roundkey(RXr, p+4*(0)(CTX)); + + write_block(RXl, RXr); + + ret; +ENDPROC(__blowfish_dec_blk32) + +ENTRY(blowfish_ecb_enc_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + vzeroupper; + + vmovdqu 0*32(%rdx), RXl0; + vmovdqu 1*32(%rdx), RXr0; + vmovdqu 2*32(%rdx), RXl1; + vmovdqu 3*32(%rdx), RXr1; + vmovdqu 4*32(%rdx), RXl2; + vmovdqu 5*32(%rdx), RXr2; + vmovdqu 6*32(%rdx), RXl3; + vmovdqu 7*32(%rdx), RXr3; + + call __blowfish_enc_blk32; + + vmovdqu RXr0, 0*32(%rsi); + vmovdqu RXl0, 1*32(%rsi); + vmovdqu RXr1, 2*32(%rsi); + vmovdqu RXl1, 3*32(%rsi); + vmovdqu RXr2, 4*32(%rsi); + vmovdqu RXl2, 5*32(%rsi); + vmovdqu RXr3, 6*32(%rsi); + vmovdqu RXl3, 7*32(%rsi); + + vzeroupper; + + ret; +ENDPROC(blowfish_ecb_enc_32way) + +ENTRY(blowfish_ecb_dec_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + vzeroupper; + + vmovdqu 0*32(%rdx), RXl0; + vmovdqu 1*32(%rdx), RXr0; + vmovdqu 2*32(%rdx), RXl1; + vmovdqu 3*32(%rdx), RXr1; + vmovdqu 4*32(%rdx), RXl2; + vmovdqu 5*32(%rdx), RXr2; + vmovdqu 6*32(%rdx), RXl3; + vmovdqu 7*32(%rdx), RXr3; + + call __blowfish_dec_blk32; + + vmovdqu RXr0, 0*32(%rsi); + vmovdqu RXl0, 1*32(%rsi); + vmovdqu RXr1, 2*32(%rsi); + vmovdqu RXl1, 3*32(%rsi); + vmovdqu RXr2, 4*32(%rsi); + vmovdqu RXl2, 5*32(%rsi); + vmovdqu RXr3, 6*32(%rsi); + vmovdqu RXl3, 7*32(%rsi); + + vzeroupper; + + ret; +ENDPROC(blowfish_ecb_dec_32way) + +ENTRY(blowfish_cbc_dec_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + vzeroupper; + + vmovdqu 0*32(%rdx), RXl0; + vmovdqu 1*32(%rdx), RXr0; + vmovdqu 2*32(%rdx), RXl1; + vmovdqu 3*32(%rdx), RXr1; + vmovdqu 4*32(%rdx), RXl2; + vmovdqu 5*32(%rdx), RXr2; + vmovdqu 6*32(%rdx), RXl3; + vmovdqu 7*32(%rdx), RXr3; + + call __blowfish_dec_blk32; + + /* xor with src */ + vmovq (%rdx), RT0x; + vpshufd $0x4f, RT0x, RT0x; + vinserti128 $1, 8(%rdx), RT0, RT0; + vpxor RT0, RXr0, RXr0; + vpxor 0*32+24(%rdx), RXl0, RXl0; + vpxor 1*32+24(%rdx), RXr1, RXr1; + vpxor 2*32+24(%rdx), RXl1, RXl1; + vpxor 3*32+24(%rdx), RXr2, RXr2; + vpxor 4*32+24(%rdx), RXl2, RXl2; + vpxor 5*32+24(%rdx), RXr3, RXr3; + vpxor 6*32+24(%rdx), RXl3, RXl3; + + vmovdqu RXr0, (0*32)(%rsi); + vmovdqu RXl0, (1*32)(%rsi); + vmovdqu RXr1, (2*32)(%rsi); + vmovdqu RXl1, (3*32)(%rsi); + vmovdqu RXr2, (4*32)(%rsi); + vmovdqu RXl2, (5*32)(%rsi); + vmovdqu RXr3, (6*32)(%rsi); + vmovdqu RXl3, (7*32)(%rsi); + + vzeroupper; + + ret; +ENDPROC(blowfish_cbc_dec_32way) + +ENTRY(blowfish_ctr_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 64bit) + */ + + vzeroupper; + + vpcmpeqd RT0, RT0, RT0; + vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */ + |