diff options
Diffstat (limited to 'arch/x86/crypto')
55 files changed, 13739 insertions, 775 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index e0ca7c9ac38..61d6e281898 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,7 +2,10 @@ # Arch-specific CryptoAPI modules. # -obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o +avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) +avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ + $(comma)4)$(comma)%ymm2,yes,no) + obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o @@ -12,21 +15,36 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o -obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o -obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o -obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o -obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o +obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o +obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o +obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o +obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o + +# These modules require assembler to support AVX. +ifeq ($(avx_supported),yes) + obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ + camellia-aesni-avx-x86_64.o + obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o + obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o + obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o + obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o +endif + +# These modules require assembler to support AVX2. +ifeq ($(avx2_supported),yes) + obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o + obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o +endif aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o @@ -35,20 +53,38 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o -camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ - camellia_aesni_avx_glue.o -cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o -cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o -serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o + +ifeq ($(avx_supported),yes) + camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ + camellia_aesni_avx_glue.o + cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o + cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o + twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \ + twofish_avx_glue.o + serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \ + serpent_avx_glue.o +endif + +ifeq ($(avx2_supported),yes) + camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o + serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o +endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o +aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +ifeq ($(avx2_supported),yes) +sha1-ssse3-y += sha1_avx2_x86_64_asm.o +endif crc32c-intel-y := crc32c-intel_glue.o -crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o +crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o +crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o +sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o +sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o +crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c deleted file mode 100644 index 43282fe04a8..00000000000 --- a/arch/x86/crypto/ablk_helper.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Shared async block cipher helpers - * - * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> - * - * Based on aesni-intel_glue.c by: - * Copyright (C) 2008, Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 - * USA - * - */ - -#include <linux/kernel.h> -#include <linux/crypto.h> -#include <linux/init.h> -#include <linux/module.h> -#include <crypto/algapi.h> -#include <crypto/cryptd.h> -#include <asm/i387.h> -#include <asm/crypto/ablk_helper.h> - -int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int key_len) -{ - struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; - int err; - - crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - err = crypto_ablkcipher_setkey(child, key, key_len); - crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) - & CRYPTO_TFM_RES_MASK); - return err; -} -EXPORT_SYMBOL_GPL(ablk_set_key); - -int __ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->encrypt( - &desc, req->dst, req->src, req->nbytes); -} -EXPORT_SYMBOL_GPL(__ablk_encrypt); - -int ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_encrypt(cryptd_req); - } else { - return __ablk_encrypt(req); - } -} -EXPORT_SYMBOL_GPL(ablk_encrypt); - -int ablk_decrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_decrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->decrypt( - &desc, req->dst, req->src, req->nbytes); - } -} -EXPORT_SYMBOL_GPL(ablk_decrypt); - -void ablk_exit(struct crypto_tfm *tfm) -{ - struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ablkcipher(ctx->cryptd_tfm); -} -EXPORT_SYMBOL_GPL(ablk_exit); - -int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) -{ - struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); - - return 0; -} -EXPORT_SYMBOL_GPL(ablk_init_common); - -int ablk_init(struct crypto_tfm *tfm) -{ - char drv_name[CRYPTO_MAX_ALG_NAME]; - - snprintf(drv_name, sizeof(drv_name), "__driver-%s", - crypto_tfm_alg_driver_name(tfm)); - - return ablk_init_common(tfm, drv_name); -} -EXPORT_SYMBOL_GPL(ablk_init); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S index b949ec2f9af..2849dbc59e1 100644 --- a/arch/x86/crypto/aes-i586-asm_32.S +++ b/arch/x86/crypto/aes-i586-asm_32.S @@ -36,6 +36,7 @@ .file "aes-i586-asm.S" .text +#include <linux/linkage.h> #include <asm/asm-offsets.h> #define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) @@ -219,14 +220,10 @@ // AES (Rijndael) Encryption Subroutine /* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ -.global aes_enc_blk - .extern crypto_ft_tab .extern crypto_fl_tab -.align 4 - -aes_enc_blk: +ENTRY(aes_enc_blk) push %ebp mov ctx(%esp),%ebp @@ -290,18 +287,15 @@ aes_enc_blk: mov %r0,(%ebp) pop %ebp ret +ENDPROC(aes_enc_blk) // AES (Rijndael) Decryption Subroutine /* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ -.global aes_dec_blk - .extern crypto_it_tab .extern crypto_il_tab -.align 4 - -aes_dec_blk: +ENTRY(aes_dec_blk) push %ebp mov ctx(%esp),%ebp @@ -365,3 +359,4 @@ aes_dec_blk: mov %r0,(%ebp) pop %ebp ret +ENDPROC(aes_dec_blk) diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 5b577d5a059..91056554716 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -15,6 +15,7 @@ .text +#include <linux/linkage.h> #include <asm/asm-offsets.h> #define R1 %rax @@ -49,10 +50,8 @@ #define R11 %r11 #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ - .global FUNC; \ - .type FUNC,@function; \ - .align 8; \ -FUNC: movq r1,r2; \ + ENTRY(FUNC); \ + movq r1,r2; \ movq r3,r4; \ leaq KEY+48(r8),r9; \ movq r10,r11; \ @@ -71,14 +70,15 @@ FUNC: movq r1,r2; \ je B192; \ leaq 32(r9),r9; -#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \ +#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \ movq r1,r2; \ movq r3,r4; \ movl r5 ## E,(r9); \ movl r6 ## E,4(r9); \ movl r7 ## E,8(r9); \ movl r8 ## E,12(r9); \ - ret; + ret; \ + ENDPROC(FUNC); #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ movzbl r2 ## H,r5 ## E; \ @@ -133,7 +133,7 @@ FUNC: movq r1,r2; \ #define entry(FUNC,KEY,B128,B192) \ prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) -#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) +#define return(FUNC) epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11) #define encrypt_round(TAB,OFFSET) \ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ @@ -151,12 +151,12 @@ FUNC: movq r1,r2; \ /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ - entry(aes_enc_blk,0,enc128,enc192) + entry(aes_enc_blk,0,.Le128,.Le192) encrypt_round(crypto_ft_tab,-96) encrypt_round(crypto_ft_tab,-80) -enc192: encrypt_round(crypto_ft_tab,-64) +.Le192: encrypt_round(crypto_ft_tab,-64) encrypt_round(crypto_ft_tab,-48) -enc128: encrypt_round(crypto_ft_tab,-32) +.Le128: encrypt_round(crypto_ft_tab,-32) encrypt_round(crypto_ft_tab,-16) encrypt_round(crypto_ft_tab, 0) encrypt_round(crypto_ft_tab, 16) @@ -166,16 +166,16 @@ enc128: encrypt_round(crypto_ft_tab,-32) encrypt_round(crypto_ft_tab, 80) encrypt_round(crypto_ft_tab, 96) encrypt_final(crypto_fl_tab,112) - return + return(aes_enc_blk) /* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ - entry(aes_dec_blk,240,dec128,dec192) + entry(aes_dec_blk,240,.Ld128,.Ld192) decrypt_round(crypto_it_tab,-96) decrypt_round(crypto_it_tab,-80) -dec192: decrypt_round(crypto_it_tab,-64) +.Ld192: decrypt_round(crypto_it_tab,-64) decrypt_round(crypto_it_tab,-48) -dec128: decrypt_round(crypto_it_tab,-32) +.Ld128: decrypt_round(crypto_it_tab,-32) decrypt_round(crypto_it_tab,-16) decrypt_round(crypto_it_tab, 0) decrypt_round(crypto_it_tab, 16) @@ -185,4 +185,4 @@ dec128: decrypt_round(crypto_it_tab,-32) decrypt_round(crypto_it_tab, 80) decrypt_round(crypto_it_tab, 96) decrypt_final(crypto_il_tab,112) - return + return(aes_dec_blk) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 3470624d783..477e9d75149 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -34,6 +34,10 @@ #ifdef __x86_64__ .data +.align 16 +.Lgf128mul_x_ble_mask: + .octa 0x00000000000000010000000000000087 + POLY: .octa 0xC2000000000000000000000000000001 TWOONE: .octa 0x00000001000000000000000000000001 @@ -105,6 +109,8 @@ enc: .octa 0x2 #define CTR %xmm11 #define INC %xmm12 +#define GF128MUL_MASK %xmm10 + #ifdef __x86_64__ #define AREG %rax #define KEYP %rdi @@ -1262,7 +1268,6 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst * poly = x^128 + x^127 + x^126 + x^121 + 1 * *****************************************************************************/ - ENTRY(aesni_gcm_dec) push %r12 push %r13 @@ -1437,6 +1442,7 @@ _return_T_done_decrypt: pop %r13 pop %r12 ret +ENDPROC(aesni_gcm_dec) /***************************************************************************** @@ -1700,10 +1706,12 @@ _return_T_done_encrypt: pop %r13 pop %r12 ret +ENDPROC(aesni_gcm_enc) #endif +.align 4 _key_expansion_128: _key_expansion_256a: pshufd $0b11111111, %xmm1, %xmm1 @@ -1715,6 +1723,8 @@ _key_expansion_256a: movaps %xmm0, (TKEYP) add $0x10, TKEYP ret +ENDPROC(_key_expansion_128) +ENDPROC(_key_expansion_256a) .align 4 _key_expansion_192a: @@ -1739,6 +1749,7 @@ _key_expansion_192a: movaps %xmm1, 0x10(TKEYP) add $0x20, TKEYP ret +ENDPROC(_key_expansion_192a) .align 4 _key_expansion_192b: @@ -1758,6 +1769,7 @@ _key_expansion_192b: movaps %xmm0, (TKEYP) add $0x10, TKEYP ret +ENDPROC(_key_expansion_192b) .align 4 _key_expansion_256b: @@ -1770,6 +1782,7 @@ _key_expansion_256b: movaps %xmm2, (TKEYP) add $0x10, TKEYP ret +ENDPROC(_key_expansion_256b) /* * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -1882,6 +1895,7 @@ ENTRY(aesni_set_key) popl KEYP #endif ret +ENDPROC(aesni_set_key) /* * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) @@ -1903,6 +1917,7 @@ ENTRY(aesni_enc) popl KEYP #endif ret +ENDPROC(aesni_enc) /* * _aesni_enc1: internal ABI @@ -1960,6 +1975,7 @@ _aesni_enc1: movaps 0x70(TKEYP), KEY AESENCLAST KEY STATE ret +ENDPROC(_aesni_enc1) /* * _aesni_enc4: internal ABI @@ -2068,6 +2084,7 @@ _aesni_enc4: AESENCLAST KEY STATE3 AESENCLAST KEY STATE4 ret +ENDPROC(_aesni_enc4) /* * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) @@ -2090,6 +2107,7 @@ ENTRY(aesni_dec) popl KEYP #endif ret +ENDPROC(aesni_dec) /* * _aesni_dec1: internal ABI @@ -2147,6 +2165,7 @@ _aesni_dec1: movaps 0x70(TKEYP), KEY AESDECLAST KEY STATE ret +ENDPROC(_aesni_dec1) /* * _aesni_dec4: internal ABI @@ -2255,6 +2274,7 @@ _aesni_dec4: AESDECLAST KEY STATE3 AESDECLAST KEY STATE4 ret +ENDPROC(_aesni_dec4) /* * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2312,6 +2332,7 @@ ENTRY(aesni_ecb_enc) popl LEN #endif ret +ENDPROC(aesni_ecb_enc) /* * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2370,6 +2391,7 @@ ENTRY(aesni_ecb_dec) popl LEN #endif ret +ENDPROC(aesni_ecb_dec) /* * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2411,6 +2433,7 @@ ENTRY(aesni_cbc_enc) popl IVP #endif ret +ENDPROC(aesni_cbc_enc) /* * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2501,6 +2524,7 @@ ENTRY(aesni_cbc_dec) popl IVP #endif ret +ENDPROC(aesni_cbc_dec) #ifdef __x86_64__ .align 16 @@ -2527,6 +2551,7 @@ _aesni_inc_init: MOVQ_R64_XMM TCTR_LOW INC MOVQ_R64_XMM CTR TCTR_LOW ret +ENDPROC(_aesni_inc_init) /* * _aesni_inc: internal ABI @@ -2555,6 +2580,7 @@ _aesni_inc: movaps CTR, IV PSHUFB_XMM BSWAP_MASK IV ret +ENDPROC(_aesni_inc) /* * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2615,4 +2641,132 @@ ENTRY(aesni_ctr_enc) movups IV, (IVP) .Lctr_enc_just_ret: ret +ENDPROC(aesni_ctr_enc) + +/* + * _aesni_gf128mul_x_ble: internal ABI + * Multiply in GF(2^128) for XTS IVs + * input: + * IV: current IV + * GF128MUL_MASK == mask with 0x87 and 0x01 + * output: + * IV: next IV + * changed: + * CTR: == temporary value + */ +#define _aesni_gf128mul_x_ble() \ + pshufd $0x13, IV, CTR; \ + paddq IV, IV; \ + psrad $31, CTR; \ + pand GF128MUL_MASK, CTR; \ + pxor CTR, IV; + +/* + * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * bool enc, u8 *iv) + */ +ENTRY(aesni_xts_crypt8) + cmpb $0, %cl + movl $0, %ecx + movl $240, %r10d + leaq _aesni_enc4, %r11 + leaq _aesni_dec4, %rax + cmovel %r10d, %ecx + cmoveq %rax, %r11 + + movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK + movups (IVP), IV + + mov 480(KEYP), KLEN + addq %rcx, KEYP + + movdqa IV, STATE1 + movdqu 0x00(INP), INC + pxor INC, STATE1 + movdqu IV, 0x00(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE2 + movdqu 0x10(INP), INC + pxor INC, STATE2 + movdqu IV, 0x10(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + movdqu 0x20(INP), INC + pxor INC, STATE3 + movdqu IV, 0x20(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE4 + movdqu 0x30(INP), INC + pxor INC, STATE4 + movdqu IV, 0x30(OUTP) + + call *%r11 + + movdqu 0x00(OUTP), INC + pxor INC, STATE1 + movdqu STATE1, 0x00(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE1 + movdqu 0x40(INP), INC + pxor INC, STATE1 + movdqu IV, 0x40(OUTP) + + movdqu 0x10(OUTP), INC + pxor INC, STATE2 + movdqu STATE2, 0x10(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE2 + movdqu 0x50(INP), INC + pxor INC, STATE2 + movdqu IV, 0x50(OUTP) + + movdqu 0x20(OUTP), INC + pxor INC, STATE3 + movdqu STATE3, 0x20(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + movdqu 0x60(INP), INC + pxor INC, STATE3 + movdqu IV, 0x60(OUTP) + + movdqu 0x30(OUTP), INC + pxor INC, STATE4 + movdqu STATE4, 0x30(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE4 + movdqu 0x70(INP), INC + pxor INC, STATE4 + movdqu IV, 0x70(OUTP) + + _aesni_gf128mul_x_ble() + movups IV, (IVP) + + call *%r11 + + movdqu 0x40(OUTP), INC + pxor INC, STATE1 + movdqu STATE1, 0x40(OUTP) + + movdqu 0x50(OUTP), INC + pxor INC, STATE2 + movdqu STATE2, 0x50(OUTP) + + movdqu 0x60(OUTP), INC + pxor INC, STATE3 + movdqu STATE3, 0x60(OUTP) + + movdqu 0x70(OUTP), INC + pxor INC, STATE4 + movdqu STATE4, 0x70(OUTP) + + ret +ENDPROC(aesni_xts_crypt8) + #endif diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S new file mode 100644 index 00000000000..522ab68d1c8 --- /dev/null +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -0,0 +1,2811 @@ +######################################################################## +# Copyright (c) 2013, Intel Corporation +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR +# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +## +## Authors: +## Erdinc Ozturk <erdinc.ozturk@intel.com> +## Vinodh Gopal <vinodh.gopal@intel.com> +## James Guilford <james.guilford@intel.com> +## Tim Chen <tim.c.chen@linux.intel.com> +## +## References: +## This code was derived and highly optimized from the code described in paper: +## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation +## on Intel Architecture Processors. August, 2010 +## The details of the implementation is explained in: +## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode +## on Intel Architecture Processors. October, 2012. +## +## Assumptions: +## +## +## +## iv: +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | Salt (From the SA) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | Initialization Vector | +## | (This is the sequence number from IPSec header) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x1 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## +## +## AAD: +## AAD padded to 128 bits with 0 +## for example, assume AAD is a u32 vector +## +## if AAD is 8 bytes: +## AAD[3] = {A0, A1}# +## padded AAD in xmm register = {A1 A0 0 0} +## +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | SPI (A1) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 32-bit Sequence Number (A0) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x0 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## AAD Format with 32-bit Sequence Number +## +## if AAD is 12 bytes: +## AAD[3] = {A0, A1, A2}# +## padded AAD in xmm register = {A2 A1 A0 0} +## +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | SPI (A2) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 64-bit Extended Sequence Number {A1,A0} | +## | | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x0 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## AAD Format with 64-bit Extended Sequence Number +## +## +## aadLen: +## from the definition of the spec, aadLen can only be 8 or 12 bytes. +## The code additionally supports aadLen of length 16 bytes. +## +## TLen: +## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +## +## poly = x^128 + x^127 + x^126 + x^121 + 1 +## throughout the code, one tab and two tab indentations are used. one tab is +## for GHASH part, two tabs is for AES part. +## + +#include <linux/linkage.h> +#include <asm/inst.h> + +.data +.align 16 + +POLY: .octa 0xC2000000000000000000000000000001 +POLY2: .octa 0xC20000000000000000000001C2000000 +TWOONE: .octa 0x00000001000000000000000000000001 + +# order of these constants should not change. +# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F + +SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff +ZERO: .octa 0x00000000000000000000000000000000 +ONE: .octa 0x00000000000000000000000000000001 +ONEf: .octa 0x01000000000000000000000000000000 + +.text + + +##define the fields of the gcm aes context +#{ +# u8 expanded_keys[16*11] store expanded keys +# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here +# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here +# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here +# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here +# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here +# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here +# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here +# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here +# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) +#} gcm_ctx# + +HashKey = 16*11 # store HashKey <<1 mod poly here +HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here +HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here +HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here +HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here +HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here +HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here +HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here +HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) +HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx +#define arg4 %rcx +#define arg5 %r8 +#define arg6 %r9 +#define arg7 STACK_OFFSET+8*1(%r14) +#define arg8 STACK_OFFSET+8*2(%r14) +#define arg9 STACK_OFFSET+8*3(%r14) + +i = 0 +j = 0 + +out_order = 0 +in_order = 1 +DEC = 0 +ENC = 1 + +.macro define_reg r n +reg_\r = %xmm\n +.endm + +.macro setreg +.altmacro +define_reg i %i +define_reg j %j +.noaltmacro +.endm + +# need to push 4 registers into stack to maintain +STACK_OFFSET = 8*4 + +TMP1 = 16*0 # Temporary storage for AAD +TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) +TMP3 = 16*2 # Temporary storage for AES State 3 +TMP4 = 16*3 # Temporary storage for AES State 4 +TMP5 = 16*4 # Temporary storage for AES State 5 +TMP6 = 16*5 # Temporary storage for AES State 6 +TMP7 = 16*6 # Temporary storage for AES State 7 +TMP8 = 16*7 # Temporary storage for AES State 8 + +VARIABLE_OFFSET = 16*8 + +################################ +# Utility Macros +################################ + +# Encryption of a single block +.macro ENCRYPT_SINGLE_BLOCK XMM0 + vpxor (arg1), \XMM0, \XMM0 + i = 1 + setreg +.rep 9 + vaesenc 16*i(arg1), \XMM0, \XMM0 + i = (i+1) + setreg +.endr + vaesenclast 16*10(arg1), \XMM0, \XMM0 +.endm + +#ifdef CONFIG_AS_AVX +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. >>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 + + vpshufd $0b01001110, \GH, \T2 + vpshufd $0b01001110, \HK, \T3 + vpxor \GH , \T2, \T2 # T2 = (a1+a0) + vpxor \HK , \T3, \T3 # T3 = (b1+b0) + + vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 + vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 + vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) + vpxor \GH, \T2,\T2 + vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 + + vpslldq $8, \T2,\T3 # shift-L T3 2 DWs + vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs + vpxor \T3, \GH, \GH + vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK + + #first phase of the reduction + vpslld $31, \GH, \T2 # packed right shifting << 31 + vpslld $30, \GH, \T3 # packed right shifting shift << 30 + vpslld $25, \GH, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T5 # shift-R T5 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \GH, \GH # first phase of the reduction complete + + #second phase of the reduction + + vpsrld $1,\GH, \T2 # packed left shifting >> 1 + vpsrld $2,\GH, \T3 # packed left shifting >> 2 + vpsrld $7,\GH, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T5, \T2, \T2 + vpxor \T2, \GH, \GH + vpxor \T1, \GH, \GH # the result is in GH + + +.endm + +.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 + + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 + + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_2_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqa \T5, HashKey_3(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_3_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqa \T5, HashKey_4(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_4_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqa \T5, HashKey_5(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_5_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqa \T5, HashKey_6(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_6_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqa \T5, HashKey_7(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_7_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqa \T5, HashKey_8(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_8_k(arg1) + +.endm + +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg2, arg3, r14 are used as a pointer only, not modified + +.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC + i = (8-\num_initial_blocks) + setreg + + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_i, reg_i, reg_i +_get_AAD_loop\@: + vmovd (%r10), \T1 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i + + add $4, %r10 + sub $4, %r12 + jg _get_AAD_loop\@ + + + cmp $16, %r11 + je _get_AAD_loop2_done\@ + mov $16, %r12 + +_get_AAD_loop2\@: + vpsrldq $4, reg_i, reg_i + sub $4, %r12 + cmp %r11, %r12 + jg _get_AAD_loop2\@ + +_get_AAD_loop2_done\@: + + #byte-reflect the AAD data + vpshufb SHUF_MASK(%rip), reg_i, reg_i + + # initialize the data pointer offset as zero + xor %r11, %r11 + + # start AES for num_initial_blocks blocks + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), \CTR # CTR = Y0 + vpshufb SHUF_MASK(%rip), \CTR, \CTR + + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr + + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = 1 + setreg +.rep 9 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = (j+1) + setreg +.endr + + + vmovdqa 16*10(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg3, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr + + + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here + + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, \T3 + + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants + +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 + + i = 1 + setreg +.rep 9 # do 9 rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr + + + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 + + vmovdqu (arg3, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + vmovdqu \XMM1, (arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif + + vmovdqu 16*1(arg3, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif + + vmovdqu 16*2(arg3, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif + + vmovdqu 16*3(arg3, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif + + vmovdqu 16*4(arg3, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif + + vmovdqu 16*5(arg3, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif + + vmovdqu 16*6(arg3, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif + + vmovdqu 16*7(arg3, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif + + add $128, %r11 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + +############################################################################### + +_initial_blocks_done\@: + +.endm + +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg2, arg3 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC + + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) + +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif + + + ####################################################################### + + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 + + ####################################################################### + + + + + + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + ####################################################################### + + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + + vpshufd $0b01001110, \T2, \T6 + vpxor \T2, \T6, \T6 + + vmovdqa HashKey_8_k(arg1), \T5 + vpclmulqdq $0x00, \T5, \T6, \T6 + + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP2(%rsp), \T1 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_7_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + ####################################################################### + + vmovdqa TMP3(%rsp), \T1 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_6_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP4(%rsp), \T1 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_5_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + vmovdqa TMP5(%rsp), \T1 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_4_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP6(%rsp), \T1 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_3_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP7(%rsp), \T1 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_2_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + ####################################################################### + + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqa TMP8(%rsp), \T1 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vpxor \T4, \T6, \T6 + vpxor \T7, \T6, \T6 + + vmovdqu 16*10(arg1), \T5 + + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg3, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg3, %r11), reg_j + vmovdqu \T3, 16*i(arg2, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### + + + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 + + + + ####################################################################### + #first phase of the reduction + ####################################################################### + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + .endif + + ####################################################################### + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + ####################################################################### + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + + vpxor \T6, \XMM1, \XMM1 + + + +.endm + + +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + + ## Karatsuba Method + + + vpshufd $0b01001110, \XMM1, \T2 + vpxor \XMM1, \T2, \T2 + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 + + vmovdqa HashKey_8_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM2, \T2 + vpxor \XMM2, \T2, \T2 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_7_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM3, \T2 + vpxor \XMM3, \T2, \T2 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_6_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM4, \T2 + vpxor \XMM4, \T2, \T2 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_5_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM5, \T2 + vpxor \XMM5, \T2, \T2 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_4_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM6, \T2 + vpxor \XMM6, \T2, \T2 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_3_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM7, \T2 + vpxor \XMM7, \T2, \T2 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_2_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM8, \T2 + vpxor \XMM8, \T2, \T2 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 + + + + + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # <T6:T7> holds the result of + # the accumulated carry-less multiplications + + ####################################################################### + #first phase of the reduction + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + + + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + +.endm + + +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC_AVX ENC_DEC + + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + + vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + + mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) + + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ + + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ + + jmp _initial_num_blocks_is_1\@ + +_initial_num_blocks_is_7\@: + INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_6\@: + INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_5\@: + INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_4\@: + INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_3\@: + INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_2\@: + INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_1\@: + INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_0\@: + INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + + +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ + + sub $128, %r13 + je _eight_cipher_left\@ + + + + + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ + + + + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ + +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + + + +_eight_cipher_left\@: + GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + + +_zero_cipher_left\@: + cmp $16, arg4 + jl _only_less_than_16\@ + + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + sub $16, %r11 + add %r13, %r11 + vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes + jmp _final_ghash_mul\@ + +_only_less_than_16\@: + # check for 0 length + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + +_get_last_16_byte_loop\@: + movb (arg3, %r11), %al + movb %al, TMP1 (%rsp , %r11) + add $1, %r11 + cmp %r13, %r11 + jne _get_last_16_byte_loop\@ + + vmovdqu TMP1(%rsp), %xmm1 + + sub $16, %r11 + +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif + + + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ + + mov %rax, (arg2 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 + +_less_than_8_bytes_left\@: + movb %al, (arg2 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# + +_multiple_of_16_bytes\@: + mov arg7, %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 + + shl $3, arg4 # len(C) in bits (*128) + vmovq arg4, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + + vpxor %xmm15, %xmm14, %xmm14 + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), %xmm9 # xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + + vpxor %xmm14, %xmm9, %xmm9 + + + +_return_T\@: + mov arg8, %r10 # r10 = authTag + mov arg9, %r11 # r11 = auth_tag_len + + cmp $16, %r11 + je _T_16\@ + + cmp $12, %r11 + je _T_12\@ + +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + jmp _return_T_done\@ +_T_12\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + vpsrldq $8, %xmm9, %xmm9 + vmovd %xmm9, %eax + mov %eax, 8(%r10) + jmp _return_T_done\@ + +_T_16\@: + vmovdqu %xmm9, (%r10) + +_return_T_done\@: + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + + +############################################################# +#void aesni_gcm_precomp_avx_gen2 +# (gcm_data *my_ctx_data, +# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +############################################################# +ENTRY(aesni_gcm_precomp_avx_gen2) + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + vmovdqu (arg2), %xmm6 # xmm6 = HashKey + + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + ####################################################################### + vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + + + PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + ret +ENDPROC(aesni_gcm_precomp_avx_gen2) + +############################################################################### +#void aesni_gcm_enc_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +# const u8 *in, /* Plaintext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_enc_avx_gen2) + GCM_ENC_DEC_AVX ENC + ret +ENDPROC(aesni_gcm_enc_avx_gen2) + +############################################################################### +#void aesni_gcm_dec_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ +# const u8 *in, /* Ciphertext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_dec_avx_gen2) + GCM_ENC_DEC_AVX DEC + ret +ENDPROC(aesni_gcm_dec_avx_gen2) +#endif /* CONFIG_AS_AVX */ + +#ifdef CONFIG_AS_AVX2 +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. >>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 + + vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 + vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 + vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 + vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 + vpxor \T3, \GH, \GH + + + vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs + vpslldq $8 , \GH, \GH # shift-L GH 2 DWs + + vpxor \T3, \T1, \T1 + vpxor \T2, \GH, \GH + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \GH, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L T2 2 DWs + + vpxor \T2, \GH, \GH # first phase of the reduction complete + ####################################################################### + #second phase of the reduction + vpclmulqdq $0x00, \GH, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \GH, \T3, \GH + vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \GH, \GH # second phase of the reduction complete + ####################################################################### + vpxor \T1, \GH, \GH # the result is in GH + + +.endm + +.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 + + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqa \T5, HashKey_3(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqa \T5, HashKey_4(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqa \T5, HashKey_5(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqa \T5, HashKey_6(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqa \T5, HashKey_7(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqa \T5, HashKey_8(arg1) + +.endm + + +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg2, arg3, r14 are used as a pointer only, not modified + +.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER + i = (8-\num_initial_blocks) + setreg + + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_i, reg_i, reg_i +_get_AAD_loop\@: + vmovd (%r10), \T1 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i + + add $4, %r10 + sub $4, %r12 + jg _get_AAD_loop\@ + + + cmp $16, %r11 + je _get_AAD_loop2_done\@ + mov $16, %r12 + +_get_AAD_loop2\@: + vpsrldq $4, reg_i, reg_i + sub $4, %r12 + cmp %r11, %r12 + jg _get_AAD_loop2\@ + +_get_AAD_loop2_done\@: + + #byte-reflect the AAD data + vpshufb SHUF_MASK(%rip), reg_i, reg_i + + # initialize the data pointer offset as zero + xor %r11, %r11 + + # start AES for num_initial_blocks blocks + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), \CTR # CTR = Y0 + vpshufb SHUF_MASK(%rip), \CTR, \CTR + + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr + + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = 1 + setreg +.rep 9 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = (j+1) + setreg +.endr + + + vmovdqa 16*10(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg3, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for + # num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr + + + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg + GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here + + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, \T3 + + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants + +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 + + i = 1 + setreg +.rep 9 # do 9 rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr + + + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 + + vmovdqu (arg3, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + vmovdqu \XMM1, (arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif + + vmovdqu 16*1(arg3, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif + + vmovdqu 16*2(arg3, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif + + vmovdqu 16*3(arg3, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif + + vmovdqu 16*4(arg3, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif + + vmovdqu 16*5(arg3, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif + + vmovdqu 16*6(arg3, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif + + vmovdqu 16*7(arg3, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif + + add $128, %r11 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with + # the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + +############################################################################### + +_initial_blocks_done\@: + + +.endm + + + +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg2, arg3 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC + + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) + +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif + + + ####################################################################### + + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 + + ####################################################################### + + + + + + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + ####################################################################### + + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 + vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 + vpxor \T5, \T6, \T6 + + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP2(%rsp), \T1 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + ####################################################################### + + vmovdqa TMP3(%rsp), \T1 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP4(%rsp), \T1 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + vmovdqa TMP5(%rsp), \T1 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP6(%rsp), \T1 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP7(%rsp), \T1 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + + ####################################################################### + + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqa TMP8(%rsp), \T1 + vmovdqa HashKey(arg1), \T5 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T1 + + + vmovdqu 16*10(arg1), \T5 + + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg3, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg3, %r11), reg_j + vmovdqu \T3, 16*i(arg2, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### + + + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 + + + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs + + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + .endif + + ####################################################################### + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T1, \T1 # the result is in T1 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + + vpxor \T1, \XMM1, \XMM1 + + + +.endm + + +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + + ## Karatsuba Method + + vmovdqa HashKey_8(arg1), \T5 + + vpshufd $0b01001110, \XMM1, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM1, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 + + vpclmulqdq $0x00, \T3, \T2, \XMM1 + + ###################### + + vmovdqa HashKey_7(arg1), \T5 + vpshufd $0b01001110, \XMM2, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM2, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_6(arg1), \T5 + vpshufd $0b01001110, \XMM3, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM3, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_5(arg1), \T5 + vpshufd $0b01001110, \XMM4, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM4, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_4(arg1), \T5 + vpshufd $0b01001110, \XMM5, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM5, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_3(arg1), \T5 + vpshufd $0b01001110, \XMM6, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM6, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_2(arg1), \T5 + vpshufd $0b01001110, \XMM7, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM7, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey(arg1), \T5 + vpshufd $0b01001110, \XMM8, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM8, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 + + + + + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the + # accumulated carry-less multiplications + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs + + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + + + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T6, \T6 # the result is in T6 +.endm + + + +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC_AVX2 ENC_DEC + + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + + vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + + mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) + + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ + + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ + + jmp _initial_num_blocks_is_1\@ + +_initial_num_blocks_is_7\@: + INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_6\@: + INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_5\@: + INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_4\@: + INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_3\@: + INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_2\@: + INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_1\@: + INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_0\@: + INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + + +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ + + sub $128, %r13 + je _eight_cipher_left\@ + + + + + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ + + + + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ + +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + + + +_eight_cipher_left\@: + GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + + +_zero_cipher_left\@: + cmp $16, arg4 + jl _only_less_than_16\@ + + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + sub $16, %r11 + add %r13, %r11 + vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer + # to be able to shift 16-r13 bytes + # (r13 is the number of bytes in plaintext mod 16) + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes + jmp _final_ghash_mul\@ + +_only_less_than_16\@: + # check for 0 length + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + +_get_last_16_byte_loop\@: + movb (arg3, %r11), %al + movb %al, TMP1 (%rsp , %r11) + add $1, %r11 + cmp %r13, %r11 + jne _get_last_16_byte_loop\@ + + vmovdqu TMP1(%rsp), %xmm1 + + sub $16, %r11 + +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif + + + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ + + mov %rax, (arg2 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 + +_less_than_8_bytes_left\@: + movb %al, (arg2 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# + +_multiple_of_16_bytes\@: + mov arg7, %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 + + shl $3, arg4 # len(C) in bits (*128) + vmovq arg4, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + + vpxor %xmm15, %xmm14, %xmm14 + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), %xmm9 # xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + + vpxor %xmm14, %xmm9, %xmm9 + + + +_return_T\@: + mov arg8, %r10 # r10 = authTag + mov arg9, %r11 # r11 = auth_tag_len + + cmp $16, %r11 + je _T_16\@ + + cmp $12, %r11 + je _T_12\@ + +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + jmp _return_T_done\@ +_T_12\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + vpsrldq $8, %xmm9, %xmm9 + vmovd %xmm9, %eax + mov %eax, 8(%r10) + jmp _return_T_done\@ + +_T_16\@: + vmovdqu %xmm9, (%r10) + +_return_T_done\@: + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + + +############################################################# +#void aesni_gcm_precomp_avx_gen4 +# (gcm_data *my_ctx_data, +# u8 *hash_subkey)# /* H, the Hash sub key input. +# Data starts on a 16-byte boundary. */ +############################################################# +ENTRY(aesni_gcm_precomp_avx_gen4) + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + vmovdqu (arg2), %xmm6 # xmm6 = HashKey + + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + ####################################################################### + vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + + + PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + ret +ENDPROC(aesni_gcm_precomp_avx_gen4) + + +############################################################################### +#void aesni_gcm_enc_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +# const u8 *in, /* Plaintext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_enc_avx_gen4) + GCM_ENC_DEC_AVX2 ENC + ret +ENDPROC(aesni_gcm_enc_avx_gen4) + +############################################################################### +#void aesni_gcm_dec_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ +# const u8 *in, /* Ciphertext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_dec_avx_gen4) + GCM_ENC_DEC_AVX2 DEC + ret +ENDPROC(aesni_gcm_dec_avx_gen4) + +#endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1b9c22bea8a..948ad0e7774 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -34,14 +34,13 @@ #include <asm/cpu_device_id.h> #include <asm/i387.h> #include <asm/crypto/aes.h> -#include <asm/crypto/ablk_helper.h> +#include <crypto/ablk_helper.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> #include <linux/workqueue.h> #include <linux/spinlock.h> - -#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) -#define HAS_CTR +#ifdef CONFIG_X86_64 +#include <asm/crypto/glue_helper.h> #endif #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) @@ -102,10 +101,16 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, int crypto_fpu_init(void); void crypto_fpu_exit(void); +#define AVX_GEN2_OPTSIZE 640 +#define AVX_GEN4_OPTSIZE 4096 + #ifdef CONFIG_X86_64 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, bool enc, u8 *iv); + /* asmlinkage void aesni_gcm_enc() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. * u8 *out, Ciphertext output. Encrypt in-place is allowed. @@ -148,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); + +#ifdef CONFIG_AS_AVX +/* + * asmlinkage void aesni_gcm_precomp_avx_gen2() + * gcm_data *my_ctx_data, context data + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + */ +asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey); + +asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void aesni_gcm_enc_avx(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (plaintext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} + +static void aesni_gcm_dec_avx(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (ciphertext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} +#endif + +#ifdef CONFIG_AS_AVX2 +/* + * asmlinkage void aesni_gcm_precomp_avx_gen4() + * gcm_data *my_ctx_data, context data + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + */ +asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey); + +asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void aesni_gcm_enc_avx2(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (plaintext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else if (plaintext_len < AVX_GEN4_OPTSIZE) { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); + aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} + +static void aesni_gcm_dec_avx2(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (ciphertext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, + aad, aad_len, auth_tag, auth_tag_len); + } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); + aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} +#endif + +static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { @@ -395,12 +517,6 @@ static int ablk_ctr_init(struct crypto_tfm *tfm) return ablk_init_common(tfm, "__driver-ctr-aes-aesni"); } -#ifdef HAS_CTR -static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "rfc3686(__driver-ctr-aes-aesni)"); -} -#endif #endif #ifdef HAS_PCBC @@ -520,6 +636,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) aesni_enc(ctx, out, in); } +#ifdef CONFIG_X86_64 + +static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); +} + +static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec)); +} + +static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv); +} + +static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv); +} + +static const struct common_glue_ctx aesni_enc_xts = { + .num_funcs = 2, + .fpu_blocks_limit = 1, + + .funcs = { { + .num_blocks = 8, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) } + } } +}; + +static const struct common_glue_ctx aesni_dec_xts = { + .num_funcs = 2, + .fpu_blocks_limit = 1, + + .funcs = { { + .num_blocks = 8, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) } + } } +}; + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); +} + +#else + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -570,6 +758,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } +#endif + #ifdef CONFIG_X86_64 static int rfc4106_init(struct crypto_tfm *tfm) { @@ -845,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) dst = src; } - aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, + aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst + ((unsigned long)req->cryptlen), auth_tag_len); @@ -926,12 +1116,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) dst = src; } - aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, + aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, ctx->hash_subkey, assoc, (unsigned long)req->assoclen, authTag, auth_tag_len); /* Compare generated tag with passed in tag. */ - retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? + retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? -EBADMSG : 0; if (one_entry_in_sg) { @@ -1158,33 +1348,6 @@ static struct crypto_alg aesni_algs[] = { { .maxauthsize = 16, }, }, -#ifdef HAS_CTR -}, { - .cra_name = "rfc3686(ctr(aes))", - .cra_driver_name = "rfc3686-ctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_rfc3686_ctr_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + - CTR_RFC3686_NONCE_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + - CTR_RFC3686_NONCE_SIZE, - .ivsize = CTR_RFC3686_IV_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - .geniv = "seqiv", - }, - }, -#endif #endif #ifdef HAS_PCBC }, { @@ -1310,6 +1473,27 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; +#ifdef CONFIG_X86_64 +#ifdef CONFIG_AS_AVX2 + if (boot_cpu_has(X86_FEATURE_AVX2)) { + pr_info("AVX2 version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc_avx2; + aesni_gcm_dec_tfm = aesni_gcm_dec_avx2; + } else +#endif +#ifdef CONFIG_AS_AVX + if (boot_cpu_has(X86_FEATURE_AVX)) { + pr_info("AVX version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc_avx; + aesni_gcm_dec_tfm = aesni_gcm_dec_avx; + } else +#endif + { + pr_info("SSE version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc; + aesni_gcm_dec_tfm = aesni_gcm_dec; + } +#endif err = crypto_fpu_init(); if (err) diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 391d245dc08..246c67006ed 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -20,6 +20,8 @@ * */ +#include <linux/linkage.h> + .file "blowfish-x86_64-asm.S" .text @@ -116,11 +118,7 @@ bswapq RX0; \ xorq RX0, (RIO); -.align 8 -.global __blowfish_enc_blk -.type __blowfish_enc_blk,@function; - -__blowfish_enc_blk: +ENTRY(__blowfish_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -148,19 +146,16 @@ __blowfish_enc_blk: movq %r10, RIO; test %cl, %cl; - jnz __enc_xor; + jnz .L__enc_xor; write_block(); ret; -__enc_xor: +.L__enc_xor: xor_block(); ret; +ENDPROC(__blowfish_enc_blk) -.align 8 -.global blowfish_dec_blk -.type blowfish_dec_blk,@function; - -blowfish_dec_blk: +ENTRY(blowfish_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -189,6 +184,7 @@ blowfish_dec_blk: movq %r11, %rbp; ret; +ENDPROC(blowfish_dec_blk) /********************************************************************** 4-way blowfish, four blocks parallel @@ -300,11 +296,7 @@ blowfish_dec_blk: bswapq RX3; \ xorq RX3, 24(RIO); -.align 8 -.global __blowfish_enc_blk_4way -.type __blowfish_enc_blk_4way,@function; - -__blowfish_enc_blk_4way: +ENTRY(__blowfish_enc_blk_4way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -336,7 +328,7 @@ __blowfish_enc_blk_4way: movq %r11, RIO; test %bpl, %bpl; - jnz __enc_xor4; + jnz .L__enc_xor4; write_block4(); @@ -344,18 +336,15 @@ __blowfish_enc_blk_4way: popq %rbp; ret; -__enc_xor4: +.L__enc_xor4: xor_block4(); popq %rbx; popq %rbp; ret; +ENDPROC(__blowfish_enc_blk_4way) -.align 8 -.global blowfish_dec_blk_4way -.type blowfish_dec_blk_4way,@function; - -blowfish_dec_blk_4way: +ENTRY(blowfish_dec_blk_4way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -387,4 +376,4 @@ blowfish_dec_blk_4way: popq %rbp; ret; - +ENDPROC(blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 50ec333b70e..8af519ed73d 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -223,9 +223,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= 1; dst -= 1; } while (nbytes >= bsize * 4); - - if (nbytes < bsize) - goto done; } /* Handle leftovers */ diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 2306d2e4816..ce71f921240 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -1,7 +1,7 @@ /* * x86_64/AVX/AES-NI assembler implementation of Camellia * - * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,6 +15,8 @@ * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz */ +#include <linux/linkage.h> + #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct camellia_ctx: */ @@ -190,6 +192,7 @@ roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rcx, (%r9)); ret; +ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: @@ -197,6 +200,7 @@ roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, %rax, (%r9)); ret; +ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* * IN/OUT: @@ -585,6 +589,10 @@ roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +/* For XTS mode IV generation */ +.Lxts_gf128mul_and_shl1_mask: + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 + /* * pre-SubByte transform * @@ -709,8 +717,6 @@ roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: .text .align 8 -.type __camellia_enc_blk16,@function; - __camellia_enc_blk16: /* input: * %rdi: ctx, CTX @@ -793,10 +799,9 @@ __camellia_enc_blk16: %xmm15, %rax, %rcx, 24); jmp .Lenc_done; +ENDPROC(__camellia_enc_blk16) .align 8 -.type __camellia_dec_blk16,@function; - __camellia_dec_blk16: /* input: * %rdi: ctx, CTX @@ -877,12 +882,9 @@ __camellia_dec_blk16: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; +ENDPROC(__camellia_dec_blk16) -.align 8 -.global camellia_ecb_enc_16way -.type camellia_ecb_enc_16way,@function; - -camellia_ecb_enc_16way: +ENTRY(camellia_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -903,12 +905,9 @@ camellia_ecb_enc_16way: %xmm8, %rsi); ret; +ENDPROC(camellia_ecb_enc_16way) -.align 8 -.global camellia_ecb_dec_16way -.type camellia_ecb_dec_16way,@function; - -camellia_ecb_dec_16way: +ENTRY(camellia_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -934,12 +933,9 @@ camellia_ecb_dec_16way: %xmm8, %rsi); ret; +ENDPROC(camellia_ecb_dec_16way) -.align 8 -.global camellia_cbc_dec_16way -.type camellia_cbc_dec_16way,@function; - -camellia_cbc_dec_16way: +ENTRY(camellia_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -986,6 +982,7 @@ camellia_cbc_dec_16way: %xmm8, %rsi); ret; +ENDPROC(camellia_cbc_dec_16way) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ @@ -993,11 +990,7 @@ camellia_cbc_dec_16way: vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -.align 8 -.global camellia_ctr_16way -.type camellia_ctr_16way,@function; - -camellia_ctr_16way: +ENTRY(camellia_ctr_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -1100,3 +1093,178 @@ camellia_ctr_16way: %xmm8, %rsi); ret; +ENDPROC(camellia_ctr_16way) + +#define gf128mul_x_ble(iv, mask, tmp) \ + vpsrad $31, iv, tmp; \ + vpaddq iv, iv, iv; \ + vpshufd $0x13, tmp, tmp; \ + vpand mask, tmp, tmp; \ + vpxor tmp, iv, iv; + +.align 8 +camellia_xts_crypt_16way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + * %r8: index for input whitening key + * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 + */ + + subq $(16 * 16), %rsp; + movq %rsp, %rax; + + vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14; + + /* load IV */ + vmovdqu (%rcx), %xmm0; + vpxor 0 * 16(%rdx), %xmm0, %xmm15; + vmovdqu %xmm15, 15 * 16(%rax); + vmovdqu %xmm0, 0 * 16(%rsi); + + /* construct IVs */ + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 1 * 16(%rdx), %xmm0, %xmm15; + vmovdqu %xmm15, 14 * 16(%rax); + vmovdqu %xmm0, 1 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 2 * 16(%rdx), %xmm0, %xmm13; + vmovdqu %xmm0, 2 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 3 * 16(%rdx), %xmm0, %xmm12; + vmovdqu %xmm0, 3 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 4 * 16(%rdx), %xmm0, %xmm11; + vmovdqu %xmm0, 4 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 5 * 16(%rdx), %xmm0, %xmm10; + vmovdqu %xmm0, 5 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 6 * 16(%rdx), %xmm0, %xmm9; + vmovdqu %xmm0, 6 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 7 * 16(%rdx), %xmm0, %xmm8; + vmovdqu %xmm0, 7 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 8 * 16(%rdx), %xmm0, %xmm7; + vmovdqu %xmm0, 8 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 9 * 16(%rdx), %xmm0, %xmm6; + vmovdqu %xmm0, 9 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 10 * 16(%rdx), %xmm0, %xmm5; + vmovdqu %xmm0, 10 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 11 * 16(%rdx), %xmm0, %xmm4; + vmovdqu %xmm0, 11 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 12 * 16(%rdx), %xmm0, %xmm3; + vmovdqu %xmm0, 12 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 13 * 16(%rdx), %xmm0, %xmm2; + vmovdqu %xmm0, 13 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 14 * 16(%rdx), %xmm0, %xmm1; + vmovdqu %xmm0, 14 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vpxor 15 * 16(%rdx), %xmm0, %xmm15; + vmovdqu %xmm15, 0 * 16(%rax); + vmovdqu %xmm0, 15 * 16(%rsi); + + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); + vmovdqu %xmm0, (%rcx); + + /* inpack16_pre: */ + vmovq (key_table)(CTX, %r8, 8), %xmm15; + vpshufb .Lpack_bswap, %xmm15, %xmm15; + vpxor 0 * 16(%rax), %xmm15, %xmm0; + vpxor %xmm1, %xmm15, %xmm1; + vpxor %xmm2, %xmm15, %xmm2; + vpxor %xmm3, %xmm15, %xmm3; + vpxor %xmm4, %xmm15, %xmm4; + vpxor %xmm5, %xmm15, %xmm5; + vpxor %xmm6, %xmm15, %xmm6; + vpxor %xmm7, %xmm15, %xmm7; + vpxor %xmm8, %xmm15, %xmm8; + vpxor %xmm9, %xmm15, %xmm9; + vpxor %xmm10, %xmm15, %xmm10; + vpxor %xmm11, %xmm15, %xmm11; + vpxor %xmm12, %xmm15, %xmm12; + vpxor %xmm13, %xmm15, %xmm13; + vpxor 14 * 16(%rax), %xmm15, %xmm14; + vpxor 15 * 16(%rax), %xmm15, %xmm15; + + call *%r9; + + addq $(16 * 16), %rsp; + + vpxor 0 * 16(%rsi), %xmm7, %xmm7; + vpxor 1 * 16(%rsi), %xmm6, %xmm6; + vpxor 2 * 16(%rsi), %xmm5, %xmm5; + vpxor 3 * 16(%rsi), %xmm4, %xmm4; + vpxor 4 * 16(%rsi), %xmm3, %xmm3; + vpxor 5 * 16(%rsi), %xmm2, %xmm2; + vpxor 6 * 16(%rsi), %xmm1, %xmm1; + vpxor 7 * 16(%rsi), %xmm0, %xmm0; + vpxor 8 * 16(%rsi), %xmm15, %xmm15; + vpxor 9 * 16(%rsi), %xmm14, %xmm14; + vpxor 10 * 16(%rsi), %xmm13, %xmm13; + vpxor 11 * 16(%rsi), %xmm12, %xmm12; + vpxor 12 * 16(%rsi), %xmm11, %xmm11; + vpxor 13 * 16(%rsi), %xmm10, %xmm10; + vpxor 14 * 16(%rsi), %xmm9, %xmm9; + vpxor 15 * 16(%rsi), %xmm8, %xmm8; + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + ret; +ENDPROC(camellia_xts_crypt_16way) + +ENTRY(camellia_xts_enc_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + xorl %r8d, %r8d; /* input whitening key, 0 for enc */ + + leaq __camellia_enc_blk16, %r9; + + jmp camellia_xts_crypt_16way; +ENDPROC(camellia_xts_enc_16way) + +ENTRY(camellia_xts_dec_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + cmpl $16, key_length(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* input whitening key, last for dec */ + + leaq __camellia_dec_blk16, %r9; + + jmp camellia_xts_crypt_16way; +ENDPROC(camellia_xts_dec_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S new file mode 100644 index 00000000000..0e0b8863a34 --- /dev/null +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -0,0 +1,1386 @@ +/* + * x86_64/AVX2/AES-NI assembler implementation of Camellia + * + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include <linux/linkage.h> + +#define CAMELLIA_TABLE_BYTE_LEN 272 + +/* struct camellia_ctx: */ +#define key_table 0 +#define key_length CAMELLIA_TABLE_BYTE_LEN + +/* register macros */ +#define CTX %rdi +#define RIO %r8 + +/********************************************************************** + helper macros + **********************************************************************/ +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +#define ymm0_x xmm0 +#define ymm1_x xmm1 +#define ymm2_x xmm2 +#define ymm3_x xmm3 +#define ymm4_x xmm4 +#define ymm5_x xmm5 +#define ymm6_x xmm6 +#define ymm7_x xmm7 +#define ymm8_x xmm8 +#define ymm9_x xmm9 +#define ymm10_x xmm10 +#define ymm11_x xmm11 +#define ymm12_x xmm12 +#define ymm13_x xmm13 +#define ymm14_x xmm14 +#define ymm15_x xmm15 + +/********************************************************************** + 32-way camellia + **********************************************************************/ + +/* + * IN: + * x0..x7: byte-sliced AB state + * mem_cd: register pointer storing CD state + * key: index for key material + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ + t7, mem_cd, key) \ + /* \ + * S-function with AES subbytes \ + */ \ + vbroadcasti128 .Linv_shift_row, t4; \ + vpbroadcastd .L0f0f0f0f, t7; \ + vbroadcasti128 .Lpre_tf_lo_s1, t5; \ + vbroadcasti128 .Lpre_tf_hi_s1, t6; \ + vbroadcasti128 .Lpre_tf_lo_s4, t2; \ + vbroadcasti128 .Lpre_tf_hi_s4, t3; \ + \ + /* AES inverse shift rows */ \ + vpshufb t4, x0, x0; \ + vpshufb t4, x7, x7; \ + vpshufb t4, x3, x3; \ + vpshufb t4, x6, x6; \ + vpshufb t4, x2, x2; \ + vpshufb t4, x5, x5; \ + vpshufb t4, x1, x1; \ + vpshufb t4, x4, x4; \ + \ + /* prefilter sboxes 1, 2 and 3 */ \ + /* prefilter sbox 4 */ \ + filter_8bit(x0, t5, t6, t7, t4); \ + filter_8bit(x7, t5, t6, t7, t4); \ + vextracti128 $1, x0, t0##_x; \ + vextracti128 $1, x7, t1##_x; \ + filter_8bit(x3, t2, t3, t7, t4); \ + filter_8bit(x6, t2, t3, t7, t4); \ + vextracti128 $1, x3, t3##_x; \ + vextracti128 $1, x6, t2##_x; \ + filter_8bit(x2, t5, t6, t7, t4); \ + filter_8bit(x5, t5, t6, t7, t4); \ + filter_8bit(x1, t5, t6, t7, t4); \ + filter_8bit(x4, t5, t6, t7, t4); \ + \ + vpxor t4##_x, t4##_x, t4##_x; \ + \ + /* AES subbytes + AES shift rows */ \ + vextracti128 $1, x2, t6##_x; \ + vextracti128 $1, x5, t5##_x; \ + vaesenclast t4##_x, x0##_x, x0##_x; \ + vaesenclast t4##_x, t0##_x, t0##_x; \ + vinserti128 $1, t0##_x, x0, x0; \ + vaesenclast t4##_x, x7##_x, x7##_x; \ + vaesenclast t4##_x, t1##_x, t1##_x; \ + vinserti128 $1, t1##_x, x7, x7; \ + vaesenclast t4##_x, x3##_x, x3##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vinserti128 $1, t3##_x, x3, x3; \ + vaesenclast t4##_x, x6##_x, x6##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t2##_x, x6, x6; \ + vextracti128 $1, x1, t3##_x; \ + vextracti128 $1, x4, t2##_x; \ + vbroadcasti128 .Lpost_tf_lo_s1, t0; \ + vbroadcasti128 .Lpost_tf_hi_s1, t1; \ + vaesenclast t4##_x, x2##_x, x2##_x; \ + vaesenclast t4##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x2, x2; \ + vaesenclast t4##_x, x5##_x, x5##_x; \ + vaesenclast t4##_x, t5##_x, t5##_x; \ + vinserti128 $1, t5##_x, x5, x5; \ + vaesenclast t4##_x, x1##_x, x1##_x; \ + vaesenclast t4##_x, t3##_x, t3##_x; \ + vinserti128 $1, t3##_x, x1, x1; \ + vaesenclast t4##_x, x4##_x, x4##_x; \ + vaesenclast t4##_x, t2##_x, t2##_x; \ + vinserti128 $1, t2##_x, x4, x4; \ + \ + /* postfilter sboxes 1 and 4 */ \ + vbroadcasti128 .Lpost_tf_lo_s3, t2; \ + vbroadcasti128 .Lpost_tf_hi_s3, t3; \ + filter_8bit(x0, t0, t1, t7, t6); \ + filter_8bit(x7, t0, t1, t7, t6); \ + filter_8bit(x3, t0, t1, t7, t6); \ + filter_8bit(x6, t0, t1, t7, t6); \ + \ + /* postfilter sbox 3 */ \ + vbroadcasti128 .Lpost_tf_lo_s2, t4; \ + vbroadcasti128 .Lpost_tf_hi_s2, t5; \ + filter_8bit(x2, t2, t3, t7, t6); \ + filter_8bit(x5, t2, t3, t7, t6); \ + \ + vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ + \ + /* postfilter sbox 2 */ \ + filter_8bit(x1, t4, t5, t7, t2); \ + filter_8bit(x4, t4, t5, t7, t2); \ + vpxor t7, t7, t7; \ + \ + vpsrldq $1, t0, t1; \ + vpsrldq $2, t0, t2; \ + vpshufb t7, t1, t1; \ + vpsrldq $3, t0, t3; \ + \ + /* P-function */ \ + vpxor x5, x0, x0; \ + vpxor x6, x1, x1; \ + vpxor x7, x2, x2; \ + vpxor x4, x3, x3; \ + \ + vpshufb t7, t2, t2; \ + vpsrldq $4, t0, t4; \ + vpshufb t7, t3, t3; \ + vpsrldq $5, t0, t5; \ + vpshufb t7, t4, t4; \ + \ + vpxor x2, x4, x4; \ + vpxor x3, x5, x5; \ + vpxor x0, x6, x6; \ + vpxor x1, x7, x7; \ + \ + vpsrldq $6, t0, t6; \ + vpshufb t7, t5, t5; \ + vpshufb t7, t6, t6; \ + \ + vpxor x7, x0, x0; \ + vpxor x4, x1, x1; \ + vpxor x5, x2, x2; \ + vpxor x6, x3, x3; \ + \ + vpxor x3, x4, x4; \ + vpxor x0, x5, x5; \ + vpxor x1, x6, x6; \ + vpxor x2, x7, x7; /* note: high and low parts swapped */ \ + \ + /* Add key material and result to CD (x becomes new CD) */ \ + \ + vpxor t6, x1, x1; \ + vpxor 5 * 32(mem_cd), x1, x1; \ + \ + vpsrldq $7, t0, t6; \ + vpshufb t7, t0, t0; \ + vpshufb t7, t6, t7; \ + \ + vpxor t7, x0, x0; \ + vpxor 4 * 32(mem_cd), x0, x0; \ + \ + vpxor t5, x2, x2; \ + vpxor 6 * 32(mem_cd), x2, x2; \ + \ + vpxor t4, x3, x3; \ + vpxor 7 * 32(mem_cd), x3, x3; \ + \ + vpxor t3, x4, x4; \ + vpxor 0 * 32(mem_cd), x4, x4; \ + \ + vpxor t2, x5, x5; \ + vpxor 1 * 32(mem_cd), x5, x5; \ + \ + vpxor t1, x6, x6; \ + vpxor 2 * 32(mem_cd), x6, x6; \ + \ + vpxor t0, x7, x7; \ + vpxor 3 * 32(mem_cd), x7, x7; + +/* + * Size optimization... with inlined roundsm32 binary would be over 5 times + * larger and would only marginally faster. + */ +.align 8 +roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: + roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, + %rcx, (%r9)); + ret; +ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) + +.align 8 +roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: + roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, + %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, + %rax, (%r9)); + ret; +ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) + +/* + * IN/OUT: + * x0..x7: byte-sliced AB state preloaded + * mem_ab: byte-sliced AB state in memory + * mem_cb: byte-sliced CD state in memory + */ +#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ + leaq (key_table + (i) * 8)(CTX), %r9; \ + call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ + \ + vmovdqu x0, 4 * 32(mem_cd); \ + vmovdqu x1, 5 * 32(mem_cd); \ + vmovdqu x2, 6 * 32(mem_cd); \ + vmovdqu x3, 7 * 32(mem_cd); \ + vmovdqu x4, 0 * 32(mem_cd); \ + vmovdqu x5, 1 * 32(mem_cd); \ + vmovdqu x6, 2 * 32(mem_cd); \ + vmovdqu x7, 3 * 32(mem_cd); \ + \ + leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ + call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ + \ + store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); + +#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ + +#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ + /* Store new AB state */ \ + vmovdqu x4, 4 * 32(mem_ab); \ + vmovdqu x5, 5 * 32(mem_ab); \ + vmovdqu x6, 6 * 32(mem_ab); \ + vmovdqu x7, 7 * 32(mem_ab); \ + vmovdqu x0, 0 * 32(mem_ab); \ + vmovdqu x1, 1 * 32(mem_ab); \ + vmovdqu x2, 2 * 32(mem_ab); \ + vmovdqu x3, 3 * 32(mem_ab); + +#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); + +#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); + +/* + * IN: + * v0..3: byte-sliced 32-bit integers + * OUT: + * v0..3: (IN <<< 1) + */ +#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ + vpcmpgtb v0, zero, t0; \ + vpaddb v0, v0, v0; \ + vpabsb t0, t0; \ + \ + vpcmpgtb v1, zero, t1; \ + vpaddb v1, v1, v1; \ + vpabsb t1, t1; \ + \ + vpcmpgtb v2, zero, t2; \ + vpaddb v2, v2, v2; \ + vpabsb t2, t2; \ + \ + vpor t0, v1, v1; \ + \ + vpcmpgtb v3, zero, t0; \ + vpaddb v3, v3, v3; \ + vpabsb t0, t0; \ + \ + vpor t1, v2, v2; \ + vpor t2, v3, v3; \ + vpor t0, v0, v0; + +/* + * IN: + * r: byte-sliced AB state in memory + * l: byte-sliced CD state in memory + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ + tt1, tt2, tt3, kll, klr, krl, krr) \ + /* \ + * t0 = kll; \ + * t0 &= ll; \ + * lr ^= rol32(t0, 1); \ + */ \ + vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ + vpxor tt0, tt0, tt0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpand l0, t0, t0; \ + vpand l1, t1, t1; \ + vpand l2, t2, t2; \ + vpand l3, t3, t3; \ + \ + rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor l4, t0, l4; \ + vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ + vmovdqu l4, 4 * 32(l); \ + vpxor l5, t1, l5; \ + vmovdqu l5, 5 * 32(l); \ + vpxor l6, t2, l6; \ + vmovdqu l6, 6 * 32(l); \ + vpxor l7, t3, l7; \ + vmovdqu l7, 7 * 32(l); \ + \ + /* \ + * t2 = krr; \ + * t2 |= rr; \ + * rl ^= t2; \ + */ \ + \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpor 4 * 32(r), t0, t0; \ + vpor 5 * 32(r), t1, t1; \ + vpor 6 * 32(r), t2, t2; \ + vpor 7 * 32(r), t3, t3; \ + \ + vpxor 0 * 32(r), t0, t0; \ + vpxor 1 * 32(r), t1, t1; \ + vpxor 2 * 32(r), t2, t2; \ + vpxor 3 * 32(r), t3, t3; \ + vmovdqu t0, 0 * 32(r); \ + vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ + vmovdqu t1, 1 * 32(r); \ + vmovdqu t2, 2 * 32(r); \ + vmovdqu t3, 3 * 32(r); \ + \ + /* \ + * t2 = krl; \ + * t2 &= rl; \ + * rr ^= rol32(t2, 1); \ + */ \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpand 0 * 32(r), t0, t0; \ + vpand 1 * 32(r), t1, t1; \ + vpand 2 * 32(r), t2, t2; \ + vpand 3 * 32(r), t3, t3; \ + \ + rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor 4 * 32(r), t0, t0; \ + vpxor 5 * 32(r), t1, t1; \ + vpxor 6 * 32(r), t2, t2; \ + vpxor 7 * 32(r), t3, t3; \ + vmovdqu t0, 4 * 32(r); \ + vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ + vmovdqu t1, 5 * 32(r); \ + vmovdqu t2, 6 * 32(r); \ + vmovdqu t3, 7 * 32(r); \ + \ + /* \ + * t0 = klr; \ + * t0 |= lr; \ + * ll ^= t0; \ + */ \ + \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpor l4, t0, t0; \ + vpor l5, t1, t1; \ + vpor l6, t2, t2; \ + vpor l7, t3, t3; \ + \ + vpxor l0, t0, l0; \ + vmovdqu l0, 0 * 32(l); \ + vpxor l1, t1, l1; \ + vmovdqu l1, 1 * 32(l); \ + vpxor l2, t2, l2; \ + vmovdqu l2, 2 * 32(l); \ + vpxor l3, t3, l3; \ + vmovdqu l3, 3 * 32(l); + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ + a3, b3, c3, d3, st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti128 .Lshufb_16x16b, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio, key) \ + vpbroadcastq key, x0; \ + vpshufb .Lpack_bswap, x0, x0; \ + \ + vpxor 0 * 32(rio), x0, y7; \ + vpxor 1 * 32(rio), x0, y6; \ + vpxor 2 * 32(rio), x0, y5; \ + vpxor 3 * 32(rio), x0, y4; \ + vpxor 4 * 32(rio), x0, y3; \ + vpxor 5 * 32(rio), x0, y2; \ + vpxor 6 * 32(rio), x0, y1; \ + vpxor 7 * 32(rio), x0, y0; \ + vpxor 8 * 32(rio), x0, x7; \ + vpxor 9 * 32(rio), x0, x6; \ + vpxor 10 * 32(rio), x0, x5; \ + vpxor 11 * 32(rio), x0, x4; \ + vpxor 12 * 32(rio), x0, x3; \ + vpxor 13 * 32(rio), x0, x2; \ + vpxor 14 * 32(rio), x0, x1; \ + vpxor 15 * 32(rio), x0, x0; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd) \ + byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ + y4, y5, y6, y7, (mem_ab), (mem_cd)); \ + \ + vmovdqu x0, 0 * 32(mem_ab); \ + vmovdqu x1, 1 * 32(mem_ab); \ + vmovdqu x2, 2 * 32(mem_ab); \ + vmovdqu x3, 3 * 32(mem_ab); \ + vmovdqu x4, 4 * 32(mem_ab); \ + vmovdqu x5, 5 * 32(mem_ab); \ + vmovdqu x6, 6 * 32(mem_ab); \ + vmovdqu x7, 7 * 32(mem_ab); \ + vmovdqu y0, 0 * 32(mem_cd); \ + vmovdqu y1, 1 * 32(mem_cd); \ + vmovdqu y2, 2 * 32(mem_cd); \ + vmovdqu y3, 3 * 32(mem_cd); \ + vmovdqu y4, 4 * 32(mem_cd); \ + vmovdqu y5, 5 * 32(mem_cd); \ + vmovdqu y6, 6 * 32(mem_cd); \ + vmovdqu y7, 7 * 32(mem_cd); + +/* de-byteslice, apply post-whitening and store blocks */ +#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ + y5, y6, y7, key, stack_tmp0, stack_tmp1) \ + byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ + y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ + \ + vmovdqu x0, stack_tmp0; \ + \ + vpbroadcastq key, x0; \ + vpshufb .Lpack_bswap, x0, x0; \ + \ + vpxor x0, y7, y7; \ + vpxor x0, y6, y6; \ + vpxor x0, y5, y5; \ + vpxor x0, y4, y4; \ + vpxor x0, y3, y3; \ + vpxor x0, y2, y2; \ + vpxor x0, y1, y1; \ + vpxor x0, y0, y0; \ + vpxor x0, x7, x7; \ + vpxor x0, x6, x6; \ + vpxor x0, x5, x5; \ + vpxor x0, x4, x4; \ + vpxor x0, x3, x3; \ + vpxor x0, x2, x2; \ + vpxor x0, x1, x1; \ + vpxor stack_tmp0, x0, x0; + +#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio) \ + vmovdqu x0, 0 * 32(rio); \ + vmovdqu x1, 1 * 32(rio); \ + vmovdqu x2, 2 * 32(rio); \ + vmovdqu x3, 3 * 32(rio); \ + vmovdqu x4, 4 * 32(rio); \ + vmovdqu x5, 5 * 32(rio); \ + vmovdqu x6, 6 * 32(rio); \ + vmovdqu x7, 7 * 32(rio); \ + vmovdqu y0, 8 * 32(rio); \ + vmovdqu y1, 9 * 32(rio); \ + vmovdqu y2, 10 * 32(rio); \ + vmovdqu y3, 11 * 32(rio); \ + vmovdqu y4, 12 * 32(rio); \ + vmovdqu y5, 13 * 32(rio); \ + vmovdqu y6, 14 * 32(rio); \ + vmovdqu y7, 15 * 32(rio); + +.data +.align 32 + +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) + +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + +.Lpack_bswap: + .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 + .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* For XTS mode */ +.Lxts_gf128mul_and_shl1_mask_0: + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 +.Lxts_gf128mul_and_shl1_mask_1: + .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox1, sbox2, sbox3: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in) + * ) + * ) + * ) + * + * (note: '⊕ 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s1: + .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 + .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 +.Lpre_tf_hi_s1: + .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a + .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox4: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in <<< 1) + * ) + * ) + * ) + * + * (note: '⊕ 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s4: + .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 + .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 +.Lpre_tf_hi_s4: + .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 + .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf + +/* + * post-SubByte transform + * + * post-lookup for sbox1, sbox4: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s1: + .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 + .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 +.Lpost_tf_hi_s1: + .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 + .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c + +/* + * post-SubByte transform + * + * post-lookup for sbox2: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) <<< 1 + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s2: + .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 + .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 +.Lpost_tf_hi_s2: + .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 + .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 + +/* + * post-SubByte transform + * + * post-lookup for sbox3: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) >>> 1 + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s3: + .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 + .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 +.Lpost_tf_hi_s3: + .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 + .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text + +.align 8 +__camellia_enc_blk32: + /* input: + * %rdi: ctx, CTX + * %rax: temporary storage, 512 bytes + * %ymm0..%ymm15: 32 plaintext blocks + * output: + * %ymm0..%ymm15: 32 encrypted blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + + leaq 8 * 32(%rax), %rcx; + + inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx); + + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 0); + + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, + ((key_table + (8) * 8) + 0)(CTX), + ((key_table + (8) * 8) + 4)(CTX), + ((key_table + (8) * 8) + 8)(CTX), + ((key_table + (8) * 8) + 12)(CTX)); + + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 8); + + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, + ((key_table + (16) * 8) + 0)(CTX), + ((key_table + (16) * 8) + 4)(CTX), + ((key_table + (16) * 8) + 8)(CTX), + ((key_table + (16) * 8) + 12)(CTX)); + + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 16); + + movl $24, %r8d; + cmpl $16, key_length(CTX); + jne .Lenc_max32; + +.Lenc_done: + /* load CD for output */ + vmovdqu 0 * 32(%rcx), %ymm8; + vmovdqu 1 * 32(%rcx), %ymm9; + vmovdqu 2 * 32(%rcx), %ymm10; + vmovdqu 3 * 32(%rcx), %ymm11; + vmovdqu 4 * 32(%rcx), %ymm12; + vmovdqu 5 * 32(%rcx), %ymm13; + vmovdqu 6 * 32(%rcx), %ymm14; + vmovdqu 7 * 32(%rcx), %ymm15; + + outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); + + ret; + +.align 8 +.Lenc_max32: + movl $32, %r8d; + + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, + ((key_table + (24) * 8) + 0)(CTX), + ((key_table + (24) * 8) + 4)(CTX), + ((key_table + (24) * 8) + 8)(CTX), + ((key_table + (24) * 8) + 12)(CTX)); + + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 24); + + jmp .Lenc_done; +ENDPROC(__camellia_enc_blk32) + +.align 8 +__camellia_dec_blk32: + /* input: + * %rdi: ctx, CTX + * %rax: temporary storage, 512 bytes + * %r8d: 24 for 16 byte key, 32 for larger + * %ymm0..%ymm15: 16 encrypted blocks + * output: + * %ymm0..%ymm15: 16 plaintext blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + + leaq 8 * 32(%rax), %rcx; + + inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx); + + cmpl $32, %r8d; + je .Ldec_max32; + +.Ldec_max24: + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 16); + + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, + ((key_table + (16) * 8) + 8)(CTX), + ((key_table + (16) * 8) + 12)(CTX), + ((key_table + (16) * 8) + 0)(CTX), + ((key_table + (16) * 8) + 4)(CTX)); + + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 8); + + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, + ((key_table + (8) * 8) + 8)(CTX), + ((key_table + (8) * 8) + 12)(CTX), + ((key_table + (8) * 8) + 0)(CTX), + ((key_table + (8) * 8) + 4)(CTX)); + + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 0); + + /* load CD for output */ + vmovdqu 0 * 32(%rcx), %ymm8; + vmovdqu 1 * 32(%rcx), %ymm9; + vmovdqu 2 * 32(%rcx), %ymm10; + vmovdqu 3 * 32(%rcx), %ymm11; + vmovdqu 4 * 32(%rcx), %ymm12; + vmovdqu 5 * 32(%rcx), %ymm13; + vmovdqu 6 * 32(%rcx), %ymm14; + vmovdqu 7 * 32(%rcx), %ymm15; + + outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); + + ret; + +.align 8 +.Ldec_max32: + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %rcx, 24); + + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, + ((key_table + (24) * 8) + 8)(CTX), + ((key_table + (24) * 8) + 12)(CTX), + ((key_table + (24) * 8) + 0)(CTX), + ((key_table + (24) * 8) + 4)(CTX)); + + jmp .Ldec_max24; +ENDPROC(__camellia_dec_blk32) + +ENTRY(camellia_ecb_enc_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + */ + + vzeroupper; + + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx, (key_table)(CTX)); + + /* now dst can be used as temporary buffer (even in src == dst case) */ + movq %rsi, %rax; + + call __camellia_enc_blk32; + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroupper; + + ret; +ENDPROC(camellia_ecb_enc_32way) + +ENTRY(camellia_ecb_dec_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + */ + + vzeroupper; + + cmpl $16, key_length(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx, (key_table)(CTX, %r8, 8)); + + /* now dst can be used as temporary buffer (even in src == dst case) */ + movq %rsi, %rax; + + call __camellia_dec_blk32; + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroupper; + + ret; +ENDPROC(camellia_ecb_dec_32way) + +ENTRY(camellia_cbc_dec_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + */ + + vzeroupper; + + cmpl $16, key_length(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx, (key_table)(CTX, %r8, 8)); + + movq %rsp, %r10; + cmpq %rsi, %rdx; + je .Lcbc_dec_use_stack; + + /* dst can be used as temporary storage, src is not overwritten. */ + movq %rsi, %rax; + jmp .Lcbc_dec_continue; + +.Lcbc_dec_use_stack: + /* + * dst still in-use (because dst == src), so use stack for temporary + * storage. + */ + subq $(16 * 32), %rsp; + movq %rsp, %rax; + +.Lcbc_dec_continue: + call __camellia_dec_blk32; + + vmovdqu %ymm7, (%rax); + vpxor %ymm7, %ymm7, %ymm7; + vinserti128 $1, (%rdx), %ymm7, %ymm7; + vpxor (%rax), %ymm7, %ymm7; + movq %r10, %rsp; + vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; + vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; + vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; + vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; + vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; + vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; + vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; + vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; + vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; + vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; + vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; + vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; + vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; + vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; + vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroupper; + + ret; +ENDPROC(camellia_cbc_dec_32way) + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ + vpcmpeqq minus_one, x, tmp1; \ + vpcmpeqq minus_two, x, tmp2; \ + vpsubq minus_two, x, x; \ + vpor tmp2, tmp1, tmp1; \ + vpslldq $8, tmp1, tmp1; \ + vpsubq tmp1, x, x; + +ENTRY(camellia_ctr_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv (little endian, 128bit) + */ + + vzeroupper; + + movq %rsp, %r10; + cmpq %rsi, %rdx; + je .Lctr_use_stack; + + /* dst can be used as temporary storage, src is not overwritten. */ + movq %rsi, %rax; + jmp .Lctr_continue; + +.Lctr_use_stack: + subq $(16 * 32), %rsp; + movq %rsp, %rax; + +.Lctr_continue: + vpcmpeqd %ymm15, %ymm15, %ymm15; + vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ + vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ + + /* load IV and byteswap */ + vmovdqu (%rcx), %xmm0; + vmovdqa %xmm0, %xmm1; + inc_le128(%xmm0, %xmm15, %xmm14); + vbroadcasti128 .Lbswap128_mask, %ymm14; + vinserti128 $1, %xmm0, %ymm1, %ymm0; + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 15 * 32(%rax); + + /* construct IVs */ + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 14 * 32(%rax); + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 13 * 32(%rax); + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 12 * 32(%rax); + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm13; + vmovdqu %ymm13, 11 * 32(%rax); + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm10; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm9; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm8; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm7; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm6; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm5; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm4; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm3; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm2; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vpshufb %ymm14, %ymm0, %ymm1; + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); + vextracti128 $1, %ymm0, %xmm13; + vpshufb %ymm14, %ymm0, %ymm0; + inc_le128(%xmm13, %xmm15, %xmm14); + vmovdqu %xmm13, (%rcx); + + /* inpack32_pre: */ + vpbroadcastq (key_table)(CTX), %ymm15; + vpshufb .Lpack_bswap, %ymm15, %ymm15; + vpxor %ymm0, %ymm15, %ymm0; + vpxor %ymm1, %ymm15, %ymm1; + vpxor %ymm2, %ymm15, %ymm2; + vpxor %ymm3, %ymm15, %ymm3; + vpxor %ymm4, %ymm15, %ymm4; + vpxor %ymm5, %ymm15, %ymm5; + vpxor %ymm6, %ymm15, %ymm6; + vpxor %ymm7, %ymm15, %ymm7; + vpxor %ymm8, %ymm15, %ymm8; + vpxor %ymm9, %ymm15, %ymm9; + vpxor %ymm10, %ymm15, %ymm10; + vpxor 11 * 32(%rax), %ymm15, %ymm11; + vpxor 12 * 32(%rax), %ymm15, %ymm12; + vpxor 13 * 32(%rax), %ymm15, %ymm13; + vpxor 14 * 32(%rax), %ymm15, %ymm14; + vpxor 15 * 32(%rax), %ymm15, %ymm15; + + call __camellia_enc_blk32; + + movq %r10, %rsp; + + vpxor 0 * 32(%rdx), %ymm7, %ymm7; + vpxor 1 * 32(%rdx), %ymm6, %ymm6; + vpxor 2 * 32(%rdx), %ymm5, %ymm5; + vpxor 3 * 32(%rdx), %ymm4, %ymm4; + vpxor 4 * 32(%rdx), %ymm3, %ymm3; + vpxor 5 * 32(%rdx), %ymm2, %ymm2; + vpxor 6 * 32(%rdx), %ymm1, %ymm1; + vpxor 7 * 32(%rdx), %ymm0, %ymm0; + vpxor 8 * 32(%rdx), %ymm15, %ymm15; + vpxor 9 * 32(%rdx), %ymm14, %ymm14; + vpxor 10 * 32(%rdx), %ymm13, %ymm13; + vpxor 11 * 32(%rdx), %ymm12, %ymm12; + vpxor 12 * 32(%rdx), %ymm11, %ymm11; + vpxor 13 * 32(%rdx), %ymm10, %ymm10; + vpxor 14 * 32(%rdx), %ymm9, %ymm9; + vpxor 15 * 32(%rdx), %ymm8, %ymm8; + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroupper; + + ret; +ENDPROC(camellia_ctr_32way) + +#define gf128mul_x_ble(iv, mask, tmp) \ + vpsrad $31, iv, tmp; \ + vpaddq iv, iv, iv; \ + vpshufd $0x13, tmp, tmp; \ + vpand mask, tmp, tmp; \ + vpxor tmp, iv, iv; + +#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ + vpsrad $31, iv, tmp0; \ + vpaddq iv, iv, tmp1; \ + vpsllq $2, iv, iv; \ + vpshufd $0x13, tmp0, tmp0; \ + vpsrad $31, tmp1, tmp1; \ + vpand mask2, tmp0, tmp0; \ + vpshufd $0x13, tmp1, tmp1; \ + vpxor tmp0, iv, iv; \ + vpand mask1, tmp1, tmp1; \ + vpxor tmp1, iv, iv; + +.align 8 +camellia_xts_crypt_32way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + * %r8: index for input whitening key + * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 + */ + + vzeroupper; + + subq $(16 * 32), %rsp; + movq %rsp, %rax; + + vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; + + /* load IV and construct second IV */ + vmovdqu (%rcx), %xmm0; + vmovdqa %xmm0, %xmm15; + gf128mul_x_ble(%xmm0, %xmm12, %xmm13); + vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; + vinserti128 $1, %xmm0, %ymm15, %ymm0; + vpxor 0 * 32(%rdx), %ymm0, %ymm15; + vmovdqu %ymm15, 15 * 32(%rax); + vmovdqu %ymm0, 0 * 32(%rsi); + + /* construct IVs */ + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 1 * 32(%rdx), %ymm0, %ymm15; + vmovdqu %ymm15, 14 * 32(%rax); + vmovdqu %ymm0, 1 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 2 * 32(%rdx), %ymm0, %ymm15; + vmovdqu %ymm15, 13 * 32(%rax); + vmovdqu %ymm0, 2 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 3 * 32(%rdx), %ymm0, %ymm15; + vmovdqu %ymm15, 12 * 32(%rax); + vmovdqu %ymm0, 3 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 4 * 32(%rdx), %ymm0, %ymm11; + vmovdqu %ymm0, 4 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 5 * 32(%rdx), %ymm0, %ymm10; + vmovdqu %ymm0, 5 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 6 * 32(%rdx), %ymm0, %ymm9; + vmovdqu %ymm0, 6 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 7 * 32(%rdx), %ymm0, %ymm8; + vmovdqu %ymm0, 7 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 8 * 32(%rdx), %ymm0, %ymm7; + vmovdqu %ymm0, 8 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 9 * 32(%rdx), %ymm0, %ymm6; + vmovdqu %ymm0, 9 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 10 * 32(%rdx), %ymm0, %ymm5; + vmovdqu %ymm0, 10 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 11 * 32(%rdx), %ymm0, %ymm4; + vmovdqu %ymm0, 11 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 12 * 32(%rdx), %ymm0, %ymm3; + vmovdqu %ymm0, 12 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 13 * 32(%rdx), %ymm0, %ymm2; + vmovdqu %ymm0, 13 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 14 * 32(%rdx), %ymm0, %ymm1; + vmovdqu %ymm0, 14 * 32(%rsi); + + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); + vpxor 15 * 32(%rdx), %ymm0, %ymm15; + vmovdqu %ymm15, 0 * 32(%rax); + vmovdqu %ymm0, 15 * 32(%rsi); + + vextracti128 $1, %ymm0, %xmm0; + gf128mul_x_ble(%xmm0, %xmm12, %xmm15); + vmovdqu %xmm0, (%rcx); + + /* inpack32_pre: */ + vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; + vpshufb .Lpack_bswap, %ymm15, %ymm15; + vpxor 0 * 32(%rax), %ymm15, %ymm0; + vpxor %ymm1, %ymm15, %ymm1; + vpxor %ymm2, %ymm15, %ymm2; + vpxor %ymm3, %ymm15, %ymm3; + vpxor %ymm4, %ymm15, %ymm4; + vpxor %ymm5, %ymm15, %ymm5; + vpxor %ymm6, %ymm15, %ymm6; + vpxor %ymm7, %ymm15, %ymm7; + vpxor %ymm8, %ymm15, %ymm8; + vpxor %ymm9, %ymm15, %ymm9; + vpxor %ymm10, %ymm15, %ymm10; + vpxor %ymm11, %ymm15, %ymm11; + vpxor 12 * 32(%rax), %ymm15, %ymm12; + vpxor 13 * 32(%rax), %ymm15, %ymm13; + vpxor 14 * 32(%rax), %ymm15, %ymm14; + vpxor 15 * 32(%rax), %ymm15, %ymm15; + + call *%r9; + + addq $(16 * 32), %rsp; + + vpxor 0 * 32(%rsi), %ymm7, %ymm7; + vpxor 1 * 32(%rsi), %ymm6, %ymm6; + vpxor 2 * 32(%rsi), %ymm5, %ymm5; + vpxor 3 * 32(%rsi), %ymm4, %ymm4; + vpxor 4 * 32(%rsi), %ymm3, %ymm3; + vpxor 5 * 32(%rsi), %ymm2, %ymm2; + vpxor 6 * 32(%rsi), %ymm1, %ymm1; + vpxor 7 * 32(%rsi), %ymm0, %ymm0; + vpxor 8 * 32(%rsi), %ymm15, %ymm15; + vpxor 9 * 32(%rsi), %ymm14, %ymm14; + vpxor 10 * 32(%rsi), %ymm13, %ymm13; + vpxor 11 * 32(%rsi), %ymm12, %ymm12; + vpxor 12 * 32(%rsi), %ymm11, %ymm11; + vpxor 13 * 32(%rsi), %ymm10, %ymm10; + vpxor 14 * 32(%rsi), %ymm9, %ymm9; + vpxor 15 * 32(%rsi), %ymm8, %ymm8; + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + + vzeroupper; + + ret; +ENDPROC(camellia_xts_crypt_32way) + +ENTRY(camellia_xts_enc_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + xorl %r8d, %r8d; /* input whitening key, 0 for enc */ + + leaq __camellia_enc_blk32, %r9; + + jmp camellia_xts_crypt_32way; +ENDPROC(camellia_xts_enc_32way) + +ENTRY(camellia_xts_dec_32way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + cmpl $16, key_length(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* input whitening key, last for dec */ + + leaq __camellia_dec_blk32, %r9; + + jmp camellia_xts_crypt_32way; +ENDPROC(camellia_xts_dec_32way) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 0b3374335fd..310319c601e 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -20,6 +20,8 @@ * */ +#include <linux/linkage.h> + .file "camellia-x86_64-asm_64.S" .text @@ -188,10 +190,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -.global __camellia_enc_blk; -.type __camellia_enc_blk,@function; - -__camellia_enc_blk: +ENTRY(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -214,33 +213,31 @@ __camellia_enc_blk: movl $24, RT1d; /* max */ cmpb $16, key_length(CTX); - je __enc_done; + je .L__enc_done; enc_fls(24); enc_rounds(24); movl $32, RT1d; /* max */ -__enc_done: +.L__enc_done: testb RXORbl, RXORbl; movq RDST, RIO; - jnz __enc_xor; + jnz .L__enc_xor; enc_outunpack(mov, RT1); movq RRBP, %rbp; ret; -__enc_xor: +.L__enc_xor: enc_outunpack(xor, RT1); movq RRBP, %rbp; ret; +ENDPROC(__camellia_enc_blk) -.global camellia_dec_blk; -.type camellia_dec_blk,@function; - -camellia_dec_blk: +ENTRY(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -258,12 +255,12 @@ camellia_dec_blk: dec_inpack(RT2); cmpb $24, RT2bl; - je __dec_rounds16; + je .L__dec_rounds16; dec_rounds(24); dec_fls(24); -__dec_rounds16: +.L__dec_rounds16: dec_rounds(16); dec_fls(16); dec_rounds(8); @@ -276,6 +273,7 @@ __dec_rounds16: movq RRBP, %rbp; ret; +ENDPROC(camellia_dec_blk) /********************************************************************** 2-way camellia @@ -426,10 +424,7 @@ __dec_rounds16: bswapq RAB1; \ movq RAB1, 12*2(RIO); -.global __camellia_enc_blk_2way; -.type __camellia_enc_blk_2way,@function; - -__camellia_enc_blk_2way: +ENTRY(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -453,16 +448,16 @@ __camellia_enc_blk_2way: movl $24, RT2d; /* max */ cmpb $16, key_length(CTX); - je __enc2_done; + je .L__enc2_done; enc_fls2(24); enc_rounds2(24); movl $32, RT2d; /* max */ -__enc2_done: +.L__enc2_done: test RXORbl, RXORbl; movq RDST, RIO; - jnz __enc2_xor; + jnz .L__enc2_xor; enc_outunpack2(mov, RT2); @@ -470,17 +465,15 @@ __enc2_done: popq %rbx; ret; -__enc2_xor: +.L__enc2_xor: enc_outunpack2(xor, RT2); movq RRBP, %rbp; popq %rbx; ret; +ENDPROC(__camellia_enc_blk_2way) -.global camellia_dec_blk_2way; -.type camellia_dec_blk_2way,@function; - -camellia_dec_blk_2way: +ENTRY(camellia_dec_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -499,12 +492,12 @@ camellia_dec_blk_2way: dec_inpack2(RT2); cmpb $24, RT2bl; - je __dec2_rounds16; + je .L__dec2_rounds16; dec_rounds2(24); dec_fls2(24); -__dec2_rounds16: +.L__dec2_rounds16: dec_rounds2(16); dec_fls2(16); dec_rounds2(8); @@ -518,3 +511,4 @@ __dec2_rounds16: movq RRBP, %rbp; movq RXOR, %rbx; ret; +ENDPROC(camellia_dec_blk_2way) diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c new file mode 100644 index 00000000000..4209a76fcda --- /dev/null +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -0,0 +1,586 @@ +/* + * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia + * + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/ablk_helper.h> +#include <crypto/algapi.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <asm/crypto/camellia.h> +#include <asm/crypto/glue_helper.h> + +#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 +#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 + +/* 32-way AVX2/AES-NI parallel cipher functions */ +asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); + +asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + +asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + +static const struct common_glue_ctx camellia_enc = { + .num_funcs = 4, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) } + }, { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } + }, { + .num_blocks = 2, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + } } +}; + +static const struct common_glue_ctx camellia_ctr = { + .num_funcs = 4, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) } + }, { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } + }, { + .num_blocks = 2, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + } } +}; + +static const struct common_glue_ctx camellia_enc_xts = { + .num_funcs = 3, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) } + }, { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } + } } +}; + +static const struct common_glue_ctx camellia_dec = { + .num_funcs = 4, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) } + }, { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } + }, { + .num_blocks = 2, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + } } +}; + +static const struct common_glue_ctx camellia_dec_cbc = { + .num_funcs = 4, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) } + }, { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } + }, { + .num_blocks = 2, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + } } +}; + +static const struct common_glue_ctx camellia_dec_xts = { + .num_funcs = 3, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) } + }, { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } + } } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, + dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, + nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); +} + +static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, + CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, + nbytes); +} + +static inline void camellia_fpu_end(bool fpu_enabled) +{ + glue_fpu_end(fpu_enabled); +} + +static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, + &tfm->crt_flags); +} + +struct crypt_priv { + struct camellia_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAMELLIA_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { + camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; + } + + if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { + camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; + } + + while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { + camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + camellia_enc_blk(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAMELLIA_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { + camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; + } + + if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { + camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; + } + + while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { + camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + camellia_dec_blk(ctx->ctx, srcdst, srcdst); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->camellia_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + camellia_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->camellia_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + camellia_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(camellia_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(camellia_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); +} + +static struct crypto_alg cmll_algs[10] = { { + .cra_name = "__ecb-camellia-aesni-avx2", + .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-camellia-aesni-avx2", + .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-camellia-aesni-avx2", + .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-camellia-aesni-avx2", + .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_exit = lrw_camellia_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = lrw_camellia_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-camellia-aesni-avx2", + .cra_driver_name = "__driver-xts-camellia-aesni-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, + .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = xts_camellia_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(camellia)", + .cra_driver_name = "ecb-camellia-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(camellia)", + .cra_driver_name = "cbc-camellia-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(camellia)", + .cra_driver_name = "ctr-camellia-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(camellia)", + .cra_driver_name = "lrw-camellia-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(camellia)", + .cra_driver_name = "xts-camellia-aesni-avx2", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, + .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init camellia_aesni_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + pr_info("AVX2 or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX2 detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); +} + +static void __exit camellia_aesni_fini(void) +{ + crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); +} + +module_init(camellia_aesni_init); +module_exit(camellia_aesni_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); +MODULE_ALIAS("camellia"); +MODULE_ALIAS("camellia-asm"); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 96cbb6068fc..87a041a10f4 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -1,7 +1,7 @@ /* * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia * - * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> +#include <crypto/ablk_helper.h> #include <crypto/algapi.h> #include <crypto/ctr.h> #include <crypto/lrw.h> @@ -21,21 +22,48 @@ #include <asm/xcr.h> #include <asm/xsave.h> #include <asm/crypto/camellia.h> -#include <asm/crypto/ablk_helper.h> #include <asm/crypto/glue_helper.h> #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 -/* 16-way AES-NI parallel cipher functions */ +/* 16-way parallel cipher functions (avx/aes-ni) */ asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way); + asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); + asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); +EXPORT_SYMBOL_GPL(camellia_ctr_16way); + +asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); + +asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); + +void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, + GLUE_FUNC_CAST(camellia_enc_blk)); +} +EXPORT_SYMBOL_GPL(camellia_xts_enc); + +void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, + GLUE_FUNC_CAST(camellia_dec_blk)); +} +EXPORT_SYMBOL_GPL(camellia_xts_dec); static const struct common_glue_ctx camellia_enc = { .num_funcs = 3, @@ -69,6 +97,19 @@ static const struct common_glue_ctx camellia_ctr = { } } }; +static const struct common_glue_ctx camellia_enc_xts = { + .num_funcs = 2, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } + } } +}; + static const struct common_glue_ctx camellia_dec = { .num_funcs = 3, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, @@ -101,6 +142,19 @@ static const struct common_glue_ctx camellia_dec_cbc = { } } }; +static const struct common_glue_ctx camellia_dec_xts = { + .num_funcs = 2, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } + } } +}; + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -261,54 +315,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - camellia_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(camellia_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - camellia_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(camellia_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } static struct crypto_alg cmll_algs[10] = { { diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 5cb86ccd4ac..c171dcbf192 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -62,7 +62,7 @@ static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) } /* camellia sboxes */ -const u64 camellia_sp10011110[256] = { +__visible const u64 camellia_sp10011110[256] = { 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL, 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL, 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL, @@ -151,7 +151,7 @@ const u64 camellia_sp10011110[256] = { 0x9e00009e9e9e9e00ULL, }; -const u64 camellia_sp22000222[256] = { +__visible const u64 camellia_sp22000222[256] = { 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL, 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL, 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL, @@ -240,7 +240,7 @@ const u64 camellia_sp22000222[256] = { 0x3d3d0000003d3d3dULL, }; -const u64 camellia_sp03303033[256] = { +__visible const u64 camellia_sp03303033[256] = { 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL, 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL, 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL, @@ -329,7 +329,7 @@ const u64 camellia_sp03303033[256] = { 0x004f4f004f004f4fULL, }; -const u64 camellia_sp00444404[256] = { +__visible const u64 camellia_sp00444404[256] = { 0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL, 0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL, 0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL, @@ -418,7 +418,7 @@ const u64 camellia_sp00444404[256] = { 0x00009e9e9e9e009eULL, }; -const u64 camellia_sp02220222[256] = { +__visible const u64 camellia_sp02220222[256] = { 0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL, 0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL, 0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL, @@ -507,7 +507,7 @@ const u64 camellia_sp02220222[256] = { 0x003d3d3d003d3d3dULL, }; -const u64 camellia_sp30333033[256] = { +__visible const u64 camellia_sp30333033[256] = { 0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL, 0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL, 0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL, @@ -596,7 +596,7 @@ const u64 camellia_sp30333033[256] = { 0x4f004f4f4f004f4fULL, }; -const u64 camellia_sp44044404[256] = { +__visible const u64 camellia_sp44044404[256] = { 0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL, 0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL, 0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL, @@ -685,7 +685,7 @@ const u64 camellia_sp44044404[256] = { 0x9e9e009e9e9e009eULL, }; -const u64 camellia_sp11101110[256] = { +__visible const u64 camellia_sp11101110[256] = { 0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL, 0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL, 0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL, @@ -828,8 +828,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) subRL[1] ^= (subRL[1] & ~subRL[9]) << 32; /* modified for FLinv(kl2) */ - dw = (subRL[1] & subRL[9]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[9]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 8 */ subRL[11] ^= subRL[1]; @@ -840,8 +840,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) subRL[1] ^= (subRL[1] & ~subRL[17]) << 32; /* modified for FLinv(kl4) */ - dw = (subRL[1] & subRL[17]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[17]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 14 */ subRL[19] ^= subRL[1]; @@ -859,8 +859,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) } else { subRL[1] ^= (subRL[1] & ~subRL[25]) << 32; /* modified for FLinv(kl6) */ - dw = (subRL[1] & subRL[25]) >> 32, - subRL[1] ^= rol32(dw, 1); + dw = (subRL[1] & subRL[25]) >> 32; + subRL[1] ^= rol32(dw, 1); /* round 20 */ subRL[27] ^= subRL[1]; @@ -882,8 +882,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[24]) << 32; /* modified for FL(kl5) */ - dw = (kw4 & subRL[24]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[24]) >> 32; + kw4 ^= rol32(dw, 1); } /* round 17 */ @@ -895,8 +895,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[16]) << 32; /* modified for FL(kl3) */ - dw = (kw4 & subRL[16]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[16]) >> 32; + kw4 ^= rol32(dw, 1); /* round 11 */ subRL[14] ^= kw4; @@ -907,8 +907,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) kw4 ^= (kw4 & ~subRL[8]) << 32; /* modified for FL(kl1) */ - dw = (kw4 & subRL[8]) >> 32, - kw4 ^= rol32(dw, 1); + dw = (kw4 & subRL[8]) >> 32; + kw4 ^= rol32(dw, 1); /* round 5 */ subRL[6] ^= kw4; @@ -928,8 +928,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */ tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]); - dw = tl & (subRL[8] >> 32), /* FL(kl1) */ - tr = subRL[10] ^ rol32(dw, 1); + dw = tl & (subRL[8] >> 32); /* FL(kl1) */ + tr = subRL[10] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */ @@ -937,8 +937,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */ tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]); - dw = tl & (subRL[9] >> 32), /* FLinv(kl2) */ - tr = subRL[7] ^ rol32(dw, 1); + dw = tl & (subRL[9] >> 32); /* FLinv(kl2) */ + tr = subRL[7] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */ @@ -948,8 +948,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */ tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]); - dw = tl & (subRL[16] >> 32), /* FL(kl3) */ - tr = subRL[18] ^ rol32(dw, 1); + dw = tl & (subRL[16] >> 32); /* FL(kl3) */ + tr = subRL[18] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */ @@ -957,8 +957,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */ tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]); - dw = tl & (subRL[17] >> 32), /* FLinv(kl4) */ - tr = subRL[15] ^ rol32(dw, 1); + dw = tl & (subRL[17] >> 32); /* FLinv(kl4) */ + tr = subRL[15] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */ @@ -972,8 +972,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */ } else { tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]); - dw = tl & (subRL[24] >> 32), /* FL(kl5) */ - tr = subRL[26] ^ rol32(dw, 1); + dw = tl & (subRL[24] >> 32); /* FL(kl5) */ + tr = subRL[26] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */ @@ -981,8 +981,8 @@ static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */ tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]); - dw = tl & (subRL[25] >> 32), /* FLinv(kl6) */ - tr = subRL[23] ^ rol32(dw, 1); + dw = tl & (subRL[25] >> 32); /* FLinv(kl6) */ + tr = subRL[23] ^ rol32(dw, 1); tt = (tr | ((u64)tl << 32)); SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */ diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index 15b00ac7cbd..c35fd5d6ecd 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -23,6 +23,8 @@ * */ +#include <linux/linkage.h> + .file "cast5-avx-x86_64-asm_64.S" .extern cast_s1 @@ -211,8 +213,6 @@ .text .align 16 -.type __cast5_enc_blk16,@function; - __cast5_enc_blk16: /* input: * %rdi: ctx, CTX @@ -263,14 +263,14 @@ __cast5_enc_blk16: movzbl rr(CTX), %eax; testl %eax, %eax; - jnz __skip_enc; + jnz .L__skip_enc; round(RL, RR, 12, 1); round(RR, RL, 13, 2); round(RL, RR, 14, 3); round(RR, RL, 15, 1); -__skip_enc: +.L__skip_enc: popq %rbx; popq %rbp; @@ -282,10 +282,9 @@ __skip_enc: outunpack_blocks(RR4, RL4, RTMP, RX, RKM); ret; +ENDPROC(__cast5_enc_blk16) .align 16 -.type __cast5_dec_blk16,@function; - __cast5_dec_blk16: /* input: * %rdi: ctx, CTX @@ -323,14 +322,14 @@ __cast5_dec_blk16: movzbl rr(CTX), %eax; testl %eax, %eax; - jnz __skip_dec; + jnz .L__skip_dec; round(RL, RR, 15, 1); round(RR, RL, 14, 3); round(RL, RR, 13, 2); round(RR, RL, 12, 1); -__dec_tail: +.L__dec_tail: round(RL, RR, 11, 3); round(RR, RL, 10, 2); round(RL, RR, 9, 1); @@ -355,15 +354,12 @@ __dec_tail: ret; -__skip_dec: +.L__skip_dec: vpsrldq $4, RKR, RKR; - jmp __dec_tail; + jmp .L__dec_tail; +ENDPROC(__cast5_dec_blk16) -.align 16 -.global cast5_ecb_enc_16way -.type cast5_ecb_enc_16way,@function; - -cast5_ecb_enc_16way: +ENTRY(cast5_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -393,12 +389,9 @@ cast5_ecb_enc_16way: vmovdqu RL4, (7*4*4)(%r11); ret; +ENDPROC(cast5_ecb_enc_16way) -.align 16 -.global cast5_ecb_dec_16way -.type cast5_ecb_dec_16way,@function; - -cast5_ecb_dec_16way: +ENTRY(cast5_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -428,12 +421,9 @@ cast5_ecb_dec_16way: vmovdqu RL4, (7*4*4)(%r11); ret; +ENDPROC(cast5_ecb_dec_16way) -.align 16 -.global cast5_cbc_dec_16way -.type cast5_cbc_dec_16way,@function; - -cast5_cbc_dec_16way: +ENTRY(cast5_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -480,12 +470,9 @@ cast5_cbc_dec_16way: popq %r12; ret; +ENDPROC(cast5_cbc_dec_16way) -.align 16 -.global cast5_ctr_16way -.type cast5_ctr_16way,@function; - -cast5_ctr_16way: +ENTRY(cast5_ctr_16way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -556,3 +543,4 @@ cast5_ctr_16way: popq %r12; ret; +ENDPROC(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index c6631813dc1..e57e20ab5e0 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -26,13 +26,13 @@ #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> +#include <crypto/ablk_helper.h> #include <crypto/algapi.h> #include <crypto/cast5.h> #include <crypto/cryptd.h> #include <crypto/ctr.h> #include <asm/xcr.h> #include <asm/xsave.h> -#include <asm/crypto/ablk_helper.h> #include <asm/crypto/glue_helper.h> #define CAST5_PARALLEL_BLOCKS 16 @@ -203,9 +203,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= 1; dst -= 1; } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; } /* Handle leftovers */ diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 2569d0da841..e3531f83395 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -4,7 +4,7 @@ * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> * - * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,6 +23,7 @@ * */ +#include <linux/linkage.h> #include "glue_helper-asm-avx.S" .file "cast6-avx-x86_64-asm_64.S" @@ -226,6 +227,8 @@ .data .align 16 +.Lxts_gf128mul_and_shl1_mask: + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .Lbswap128_mask: @@ -250,8 +253,6 @@ .text .align 8 -.type __cast6_enc_blk8,@function; - __cast6_enc_blk8: /* input: * %rdi: ctx, CTX @@ -295,10 +296,9 @@ __cast6_enc_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; +ENDPROC(__cast6_enc_blk8) .align 8 -.type __cast6_dec_blk8,@function; - __cast6_dec_blk8: /* input: * %rdi: ctx, CTX @@ -341,12 +341,9 @@ __cast6_dec_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; +ENDPROC(__cast6_dec_blk8) -.align 8 -.global cast6_ecb_enc_8way -.type cast6_ecb_enc_8way,@function; - -cast6_ecb_enc_8way: +ENTRY(cast6_ecb_enc_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -362,12 +359,9 @@ cast6_ecb_enc_8way: store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); ret; +ENDPROC(cast6_ecb_enc_8way) -.align 8 -.global cast6_ecb_dec_8way -.type cast6_ecb_dec_8way,@function; - -cast6_ecb_dec_8way: +ENTRY(cast6_ecb_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -383,12 +377,9 @@ cast6_ecb_dec_8way: store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); ret; +ENDPROC(cast6_ecb_dec_8way) -.align 8 -.global cast6_cbc_dec_8way -.type cast6_cbc_dec_8way,@function; - -cast6_cbc_dec_8way: +ENTRY(cast6_cbc_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -409,12 +400,9 @@ cast6_cbc_dec_8way: popq %r12; ret; +ENDPROC(cast6_cbc_dec_8way) -.align 8 -.global cast6_ctr_8way -.type cast6_ctr_8way,@function; - -cast6_ctr_8way: +ENTRY(cast6_ctr_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -437,3 +425,48 @@ cast6_ctr_8way: popq %r12; ret; +ENDPROC(cast6_ctr_8way) + +ENTRY(cast6_xts_enc_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + movq %rsi, %r11; + + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, + RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); + + call __cast6_enc_blk8; + + /* dst <= regs xor IVs(in dst) */ + store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; +ENDPROC(cast6_xts_enc_8way) + +ENTRY(cast6_xts_dec_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + movq %rsi, %r11; + + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, + RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); + + call __cast6_dec_blk8; + + /* dst <= regs xor IVs(in dst) */ + store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; +ENDPROC(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 92f7ca24790..09f3677393e 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -4,6 +4,8 @@ * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> * + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -26,6 +28,7 @@ #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> +#include <crypto/ablk_helper.h> #include <crypto/algapi.h> #include <crypto/cast6.h> #include <crypto/cryptd.h> @@ -35,7 +38,6 @@ #include <crypto/xts.h> #include <asm/xcr.h> #include <asm/xsave.h> -#include <asm/crypto/ablk_helper.h> #include <asm/crypto/glue_helper.h> #define CAST6_PARALLEL_BLOCKS 8 @@ -50,6 +52,23 @@ asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); +asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + +static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, + GLUE_FUNC_CAST(__cast6_encrypt)); +} + +static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, + GLUE_FUNC_CAST(__cast6_decrypt)); +} + static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) { be128 ctrblk; @@ -87,6 +106,19 @@ static const struct common_glue_ctx cast6_ctr = { } } }; +static const struct common_glue_ctx cast6_enc_xts = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) } + } } +}; + static const struct common_glue_ctx cast6_dec = { .num_funcs = 2, .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, @@ -113,6 +145,19 @@ static const struct common_glue_ctx cast6_dec_cbc = { } } }; +static const struct common_glue_ctx cast6_dec_xts = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) } + } } +}; + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -307,54 +352,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAST6_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - cast6_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(__cast6_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAST6_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - cast6_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(__cast6_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } static struct crypto_alg cast6_algs[10] = { { diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S new file mode 100644 index 00000000000..f247304299a --- /dev/null +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -0,0 +1,246 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. + * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> +#include <asm/inst.h> + + +.align 16 +/* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ +.Lconstant_R2R1: + .octa 0x00000001c6e415960000000154442bd4 +/* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ +.Lconstant_R4R3: + .octa 0x00000000ccaa009e00000001751997d0 +/* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ +.Lconstant_R5: + .octa 0x00000000000000000000000163cd6124 +.Lconstant_mask32: + .octa 0x000000000000000000000000FFFFFFFF +/* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ +.Lconstant_RUpoly: + .octa 0x00000001F701164100000001DB710641 + +#define CONSTANT %xmm0 + +#ifdef __x86_64__ +#define BUF %rdi +#define LEN %rsi +#define CRC %edx +#else +#define BUF %eax +#define LEN %edx +#define CRC %ecx +#endif + + + +.text +/** + * Calculate crc32 + * BUF - buffer (16 bytes aligned) + * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pclmul_le_16(unsigned char const *buffer, + * size_t len, uint crc32) + */ + +ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ + movdqa (BUF), %xmm1 + movdqa 0x10(BUF), %xmm2 + movdqa 0x20(BUF), %xmm3 + movdqa 0x30(BUF), %xmm4 + movd CRC, CONSTANT + pxor CONSTANT, %xmm1 + sub $0x40, LEN + add $0x40, BUF +#ifndef __x86_64__ + /* This is for position independent code(-fPIC) support for 32bit */ + call delta +delta: + pop %ecx +#endif + cmp $0x40, LEN + jb less_64 + +#ifdef __x86_64__ + movdqa .Lconstant_R2R1(%rip), CONSTANT +#else + movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT +#endif + +loop_64:/* 64 bytes Full cache line folding */ + prefetchnta 0x40(BUF) + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 +#ifdef __x86_64__ + movdqa %xmm4, %xmm8 +#endif + PCLMULQDQ 00, CONSTANT, %xmm1 + PCLMULQDQ 00, CONSTANT, %xmm2 + PCLMULQDQ 00, CONSTANT, %xmm3 +#ifdef __x86_64__ + PCLMULQDQ 00, CONSTANT, %xmm4 +#endif + PCLMULQDQ 0x11, CONSTANT, %xmm5 + PCLMULQDQ 0x11, CONSTANT, %xmm6 + PCLMULQDQ 0x11, CONSTANT, %xmm7 +#ifdef __x86_64__ + PCLMULQDQ 0x11, CONSTANT, %xmm8 +#endif + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 +#ifdef __x86_64__ + pxor %xmm8, %xmm4 +#else + /* xmm8 unsupported for x32 */ + movdqa %xmm4, %xmm5 + PCLMULQDQ 00, CONSTANT, %xmm4 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm4 +#endif + + pxor (BUF), %xmm1 + pxor 0x10(BUF), %xmm2 + pxor 0x20(BUF), %xmm3 + pxor 0x30(BUF), %xmm4 + + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jge loop_64 +less_64:/* Folding cache line into 128bit */ +#ifdef __x86_64__ + movdqa .Lconstant_R4R3(%rip), CONSTANT +#else + movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT +#endif + prefetchnta (BUF) + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm2, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm3, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + + cmp $0x10, LEN + jb fold_64 +loop_16:/* Folding rest buffer into 128bit */ + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor (BUF), %xmm1 + sub $0x10, LEN + add $0x10, BUF + cmp $0x10, LEN + jge loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + psrldq $0x08, %xmm1 + pxor CONSTANT, %xmm1 + + /* final 32-bit fold */ + movdqa %xmm1, %xmm2 +#ifdef __x86_64__ + movdqa .Lconstant_R5(%rip), CONSTANT + movdqa .Lconstant_mask32(%rip), %xmm3 +#else + movdqa .Lconstant_R5 - delta(%ecx), CONSTANT + movdqa .Lconstant_mask32 - delta(%ecx), %xmm3 +#endif + psrldq $0x04, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +#ifdef __x86_64__ + movdqa .Lconstant_RUpoly(%rip), CONSTANT +#else + movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT +#endif + movdqa %xmm1, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x10, CONSTANT, %xmm1 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + PEXTRD 0x01, %xmm1, %eax + + ret +ENDPROC(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c new file mode 100644 index 00000000000..9d014a74ef9 --- /dev/null +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -0,0 +1,201 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/crc32.h> +#include <crypto/internal/hash.h> + +#include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> +#include <asm/i387.h> + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define PCLMUL_MIN_LEN 64L /* minimum size of buffer + * for crc32_pclmul_le_16 */ +#define SCALE_F 16L /* size of xmm register */ +#define SCALE_F_MASK (SCALE_F - 1) + +u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32); + +static u32 __attribute__((pure)) + crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int iquotient; + unsigned int iremainder; + unsigned int prealign; + + if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable()) + return crc32_le(crc, p, len); + + if ((long)p & SCALE_F_MASK) { + /* align p to 16 byte */ + prealign = SCALE_F - ((long)p & SCALE_F_MASK); + + crc = crc32_le(crc, p, prealign); + len -= prealign; + p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) & + ~SCALE_F_MASK); + } + iquotient = len & (~SCALE_F_MASK); + iremainder = len & SCALE_F_MASK; + + kernel_fpu_begin(); + crc = crc32_pclmul_le_16(p, iquotient, crc); + kernel_fpu_end(); + + if (iremainder) + crc = crc32_le(crc, p + iquotient, iremainder); + + return crc; +} + +static int crc32_pclmul_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + +static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_pclmul_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32_pclmul_le(*crcp, data, len); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len)); + return 0; +} + +static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_pclmul_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(crcp); + return 0; +} + +static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static struct shash_alg alg = { + .setkey = crc32_pclmul_setkey, + .init = crc32_pclmul_init, + .update = crc32_pclmul_update, + .final = crc32_pclmul_final, + .finup = crc32_pclmul_finup, + .digest = crc32_pclmul_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-pclmul", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32_pclmul_cra_init, + } +}; + +static const struct x86_cpu_id crc32pclmul_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id); + + +static int __init crc32_pclmul_mod_init(void) +{ + + if (!x86_match_cpu(crc32pclmul_cpu_id)) { + pr_info("PCLMULQDQ-NI instructions are not detected.\n"); + return -ENODEV; + } + return crypto_register_shash(&alg); +} + +static void __exit crc32_pclmul_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crc32_pclmul_mod_init); +module_exit(crc32_pclmul_mod_fini); + +MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crc32"); +MODULE_ALIAS("crc32-pclmul"); diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 93c6d39237a..dbc4339b541 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -1,9 +1,10 @@ /* * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) * - * The white paper on CRC32C calculations with PCLMULQDQ instruction can be + * The white papers on CRC32C calculations with PCLMULQDQ instruction can be * downloaded from: - * http://download.intel.com/design/intarch/papers/323405.pdf + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf * * Copyright (C) 2012 Intel Corporation. * @@ -42,6 +43,9 @@ * SOFTWARE. */ +#include <asm/inst.h> +#include <linux/linkage.h> + ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction .macro LABEL prefix n @@ -68,8 +72,7 @@ # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); -.global crc_pcl -crc_pcl: +ENTRY(crc_pcl) #define bufp %rdi #define bufp_dw %edi #define bufp_w %di @@ -224,10 +227,10 @@ LABEL crc_ %i movdqa (bufp), %xmm0 # 2 consts: K1:K2 movq crc_init, %xmm1 # CRC for block 1 - pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 + PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 movq crc1, %xmm2 # CRC for block 2 - pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 + PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 pxor %xmm2,%xmm1 movq %xmm1, %rax @@ -323,6 +326,9 @@ JMPTBL_ENTRY %i .noaltmacro i=i+1 .endr + +ENDPROC(crc_pcl) + ################################################################ ## PCLMULQDQ tables ## Table is 128 entries x 2 quad words each diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S new file mode 100644 index 00000000000..35e97569d05 --- /dev/null +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -0,0 +1,643 @@ +######################################################################## +# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +# +# Copyright (c) 2013, Intel Corporation +# +# Authors: +# Erdinc Ozturk <erdinc.ozturk@intel.com> +# Vinodh Gopal <vinodh.gopal@intel.com> +# James Guilford <james.guilford@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +# Function API: +# UINT16 crc_t10dif_pcl( +# UINT16 init_crc, //initial CRC value, 16 bits +# const unsigned char *buf, //buffer pointer to calculate CRC on +# UINT64 len //buffer length in bytes (64-bit data) +# ); +# +# Reference paper titled "Fast CRC Computation for Generic +# Polynomials Using PCLMULQDQ Instruction" +# URL: http://www.intel.com/content/dam/www/public/us/en/documents +# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +# +# + +#include <linux/linkage.h> + +.text + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx + +#define arg1_low32 %edi + +ENTRY(crc_t10dif_pcl) +.align 16 + + # adjust the 16-bit initial_crc value, scale it to 32 bits + shl $16, arg1_low32 + + # Allocate Stack Space + mov %rsp, %rcx + sub $16*2, %rsp + # align stack to 16 byte boundary + and $~(0x10 - 1), %rsp + + # check if smaller than 256 + cmp $256, arg3 + + # for sizes less than 128, we can't fold 64B at a time... + jl _less_than_128 + + + # load the initial crc value + movd arg1_low32, %xmm10 # initial crc + + # crc value does not need to be byte-reflected, but it needs + # to be moved to the high part of the register. + # because data will be byte-reflected and will align with + # initial crc at correct place. + pslldq $12, %xmm10 + + movdqa SHUF_MASK(%rip), %xmm11 + # receive the initial 64B data, xor the initial crc value + movdqu 16*0(arg2), %xmm0 + movdqu 16*1(arg2), %xmm1 + movdqu 16*2(arg2), %xmm2 + movdqu 16*3(arg2), %xmm3 + movdqu 16*4(arg2), %xmm4 + movdqu 16*5(arg2), %xmm5 + movdqu 16*6(arg2), %xmm6 + movdqu 16*7(arg2), %xmm7 + + pshufb %xmm11, %xmm0 + # XOR the initial_crc value + pxor %xmm10, %xmm0 + pshufb %xmm11, %xmm1 + pshufb %xmm11, %xmm2 + pshufb %xmm11, %xmm3 + pshufb %xmm11, %xmm4 + pshufb %xmm11, %xmm5 + pshufb %xmm11, %xmm6 + pshufb %xmm11, %xmm7 + + movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 + #imm value of pclmulqdq instruction + #will determine which constant to use + + ################################################################# + # we subtract 256 instead of 128 to save one instruction from the loop + sub $256, arg3 + + # at this section of the code, there is 64*x+y (0<=y<64) bytes of + # buffer. The _fold_64_B_loop will fold 64B at a time + # until we have 64+y Bytes of buffer + + + # fold 64B at a time. This section of the code folds 4 xmm + # registers in parallel +_fold_64_B_loop: + + # update the buffer pointer + add $128, arg2 # buf += 64# + + movdqu 16*0(arg2), %xmm9 + movdqu 16*1(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm0 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm1 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm0 + xorps %xmm8 , %xmm0 + pxor %xmm12, %xmm1 + xorps %xmm13, %xmm1 + + movdqu 16*2(arg2), %xmm9 + movdqu 16*3(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm2, %xmm8 + movdqa %xmm3, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm2 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm3 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm2 + xorps %xmm8 , %xmm2 + pxor %xmm12, %xmm3 + xorps %xmm13, %xmm3 + + movdqu 16*4(arg2), %xmm9 + movdqu 16*5(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm4, %xmm8 + movdqa %xmm5, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm4 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm5 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm4 + xorps %xmm8 , %xmm4 + pxor %xmm12, %xmm5 + xorps %xmm13, %xmm5 + + movdqu 16*6(arg2), %xmm9 + movdqu 16*7(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm6 , %xmm8 + movdqa %xmm7 , %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm6 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm6 + xorps %xmm8 , %xmm6 + pxor %xmm12, %xmm7 + xorps %xmm13, %xmm7 + + sub $128, arg3 + + # check if there is another 64B in the buffer to be able to fold + jge _fold_64_B_loop + ################################################################## + + + add $128, arg2 + # at this point, the buffer pointer is pointing at the last y Bytes + # of the buffer the 64B of folded data is in 4 of the xmm + # registers: xmm0, xmm1, xmm2, xmm3 + + + # fold the 8 xmm registers to 1 xmm register with different constants + + movdqa rk9(%rip), %xmm10 + movdqa %xmm0, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm0 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm0, %xmm7 + + movdqa rk11(%rip), %xmm10 + movdqa %xmm1, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm1 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm1, %xmm7 + + movdqa rk13(%rip), %xmm10 + movdqa %xmm2, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm2 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + + movdqa rk15(%rip), %xmm10 + movdqa %xmm3, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm3 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm3, %xmm7 + + movdqa rk17(%rip), %xmm10 + movdqa %xmm4, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm4 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm4, %xmm7 + + movdqa rk19(%rip), %xmm10 + movdqa %xmm5, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm5 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm5, %xmm7 + + movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 + #imm value of pclmulqdq instruction + #will determine which constant to use + movdqa %xmm6, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm6 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm6, %xmm7 + + + # instead of 64, we add 48 to the loop counter to save 1 instruction + # from the loop instead of a cmp instruction, we use the negative + # flag with the jl instruction + add $128-16, arg3 + jl _final_reduction_for_128 + + # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 + # and the rest is in memory. We can fold 16 bytes at a time if y>=16 + # continue folding 16B at a time + +_16B_reduction_loop: + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + movdqu (arg2), %xmm0 + pshufb %xmm11, %xmm0 + pxor %xmm0 , %xmm7 + add $16, arg2 + sub $16, arg3 + # instead of a cmp instruction, we utilize the flags with the + # jge instruction equivalent of: cmp arg3, 16-16 + # check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + #now we have 16+z bytes left to reduce, where 0<= z < 16. + #first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + # check if any more data to fold. If not, compute the CRC of + # the final 128 bits + add $16, arg3 + je _128_done + + # here we are getting data that is less than 16 bytes. + # since we know that there was data before the pointer, we can + # offset the input pointer before the actual point, to receive + # exactly 16 bytes. after that the registers need to be adjusted. +_get_last_two_xmms: + movdqa %xmm7, %xmm2 + + movdqu -16(arg2, arg3), %xmm1 + pshufb %xmm11, %xmm1 + + # get rid of the extra data that was loaded before + # load the shift constant + lea pshufb_shf_table+16(%rip), %rax + sub arg3, %rax + movdqu (%rax), %xmm0 + + # shift xmm2 to the left by arg3 bytes + pshufb %xmm0, %xmm2 + + # shift xmm7 to the right by 16-arg3 bytes + pxor mask1(%rip), %xmm0 + pshufb %xmm0, %xmm7 + pblendvb %xmm2, %xmm1 #xmm0 is implicit + + # fold 16 Bytes + movdqa %xmm1, %xmm2 + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + +_128_done: + # compute crc of a 128-bit value + movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 + movdqa %xmm7, %xmm0 + + #64b fold + pclmulqdq $0x1, %xmm10, %xmm7 + pslldq $8 , %xmm0 + pxor %xmm0, %xmm7 + + #32b fold + movdqa %xmm7, %xmm0 + + pand mask2(%rip), %xmm0 + + psrldq $12, %xmm7 + pclmulqdq $0x10, %xmm10, %xmm7 + pxor %xmm0, %xmm7 + + #barrett reduction +_barrett: + movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 + movdqa %xmm7, %xmm0 + pclmulqdq $0x01, %xmm10, %xmm7 + pslldq $4, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm7 + + pslldq $4, %xmm7 + pxor %xmm0, %xmm7 + pextrd $1, %xmm7, %eax + +_cleanup: + # scale the result back to 16 bits + shr $16, %eax + mov %rcx, %rsp + ret + +######################################################################## + +.align 16 +_less_than_128: + + # check if there is enough buffer to be able to fold 16B at a time + cmp $32, arg3 + jl _less_than_32 + movdqa SHUF_MASK(%rip), %xmm11 + + # now if there is, load the constants + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0, %xmm7 + + + # update the buffer pointer + add $16, arg2 + + # update the counter. subtract 32 instead of 16 to save one + # instruction from the loop + sub $32, arg3 + + jmp _16B_reduction_loop + + +.align 16 +_less_than_32: + # mov initial crc to the return value. this is necessary for + # zero-length buffers. + mov arg1_low32, %eax + test arg3, arg3 + je _cleanup + + movdqa SHUF_MASK(%rip), %xmm11 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + + cmp $16, arg3 + je _exact_16_left + jl _less_than_16_left + + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0 , %xmm7 # xor the initial crc value + add $16, arg2 + sub $16, arg3 + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + jmp _get_last_two_xmms + + +.align 16 +_less_than_16_left: + # use stack space to load data less than 16 bytes, zero-out + # the 16B in memory first. + + pxor %xmm1, %xmm1 + mov %rsp, %r11 + movdqa %xmm1, (%r11) + + cmp $4, arg3 + jl _only_less_than_4 + + # backup the counter value + mov arg3, %r9 + cmp $8, arg3 + jl _less_than_8_left + + # load 8 Bytes + mov (arg2), %rax + mov %rax, (%r11) + add $8, %r11 + sub $8, arg3 + add $8, arg2 +_less_than_8_left: + + cmp $4, arg3 + jl _less_than_4_left + + # load 4 Bytes + mov (arg2), %eax + mov %eax, (%r11) + add $4, %r11 + sub $4, arg3 + add $4, arg2 +_less_than_4_left: + + cmp $2, arg3 + jl _less_than_2_left + + # load 2 Bytes + mov (arg2), %ax + mov %ax, (%r11) + add $2, %r11 + sub $2, arg3 + add $2, arg2 +_less_than_2_left: + cmp $1, arg3 + jl _zero_left + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) +_zero_left: + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + # shl r9, 4 + lea pshufb_shf_table+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), %xmm0 + pxor mask1(%rip), %xmm0 + + pshufb %xmm0, %xmm7 + jmp _128_done + +.align 16 +_exact_16_left: + movdqu (arg2), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + jmp _128_done + +_only_less_than_4: + cmp $3, arg3 + jl _only_less_than_3 + + # load 3 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + mov 2(arg2), %al + mov %al, 2(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $5, %xmm7 + + jmp _barrett +_only_less_than_3: + cmp $2, arg3 + jl _only_less_than_2 + + # load 2 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $6, %xmm7 + + jmp _barrett +_only_less_than_2: + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $7, %xmm7 + + jmp _barrett + +ENDPROC(crc_t10dif_pcl) + +.data + +# precomputed constants +# these constants are precomputed from the poly: +# 0x8bb70000 (0x8bb7 scaled to 32 bits) +.align 16 +# Q = 0x18BB70000 +# rk1 = 2^(32*3) mod Q << 32 +# rk2 = 2^(32*5) mod Q << 32 +# rk3 = 2^(32*15) mod Q << 32 +# rk4 = 2^(32*17) mod Q << 32 +# rk5 = 2^(32*3) mod Q << 32 +# rk6 = 2^(32*2) mod Q << 32 +# rk7 = floor(2^64/Q) +# rk8 = Q +rk1: +.quad 0x2d56000000000000 +rk2: +.quad 0x06df000000000000 +rk3: +.quad 0x9d9d000000000000 +rk4: +.quad 0x7cf5000000000000 +rk5: +.quad 0x2d56000000000000 +rk6: +.quad 0x1368000000000000 +rk7: +.quad 0x00000001f65a57f8 +rk8: +.quad 0x000000018bb70000 + +rk9: +.quad 0xceae000000000000 +rk10: +.quad 0xbfd6000000000000 +rk11: +.quad 0x1e16000000000000 +rk12: +.quad 0x713c000000000000 +rk13: +.quad 0xf7f9000000000000 +rk14: +.quad 0x80a6000000000000 +rk15: +.quad 0x044c000000000000 +rk16: +.quad 0xe658000000000000 +rk17: +.quad 0xad18000000000000 +rk18: +.quad 0xa497000000000000 +rk19: +.quad 0x6ee3000000000000 +rk20: +.quad 0xe7b5000000000000 + + + +mask1: +.octa 0x80808080808080808080808080808080 +mask2: +.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + +SHUF_MASK: +.octa 0x000102030405060708090A0B0C0D0E0F + +pshufb_shf_table: +# use these values for shift constants for the pshufb instruction +# different alignments result in values as shown: +# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 +# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2 +# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3 +# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 +# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 +# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 +# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 +# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 +# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 +# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 +# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 +# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 +# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 +# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 +# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 +.octa 0x8f8e8d8c8b8a89888786858483828100 +.octa 0x000e0d0c0b0a09080706050403020100 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c new file mode 100644 index 00000000000..7845d7fd54c --- /dev/null +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -0,0 +1,151 @@ +/* + * Cryptographic API. + * + * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions + * + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/crc-t10dif.h> +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <asm/i387.h> +#include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> + +asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, + size_t len); + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + if (irq_fpu_usable()) { + kernel_fpu_begin(); + ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); + kernel_fpu_end(); + } else + ctx->crc = crc_t10dif_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (irq_fpu_usable()) { + kernel_fpu_begin(); + *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + kernel_fpu_end(); + } else + *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-pclmul", + .cra_priority = 200, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static const struct x86_cpu_id crct10dif_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); + +static int __init crct10dif_intel_mod_init(void) +{ + if (!x86_match_cpu(crct10dif_cpu_id)) + return -ENODEV; + + return crypto_register_shash(&alg); +} + +static void __exit crct10dif_intel_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_intel_mod_init); +module_exit(crct10dif_intel_mod_fini); + +MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crct10dif"); +MODULE_ALIAS("crct10dif-pclmul"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 1eb7f90cb7b..5d1e0075ac2 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -24,10 +24,6 @@ .align 16 .Lbswap_mask: .octa 0x000102030405060708090a0b0c0d0e0f -.Lpoly: - .octa 0xc2000000000000000000000000000001 -.Ltwo_one: - .octa 0x00000001000000000000000000000001 #define DATA %xmm0 #define SHASH %xmm1 @@ -94,8 +90,9 @@ __clmul_gf128mul_ble: pxor T2, T1 pxor T1, DATA ret +ENDPROC(__clmul_gf128mul_ble) -/* void clmul_ghash_mul(char *dst, const be128 *shash) */ +/* void clmul_ghash_mul(char *dst, const u128 *shash) */ ENTRY(clmul_ghash_mul) movups (%rdi), DATA movups (%rsi), SHASH @@ -105,10 +102,11 @@ ENTRY(clmul_ghash_mul) PSHUFB_XMM BSWAP DATA movups DATA, (%rdi) ret +ENDPROC(clmul_ghash_mul) /* * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - * const be128 *shash); + * const u128 *shash); */ ENTRY(clmul_ghash_update) cmp $16, %rdx @@ -131,27 +129,4 @@ ENTRY(clmul_ghash_update) movups DATA, (%rdi) .Lupdate_just_ret: ret - -/* - * void clmul_ghash_setkey(be128 *shash, const u8 *key); - * - * Calculate hash_key << 1 mod poly - */ -ENTRY(clmul_ghash_setkey) - movaps .Lbswap_mask, BSWAP - movups (%rsi), %xmm0 - PSHUFB_XMM BSWAP %xmm0 - movaps %xmm0, %xmm1 - psllq $1, %xmm0 - psrlq $63, %xmm1 - movaps %xmm1, %xmm2 - pslldq $8, %xmm1 - psrldq $8, %xmm2 - por %xmm1, %xmm0 - # reduction - pshufd $0b00100100, %xmm2, %xmm1 - pcmpeqd .Ltwo_one, %xmm1 - pand .Lpoly, %xmm1 - pxor %xmm1, %xmm0 - movups %xmm0, (%rdi) - ret +ENDPROC(clmul_ghash_update) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 6759dd1135b..88bb7ba8b17 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -25,19 +25,17 @@ #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 -void clmul_ghash_mul(char *dst, const be128 *shash); +void clmul_ghash_mul(char *dst, const u128 *shash); void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - const be128 *shash); - -void clmul_ghash_setkey(be128 *shash, const u8 *key); + const u128 *shash); struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; struct ghash_ctx { - be128 shash; + u128 shash; }; struct ghash_desc_ctx { @@ -58,13 +56,23 @@ static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + be128 *x = (be128 *)key; + u64 a, b; if (keylen != GHASH_BLOCK_SIZE) { crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } - clmul_ghash_setkey(&ctx->shash, key); + /* perform multiplication by 'x' in GF(2^128) */ + a = be64_to_cpu(x->a); + b = be64_to_cpu(x->b); + + ctx->shash.a = (b << 1) | (a >> 63); + ctx->shash.b = (a << 1) | (b >> 63); + + if (a >> 63) + ctx->shash.b ^= ((u64)0xc2) << 56; return 0; } diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S index f7b6ea2ddfd..02ee2308fb3 100644 --- a/arch/x86/crypto/glue_helper-asm-avx.S +++ b/arch/x86/crypto/glue_helper-asm-avx.S @@ -1,7 +1,7 @@ /* * Shared glue code for 128bit block ciphers, AVX assembler macros * - * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -89,3 +89,62 @@ vpxor (6*16)(src), x6, x6; \ vpxor (7*16)(src), x7, x7; \ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); + +#define gf128mul_x_ble(iv, mask, tmp) \ + vpsrad $31, iv, tmp; \ + vpaddq iv, iv, iv; \ + vpshufd $0x13, tmp, tmp; \ + vpand mask, tmp, tmp; \ + vpxor tmp, iv, iv; + +#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \ + t1, xts_gf128mul_and_shl1_mask) \ + vmovdqa xts_gf128mul_and_shl1_mask, t0; \ + \ + /* load IV */ \ + vmovdqu (iv), tiv; \ + vpxor (0*16)(src), tiv, x0; \ + vmovdqu tiv, (0*16)(dst); \ + \ + /* construct and store IVs, also xor with source */ \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (1*16)(src), tiv, x1; \ + vmovdqu tiv, (1*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (2*16)(src), tiv, x2; \ + vmovdqu tiv, (2*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (3*16)(src), tiv, x3; \ + vmovdqu tiv, (3*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (4*16)(src), tiv, x4; \ + vmovdqu tiv, (4*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (5*16)(src), tiv, x5; \ + vmovdqu tiv, (5*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (6*16)(src), tiv, x6; \ + vmovdqu tiv, (6*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (7*16)(src), tiv, x7; \ + vmovdqu tiv, (7*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vmovdqu tiv, (iv); + +#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vpxor (0*16)(dst), x0, x0; \ + vpxor (1*16)(dst), x1, x1; \ + vpxor (2*16)(dst), x2, x2; \ + vpxor (3*16)(dst), x3, x3; \ + vpxor (4*16)(dst), x4, x4; \ + vpxor (5*16)(dst), x5, x5; \ + vpxor (6*16)(dst), x6, x6; \ + vpxor (7*16)(dst), x7, x7; \ + store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S new file mode 100644 index 00000000000..a53ac11dd38 --- /dev/null +++ b/arch/x86/crypto/glue_helper-asm-avx2.S @@ -0,0 +1,180 @@ +/* + * Shared glue code for 128bit block ciphers, AVX2 assembler macros + * + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \ + vmovdqu (0*32)(src), x0; \ + vmovdqu (1*32)(src), x1; \ + vmovdqu (2*32)(src), x2; \ + vmovdqu (3*32)(src), x3; \ + vmovdqu (4*32)(src), x4; \ + vmovdqu (5*32)(src), x5; \ + vmovdqu (6*32)(src), x6; \ + vmovdqu (7*32)(src), x7; + +#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vmovdqu x0, (0*32)(dst); \ + vmovdqu x1, (1*32)(dst); \ + vmovdqu x2, (2*32)(dst); \ + vmovdqu x3, (3*32)(dst); \ + vmovdqu x4, (4*32)(dst); \ + vmovdqu x5, (5*32)(dst); \ + vmovdqu x6, (6*32)(dst); \ + vmovdqu x7, (7*32)(dst); + +#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \ + vpxor t0, t0, t0; \ + vinserti128 $1, (src), t0, t0; \ + vpxor t0, x0, x0; \ + vpxor (0*32+16)(src), x1, x1; \ + vpxor (1*32+16)(src), x2, x2; \ + vpxor (2*32+16)(src), x3, x3; \ + vpxor (3*32+16)(src), x4, x4; \ + vpxor (4*32+16)(src), x5, x5; \ + vpxor (5*32+16)(src), x6, x6; \ + vpxor (6*32+16)(src), x7, x7; \ + store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ + vpcmpeqq minus_one, x, tmp1; \ + vpcmpeqq minus_two, x, tmp2; \ + vpsubq minus_two, x, x; \ + vpor tmp2, tmp1, tmp1; \ + vpslldq $8, tmp1, tmp1; \ + vpsubq tmp1, x, x; + +#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \ + t1x, t2, t2x, t3, t3x, t4, t5) \ + vpcmpeqd t0, t0, t0; \ + vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \ + vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\ + \ + /* load IV and byteswap */ \ + vmovdqu (iv), t2x; \ + vmovdqa t2x, t3x; \ + inc_le128(t2x, t0x, t1x); \ + vbroadcasti128 bswap, t1; \ + vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \ + vpshufb t1, t2, x0; \ + \ + /* construct IVs */ \ + add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \ + vpshufb t1, t2, x1; \ + add2_le128(t2, t0, t4, t3, t5); \ + vpshufb t1, t2, x2; \ + add2_le128(t2, t0, t4, t3, t5); \ + vpshufb t1, t2, x3; \ + add2_le128(t2, t0, t4, t3, t5); \ + vpshufb t1, t2, x4; \ + add2_le128(t2, t0, t4, t3, t5); \ + vpshufb t1, t2, x5; \ + add2_le128(t2, t0, t4, t3, t5); \ + vpshufb t1, t2, x6; \ + add2_le128(t2, t0, t4, t3, t5); \ + vpshufb t1, t2, x7; \ + vextracti128 $1, t2, t2x; \ + inc_le128(t2x, t0x, t3x); \ + vmovdqu t2x, (iv); + +#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vpxor (0*32)(src), x0, x0; \ + vpxor (1*32)(src), x1, x1; \ + vpxor (2*32)(src), x2, x2; \ + vpxor (3*32)(src), x3, x3; \ + vpxor (4*32)(src), x4, x4; \ + vpxor (5*32)(src), x5, x5; \ + vpxor (6*32)(src), x6, x6; \ + vpxor (7*32)(src), x7, x7; \ + store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); + +#define gf128mul_x_ble(iv, mask, tmp) \ + vpsrad $31, iv, tmp; \ + vpaddq iv, iv, iv; \ + vpshufd $0x13, tmp, tmp; \ + vpand mask, tmp, tmp; \ + vpxor tmp, iv, iv; + +#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ + vpsrad $31, iv, tmp0; \ + vpaddq iv, iv, tmp1; \ + vpsllq $2, iv, iv; \ + vpshufd $0x13, tmp0, tmp0; \ + vpsrad $31, tmp1, tmp1; \ + vpand mask2, tmp0, tmp0; \ + vpshufd $0x13, tmp1, tmp1; \ + vpxor tmp0, iv, iv; \ + vpand mask1, tmp1, tmp1; \ + vpxor tmp1, iv, iv; + +#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \ + tivx, t0, t0x, t1, t1x, t2, t2x, t3, \ + xts_gf128mul_and_shl1_mask_0, \ + xts_gf128mul_and_shl1_mask_1) \ + vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \ + \ + /* load IV and construct second IV */ \ + vmovdqu (iv), tivx; \ + vmovdqa tivx, t0x; \ + gf128mul_x_ble(tivx, t1x, t2x); \ + vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \ + vinserti128 $1, tivx, t0, tiv; \ + vpxor (0*32)(src), tiv, x0; \ + vmovdqu tiv, (0*32)(dst); \ + \ + /* construct and store IVs, also xor with source */ \ + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ + vpxor (1*32)(src), tiv, x1; \ + vmovdqu tiv, (1*32)(dst); \ + \ + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ + vpxor (2*32)(src), tiv, x2; \ + vmovdqu tiv, (2*32)(dst); \ + \ + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ + vpxor (3*32)(src), tiv, x3; \ + vmovdqu tiv, (3*32)(dst); \ + \ + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ + vpxor (4*32)(src), tiv, x4; \ + vmovdqu tiv, (4*32)(dst); \ + \ + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ + vpxor (5*32)(src), tiv, x5; \ + vmovdqu tiv, (5*32)(dst); \ + \ + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ + vpxor (6*32)(src), tiv, x6; \ + vmovdqu tiv, (6*32)(dst); \ + \ + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ + vpxor (7*32)(src), tiv, x7; \ + vmovdqu tiv, (7*32)(dst); \ + \ + vextracti128 $1, tiv, tivx; \ + gf128mul_x_ble(tivx, t1x, t2x); \ + vmovdqu tivx, (iv); + +#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vpxor (0*32)(dst), x0, x0; \ + vpxor (1*32)(dst), x1, x1; \ + vpxor (2*32)(dst), x2, x2; \ + vpxor (3*32)(dst), x3, x3; \ + vpxor (4*32)(dst), x4, x4; \ + vpxor (5*32)(dst), x5, x5; \ + vpxor (6*32)(dst), x6, x6; \ + vpxor (7*32)(dst), x7, x7; \ + store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 22ce4f683e5..432f1d76ceb 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -1,7 +1,7 @@ /* * Shared glue code for 128bit block ciphers * - * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> @@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, } EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); +static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + void *ctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + unsigned int num_blocks, func_bytes; + unsigned int i; + + /* Process multi-block batch */ + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.xts(ctx, dst, src, + (le128 *)walk->iv); + + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + return nbytes; +} + +/* for implementations implementing faster XTS IV generator */ +int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes, + void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src), + void *tweak_ctx, void *crypt_ctx) +{ + const unsigned int bsize = 128 / 8; + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + + err = blkcipher_walk_virt(desc, &walk); + nbytes = walk.nbytes; + if (!nbytes) + return err; + + /* set minimum length to bsize, for tweak_fn */ + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, + nbytes < bsize ? bsize : nbytes); + + /* calculate first value of T */ + tweak_fn(tweak_ctx, walk.iv, walk.iv); + + while (nbytes) { + nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); + + err = blkcipher_walk_done(desc, &walk, nbytes); + nbytes = walk.nbytes; + } + + glue_fpu_end(fpu_enabled); + + return err; +} +EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); + +void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, + common_glue_func_t fn) +{ + le128 ivblk = *iv; + + /* generate next IV */ + le128_gf128mul_x_ble(iv, &ivblk); + + /* CC <- T xor C */ + u128_xor(dst, src, (u128 *)&ivblk); + + /* PP <- D(Key2,CC) */ + fn(ctx, (u8 *)dst, (u8 *)dst); + + /* P <- T xor PP */ + u128_xor(dst, dst, (u128 *)&ivblk); +} +EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); + MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S index 72eb306680b..329452b8f79 100644 --- a/arch/x86/crypto/salsa20-i586-asm_32.S +++ b/arch/x86/crypto/salsa20-i586-asm_32.S @@ -2,11 +2,12 @@ # D. J. Bernstein # Public domain. -# enter ECRYPT_encrypt_bytes +#include <linux/linkage.h> + .text -.p2align 5 -.globl ECRYPT_encrypt_bytes -ECRYPT_encrypt_bytes: + +# enter salsa20_encrypt_bytes +ENTRY(salsa20_encrypt_bytes) mov %esp,%eax and $31,%eax add $256,%eax @@ -933,11 +934,10 @@ ECRYPT_encrypt_bytes: add $64,%esi # goto bytesatleast1 jmp ._bytesatleast1 -# enter ECRYPT_keysetup -.text -.p2align 5 -.globl ECRYPT_keysetup -ECRYPT_keysetup: +ENDPROC(salsa20_encrypt_bytes) + +# enter salsa20_keysetup +ENTRY(salsa20_keysetup) mov %esp,%eax and $31,%eax add $256,%eax @@ -1060,11 +1060,10 @@ ECRYPT_keysetup: # leave add %eax,%esp ret -# enter ECRYPT_ivsetup -.text -.p2align 5 -.globl ECRYPT_ivsetup -ECRYPT_ivsetup: +ENDPROC(salsa20_keysetup) + +# enter salsa20_ivsetup +ENTRY(salsa20_ivsetup) mov %esp,%eax and $31,%eax add $256,%eax @@ -1112,3 +1111,4 @@ ECRYPT_ivsetup: # leave add %eax,%esp ret +ENDPROC(salsa20_ivsetup) diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S index 6214a9b0970..9279e0b2d60 100644 --- a/arch/x86/crypto/salsa20-x86_64-asm_64.S +++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S @@ -1,8 +1,7 @@ -# enter ECRYPT_encrypt_bytes -.text -.p2align 5 -.globl ECRYPT_encrypt_bytes -ECRYPT_encrypt_bytes: +#include <linux/linkage.h> + +# enter salsa20_encrypt_bytes +ENTRY(salsa20_encrypt_bytes) mov %rsp,%r11 and $31,%r11 add $256,%r11 @@ -802,11 +801,10 @@ ECRYPT_encrypt_bytes: # comment:fp stack unchanged by jump # goto bytesatleast1 jmp ._bytesatleast1 -# enter ECRYPT_keysetup -.text -.p2align 5 -.globl ECRYPT_keysetup -ECRYPT_keysetup: +ENDPROC(salsa20_encrypt_bytes) + +# enter salsa20_keysetup +ENTRY(salsa20_keysetup) mov %rsp,%r11 and $31,%r11 add $256,%r11 @@ -892,11 +890,10 @@ ECRYPT_keysetup: mov %rdi,%rax mov %rsi,%rdx ret -# enter ECRYPT_ivsetup -.text -.p2align 5 -.globl ECRYPT_ivsetup -ECRYPT_ivsetup: +ENDPROC(salsa20_keysetup) + +# enter salsa20_ivsetup +ENTRY(salsa20_ivsetup) mov %rsp,%r11 and $31,%r11 add $256,%r11 @@ -918,3 +915,4 @@ ECRYPT_ivsetup: mov %rdi,%rax mov %rsi,%rdx ret +ENDPROC(salsa20_ivsetup) diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index a3a3c0205c1..5e8e67739bb 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -26,11 +26,6 @@ #define SALSA20_MIN_KEY_SIZE 16U #define SALSA20_MAX_KEY_SIZE 32U -// use the ECRYPT_* function names -#define salsa20_keysetup ECRYPT_keysetup -#define salsa20_ivsetup ECRYPT_ivsetup -#define salsa20_encrypt_bytes ECRYPT_encrypt_bytes - struct salsa20_ctx { u32 input[16]; diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 02b0e9fe997..2f202f49872 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -4,8 +4,7 @@ * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> * - * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by - * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -24,6 +23,7 @@ * */ +#include <linux/linkage.h> #include "glue_helper-asm-avx.S" .file "serpent-avx-x86_64-asm_64.S" @@ -33,6 +33,8 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lxts_gf128mul_and_shl1_mask: + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .text @@ -566,8 +568,6 @@ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) .align 8 -.type __serpent_enc_blk8_avx,@function; - __serpent_enc_blk8_avx: /* input: * %rdi: ctx, CTX @@ -619,10 +619,9 @@ __serpent_enc_blk8_avx: write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; +ENDPROC(__serpent_enc_blk8_avx) .align 8 -.type __serpent_dec_blk8_avx,@function; - __serpent_dec_blk8_avx: /* input: * %rdi: ctx, CTX @@ -674,12 +673,9 @@ __serpent_dec_blk8_avx: write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; +ENDPROC(__serpent_dec_blk8_avx) -.align 8 -.global serpent_ecb_enc_8way_avx -.type serpent_ecb_enc_8way_avx,@function; - -serpent_ecb_enc_8way_avx: +ENTRY(serpent_ecb_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -693,12 +689,9 @@ serpent_ecb_enc_8way_avx: store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); ret; +ENDPROC(serpent_ecb_enc_8way_avx) -.align 8 -.global serpent_ecb_dec_8way_avx -.type serpent_ecb_dec_8way_avx,@function; - -serpent_ecb_dec_8way_avx: +ENTRY(serpent_ecb_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -712,12 +705,9 @@ serpent_ecb_dec_8way_avx: store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); ret; +ENDPROC(serpent_ecb_dec_8way_avx) -.align 8 -.global serpent_cbc_dec_8way_avx -.type serpent_cbc_dec_8way_avx,@function; - -serpent_cbc_dec_8way_avx: +ENTRY(serpent_cbc_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -731,12 +721,9 @@ serpent_cbc_dec_8way_avx: store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); ret; +ENDPROC(serpent_cbc_dec_8way_avx) -.align 8 -.global serpent_ctr_8way_avx -.type serpent_ctr_8way_avx,@function; - -serpent_ctr_8way_avx: +ENTRY(serpent_ctr_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -752,3 +739,44 @@ serpent_ctr_8way_avx: store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); ret; +ENDPROC(serpent_ctr_8way_avx) + +ENTRY(serpent_xts_enc_8way_avx) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, + RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); + + call __serpent_enc_blk8_avx; + + /* dst <= regs xor IVs(in dst) */ + store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; +ENDPROC(serpent_xts_enc_8way_avx) + +ENTRY(serpent_xts_dec_8way_avx) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, + RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); + + call __serpent_dec_blk8_avx; + + /* dst <= regs xor IVs(in dst) */ + store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + + ret; +ENDPROC(serpent_xts_dec_8way_avx) diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S new file mode 100644 index 00000000000..b222085ccca --- /dev/null +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -0,0 +1,800 @@ +/* + * x86_64/AVX2 assembler optimized version of Serpent + * + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * Based on AVX assembler implementation of Serpent by: + * Copyright © 2012 Johannes Goetzfried + * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include <linux/linkage.h> +#include "glue_helper-asm-avx2.S" + +.file "serpent-avx2-asm_64.S" + +.data +.align 16 + +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lxts_gf128mul_and_shl1_mask_0: + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 +.Lxts_gf128mul_and_shl1_mask_1: + .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 + +.text + +#define CTX %rdi + +#define RNOT %ymm0 +#define tp %ymm1 + +#define RA1 %ymm2 +#define RA2 %ymm3 +#define RB1 %ymm4 +#define RB2 %ymm5 +#define RC1 %ymm6 +#define RC2 %ymm7 +#define RD1 %ymm8 +#define RD2 %ymm9 +#define RE1 %ymm10 +#define RE2 %ymm11 + +#define RK0 %ymm12 +#define RK1 %ymm13 +#define RK2 %ymm14 +#define RK3 %ymm15 + +#define RK0x %xmm12 +#define RK1x %xmm13 +#define RK2x %xmm14 +#define RK3x %xmm15 + +#define S0_1(x0, x1, x2, x3, x4) \ + vpor x0, x3, tp; \ + vpxor x3, x0, x0; \ + vpxor x2, x3, x4; \ + vpxor RNOT, x4, x4; \ + vpxor x1, tp, x3; \ + vpand x0, x1, x1; \ + vpxor x4, x1, x1; \ + vpxor x0, x2, x2; +#define S0_2(x0, x1, x2, x3, x4) \ + vpxor x3, x0, x0; \ + vpor x0, x4, x4; \ + vpxor x2, x0, x0; \ + vpand x1, x2, x2; \ + vpxor x2, x3, x3; \ + vpxor RNOT, x1, x1; \ + vpxor x4, x2, x2; \ + vpxor x2, x1, x1; + +#define S1_1(x0, x1, x2, x3, x4) \ + vpxor x0, x1, tp; \ + vpxor x3, x0, x0; \ + vpxor RNOT, x3, x3; \ + vpand tp, x1, x4; \ + vpor tp, x0, x0; \ + vpxor x2, x3, x3; \ + vpxor x3, x0, x0; \ + vpxor x3, tp, x1; +#define S1_2(x0, x1, x2, x3, x4) \ + vpxor x4, x3, x3; \ + vpor x4, x1, x1; \ + vpxor x2, x4, x4; \ + vpand x0, x2, x2; \ + vpxor x1, x2, x2; \ + vpor x0, x1, x1; \ + vpxor RNOT, x0, x0; \ + vpxor x2, x0, x0; \ + vpxor x1, x4, x4; + +#define S2_1(x0, x1, x2, x3, x4) \ + vpxor RNOT, x3, x3; \ + vpxor x0, x1, x1; \ + vpand x2, x0, tp; \ + vpxor x3, tp, tp; \ + vpor x0, x3, x3; \ + vpxor x1, x2, x2; \ + vpxor x1, x3, x3; \ + vpand tp, x1, x1; +#define S2_2(x0, x1, x2, x3, x4) \ + vpxor x2, tp, tp; \ + vpand x3, x2, x2; \ + vpor x1, x3, x3; \ + vpxor RNOT, tp, tp; \ + vpxor tp, x3, x3; \ + vpxor tp, x0, x4; \ + vpxor x2, tp, x0; \ + vpor x2, x1, x1; + +#define S3_1(x0, x1, x2, x3, x4) \ + vpxor x3, x1, tp; \ + vpor x0, x3, x3; \ + vpand x0, x1, x4; \ + vpxor x2, x0, x0; \ + vpxor tp, x2, x2; \ + vpand x3, tp, x1; \ + vpxor x3, x2, x2; \ + vpor x4, x0, x0; \ + vpxor x3, x4, x4; +#define S3_2(x0, x1, x2, x3, x4) \ + vpxor x0, x1, x1; \ + vpand x3, x0, x0; \ + vpand x4, x3, x3; \ + vpxor x2, x3, x3; \ + vpor x1, x4, x4; \ + vpand x1, x2, x2; \ + vpxor x3, x4, x4; \ + vpxor x3, x0, x0; \ + vpxor x2, x3, x3; + +#define S4_1(x0, x1, x2, x3, x4) \ + vpand x0, x3, tp; \ + vpxor x3, x0, x0; \ + vpxor x2, tp, tp; \ + vpor x3, x2, x2; \ + vpxor x1, x0, x0; \ + vpxor tp, x3, x4; \ + vpor x0, x2, x2; \ + vpxor x1, x2, x2; +#define S4_2(x0, x1, x2, x3, x4) \ + vpand x0, x1, x1; \ + vpxor x4, x1, x1; \ + vpand x2, x4, x4; \ + vpxor tp, x2, x2; \ + vpxor x0, x4, x4; \ + vpor x1, tp, x3; \ + vpxor RNOT, x1, x1; \ + vpxor x0, x3, x3; + +#define S5_1(x0, x1, x2, x3, x4) \ + vpor x0, x1, tp; \ + vpxor tp, x2, x2; \ + vpxor RNOT, x3, x3; \ + vpxor x0, x1, x4; \ + vpxor x2, x0, x0; \ + vpand x4, tp, x1; \ + vpor x3, x4, x4; \ + vpxor x0, x4, x4; +#define S5_2(x0, x1, x2, x3, x4) \ + vpand x3, x0, x0; \ + vpxor x3, x1, x1; \ + vpxor x2, x3, x3; \ + vpxor x1, x0, x0; \ + vpand x4, x2, x2; \ + vpxor x2, x1, x1; \ + vpand x0, x2, x2; \ + vpxor x2, x3, x3; + +#define S6_1(x0, x1, x2, x3, x4) \ + vpxor x0, x3, x3; \ + vpxor x2, x1, tp; \ + vpxor x0, x2, x2; \ + vpand x3, x0, x0; \ + vpor x3, tp, tp; \ + vpxor RNOT, x1, x4; \ + vpxor tp, x0, x0; \ + vpxor x2, tp, x1; +#define S6_2(x0, x1, x2, x3, x4) \ + vpxor x4, x3, x3; \ + vpxor x0, x4, x4; \ + vpand x0, x2, x2; \ + vpxor x1, x4, x4; \ + vpxor x3, x2, x2; \ + vpand x1, x3, x3; \ + vpxor x0, x3, x3; \ + vpxor x2, x1, x1; + +#define S7_1(x0, x1, x2, x3, x4) \ + vpxor RNOT, x1, tp; \ + vpxor RNOT, x0, x0; \ + vpand x2, tp, x1; \ + vpxor x3, x1, x1; \ + vpor tp, x3, x3; \ + vpxor x2, tp, x4; \ + vpxor x3, x2, x2; \ + vpxor x0, x3, x3; \ + vpor x1, x0, x0; +#define S7_2(x0, x1, x2, x3, x4) \ + vpand x0, x2, x2; \ + vpxor x4, x0, x0; \ + vpxor x3, x4, x4; \ + vpand x0, x3, x3; \ + vpxor x1, x4, x4; \ + vpxor x4, x2, x2; \ + vpxor x1, x3, x3; \ + vpor x0, x4, x4; \ + vpxor x1, x4, x4; + +#define SI0_1(x0, x1, x2, x3, x4) \ + vpxor x0, x1, x1; \ + vpor x1, x3, tp; \ + vpxor x1, x3, x4; \ + vpxor RNOT, x0, x0; \ + vpxor tp, x2, x2; \ + vpxor x0, tp, x3; \ + vpand x1, x0, x0; \ + vpxor x2, x0, x0; +#define SI0_2(x0, x1, x2, x3, x4) \ + vpand x3, x2, x2; \ + vpxor x4, x3, x3; \ + vpxor x3, x2, x2; \ + vpxor x3, x1, x1; \ + vpand x0, x3, x3; \ + vpxor x0, x1, x1; \ + vpxor x2, x0, x0; \ + vpxor x3, x4, x4; + +#define SI1_1(x0, x1, x2, x3, x4) \ + vpxor x3, x1, x1; \ + vpxor x2, x0, tp; \ + vpxor RNOT, x2, x2; \ + vpor x1, x0, x4; \ + vpxor x3, x4, x4; \ + vpand x1, x3, x3; \ + vpxor x2, x1, x1; \ + vpand x4, x2, x2; +#define SI1_2(x0, x1, x2, x3, x4) \ + vpxor x1, x4, x4; \ + vpor x3, x1, x1; \ + vpxor tp, x3, x3; \ + vpxor tp, x2, x2; \ + vpor x4, tp, x0; \ + vpxor x4, x2, x2; \ + vpxor x0, x1, x1; \ + vpxor x1, x4, x4; + +#define SI2_1(x0, x1, x2, x3, x4) \ + vpxor x1, x2, x2; \ + vpxor RNOT, x3, tp; \ + vpor x2, tp, tp; \ + vpxor x3, x2, x2; \ + vpxor x0, x3, x4; \ + vpxor x1, tp, x3; \ + vpor x2, x1, x1; \ + vpxor x0, x2, x2; +#define SI2_2(x0, x1, x2, x3, x4) \ + vpxor x4, x1, x1; \ + vpor x3, x4, x4; \ + vpxor x3, x2, x2; \ + vpxor x2, x4, x4; \ + vpand x1, x2, x2; \ + vpxor x3, x2, x2; \ + vpxor x4, x3, x3; \ + vpxor x0, x4, x4; + +#define SI3_1(x0, x1, x2, x3, x4) \ + vpxor x1, x2, x2; \ + vpand x2, x1, tp; \ + vpxor x0, tp, tp; \ + vpor x1, x0, x0; \ + vpxor x3, x1, x4; \ + vpxor x3, x0, x0; \ + vpor tp, x3, x3; \ + vpxor x2, tp, x1; +#define SI3_2(x0, x1, x2, x3, x4) \ + vpxor x3, x1, x1; \ + vpxor x2, x0, x0; \ + vpxor x3, x2, x2; \ + vpand x1, x3, x3; \ + vpxor x0, x1, x1; \ + vpand x2, x0, x0; \ + vpxor x3, x4, x4; \ + vpxor x0, x3, x3; \ + vpxor x1, x0, x0; + +#define SI4_1(x0, x1, x2, x3, x4) \ + vpxor x3, x2, x2; \ + vpand x1, x0, tp; \ + vpxor x2, tp, tp; \ + vpor x3, x2, x2; \ + vpxor RNOT, x0, x4; \ + vpxor tp, x1, x1; \ + vpxor x2, tp, x0; \ + vpand x4, x2, x2; +#define SI4_2(x0, x1, x2, x3, x4) \ + vpxor x0, x2, x2; \ + vpor x4, x0, x0; \ + vpxor x3, x0, x0; \ + vpand x2, x3, x3; \ + vpxor x3, x4, x4; \ + vpxor x1, x3, x3; \ + vpand x0, x1, x1; \ + vpxor x1, x4, x4; \ + vpxor x3, x0, x0; + +#define SI5_1(x0, x1, x2, x3, x4) \ + vpor x2, x1, tp; \ + vpxor x1, x2, x2; \ + vpxor x3, tp, tp; \ + vpand x1, x3, x3; \ + vpxor x3, x2, x2; \ + vpor x0, x3, x3; \ + vpxor RNOT, x0, x0; \ + vpxor x2, x3, x3; \ + vpor x0, x2, x2; +#define SI5_2(x0, x1, x2, x3, x4) \ + vpxor tp, x1, x4; \ + vpxor x4, x2, x2; \ + vpand x0, x4, x4; \ + vpxor tp, x0, x0; \ + vpxor x3, tp, x1; \ + vpand x2, x0, x0; \ + vpxor x3, x2, x2; \ + vpxor x2, x0, x0; \ + vpxor x4, x2, x2; \ + vpxor x3, x4, x4; + +#define SI6_1(x0, x1, x2, x3, x4) \ + vpxor x2, x0, x0; \ + vpand x3, x0, tp; \ + vpxor x3, x2, x2; \ + vpxor x2, tp, tp; \ + vpxor x1, x3, x3; \ + vpor x0, x2, x2; \ + vpxor x3, x2, x2; \ + vpand tp, x3, x3; +#define SI6_2(x0, x1, x2, x3, x4) \ + vpxor RNOT, tp, tp; \ + vpxor x1, x3, x3; \ + vpand x2, x1, x1; \ + vpxor tp, x0, x4; \ + vpxor x4, x3, x3; \ + vpxor x2, x4, x4; \ + vpxor x1, tp, x0; \ + vpxor x0, x2, x2; + +#define SI7_1(x0, x1, x2, x3, x4) \ + vpand x0, x3, tp; \ + vpxor x2, x0, x0; \ + vpor x3, x2, x2; \ + vpxor x1, x3, x4; \ + vpxor RNOT, x0, x0; \ + vpor tp, x1, x1; \ + vpxor x0, x4, x4; \ + vpand x2, x0, x0; \ + vpxor x1, x0, x0; +#define SI7_2(x0, x1, x2, x3, x4) \ + vpand x2, x1, x1; \ + vpxor x2, tp, x3; \ + vpxor x3, x4, x4; \ + vpand x3, x2, x2; \ + vpor x0, x3, x3; \ + vpxor x4, x1, x1; \ + vpxor x4, x3, x3; \ + vpand x0, x4, x4; \ + vpxor x2, x4, x4; + +#define get_key(i,j,t) \ + vpbroadcastd (4*(i)+(j))*4(CTX), t; + +#define K2(x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + get_key(i, 1, RK1); \ + get_key(i, 2, RK2); \ + get_key(i, 3, RK3); \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; + +#define LK2(x0, x1, x2, x3, x4, i) \ + vpslld $13, x0 ## 1, x4 ## 1; \ + vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x2 ## 1, x4 ## 1; \ + vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $13, x0 ## 2, x4 ## 2; \ + vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x2 ## 2, x4 ## 2; \ + vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $1, x1 ## 1, x4 ## 1; \ + vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \ + vpor x4 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x0 ## 1, x4 ## 1; \ + vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ + vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ + get_key(i, 1, RK1); \ + vpslld $1, x1 ## 2, x4 ## 2; \ + vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \ + vpor x4 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x0 ## 2, x4 ## 2; \ + vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ + vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ + get_key(i, 3, RK3); \ + vpslld $7, x3 ## 1, x4 ## 1; \ + vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \ + vpor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpslld $7, x1 ## 1, x4 ## 1; \ + vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ + get_key(i, 0, RK0); \ + vpslld $7, x3 ## 2, x4 ## 2; \ + vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \ + vpor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpslld $7, x1 ## 2, x4 ## 2; \ + vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ + get_key(i, 2, RK2); \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpslld $5, x0 ## 1, x4 ## 1; \ + vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpslld $22, x2 ## 1, x4 ## 1; \ + vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; \ + vpslld $5, x0 ## 2, x4 ## 2; \ + vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpslld $22, x2 ## 2, x4 ## 2; \ + vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; + +#define KL2(x0, x1, x2, x3, x4, i) \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpsrld $5, x0 ## 1, x4 ## 1; \ + vpslld $(32 - 5), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpsrld $22, x2 ## 1, x4 ## 1; \ + vpslld $(32 - 22), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; \ + vpsrld $5, x0 ## 2, x4 ## 2; \ + vpslld $(32 - 5), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpsrld $22, x2 ## 2, x4 ## 2; \ + vpslld $(32 - 22), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ + vpslld $7, x1 ## 1, x4 ## 1; \ + vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpsrld $1, x1 ## 1, x4 ## 1; \ + vpslld $(32 - 1), x1 ## 1, x1 ## 1; \ + vpor x4 ## 1, x1 ## 1, x1 ## 1; \ + vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ + vpslld $7, x1 ## 2, x4 ## 2; \ + vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpsrld $1, x1 ## 2, x4 ## 2; \ + vpslld $(32 - 1), x1 ## 2, x1 ## 2; \ + vpor x4 ## 2, x1 ## 2, x1 ## 2; \ + vpsrld $7, x3 ## 1, x4 ## 1; \ + vpslld $(32 - 7), x3 ## 1, x3 ## 1; \ + vpor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x0 ## 1, x4 ## 1; \ + vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpsrld $7, x3 ## 2, x4 ## 2; \ + vpslld $(32 - 7), x3 ## 2, x3 ## 2; \ + vpor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x0 ## 2, x4 ## 2; \ + vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpsrld $13, x0 ## 1, x4 ## 1; \ + vpslld $(32 - 13), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ + vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ + vpsrld $3, x2 ## 1, x4 ## 1; \ + vpslld $(32 - 3), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpsrld $13, x0 ## 2, x4 ## 2; \ + vpslld $(32 - 13), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ + vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ + vpsrld $3, x2 ## 2, x4 ## 2; \ + vpslld $(32 - 3), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; + +#define S(SBOX, x0, x1, x2, x3, x4) \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); + +#define SP(SBOX, x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 2, RK2); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 3, RK3); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + get_key(i, 1, RK1); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + vpunpckldq x1, x0, t0; \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x3; \ + \ + vpunpcklqdq t1, t0, x0; \ + vpunpckhqdq t1, t0, x1; \ + vpunpcklqdq x3, t2, x2; \ + vpunpckhqdq x3, t2, x3; + +#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +.align 8 +__serpent_enc_blk16: + /* input: + * %rdi: ctx, CTX + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext + * output: + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext + */ + + vpcmpeqd RNOT, RNOT, RNOT; + + read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 0); + S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); + S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2); + S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3); + S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4); + S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5); + S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6); + S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7); + S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8); + S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9); + S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10); + S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11); + S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12); + S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13); + S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14); + S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15); + S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16); + S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17); + S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18); + S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19); + S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20); + S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21); + S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22); + S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23); + S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24); + S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25); + S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26); + S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27); + S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28); + S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29); + S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30); + S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); + S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); + + write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; +ENDPROC(__serpent_enc_blk16) + +.align 8 +__serpent_dec_blk16: + /* input: + * %rdi: ctx, CTX + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext + * output: + * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext + */ + + vpcmpeqd RNOT, RNOT, RNOT; + + read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 32); + SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); + SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30); + SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29); + SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28); + SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27); + SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26); + SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25); + SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24); + SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23); + SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22); + SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21); + SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20); + SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19); + SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18); + SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17); + SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16); + SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15); + SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14); + SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13); + SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12); + SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11); + SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10); + SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9); + SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8); + SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7); + SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6); + SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5); + SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4); + SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3); + SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2); + SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); + S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); + + write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); + write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); + + ret; +ENDPROC(__serpent_dec_blk16) + +ENTRY(serpent_ecb_enc_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + vzeroupper; + + load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_enc_blk16; + + store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + vzeroupper; + + ret; +ENDPROC(serpent_ecb_enc_16way) + +ENTRY(serpent_ecb_dec_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + vzeroupper; + + load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_dec_blk16; + + store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + + vzeroupper; + + ret; +ENDPROC(serpent_ecb_dec_16way) + +ENTRY(serpent_cbc_dec_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + vzeroupper; + + load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_dec_blk16; + + store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2, + RK0); + + vzeroupper; + + ret; +ENDPROC(serpent_cbc_dec_16way) + +ENTRY(serpent_ctr_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (little endian, 128bit) + */ + + vzeroupper; + + load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, + RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, + tp); + + call __serpent_enc_blk16; + + store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + vzeroupper; + + ret; +ENDPROC(serpent_ctr_16way) + +ENTRY(serpent_xts_enc_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + vzeroupper; + + load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, + RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, + .Lxts_gf128mul_and_shl1_mask_0, + .Lxts_gf128mul_and_shl1_mask_1); + + call __serpent_enc_blk16; + + store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + vzeroupper; + + ret; +ENDPROC(serpent_xts_enc_16way) + +ENTRY(serpent_xts_dec_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + vzeroupper; + + load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, + RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, + .Lxts_gf128mul_and_shl1_mask_0, + .Lxts_gf128mul_and_shl1_mask_1); + + call __serpent_dec_blk16; + + store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + + vzeroupper; + + ret; +ENDPROC(serpent_xts_dec_16way) diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S index c00053d42f9..d348f1553a7 100644 --- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -24,6 +24,8 @@ * */ +#include <linux/linkage.h> + .file "serpent-sse2-i586-asm_32.S" .text @@ -510,11 +512,7 @@ pxor t0, x3; \ movdqu x3, (3*4*4)(out); -.align 8 -.global __serpent_enc_blk_4way -.type __serpent_enc_blk_4way,@function; - -__serpent_enc_blk_4way: +ENTRY(__serpent_enc_blk_4way) /* input: * arg_ctx(%esp): ctx, CTX * arg_dst(%esp): dst @@ -566,22 +564,19 @@ __serpent_enc_blk_4way: movl arg_dst(%esp), %eax; cmpb $0, arg_xor(%esp); - jnz __enc_xor4; + jnz .L__enc_xor4; write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); ret; -__enc_xor4: +.L__enc_xor4: xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); ret; +ENDPROC(__serpent_enc_blk_4way) -.align 8 -.global serpent_dec_blk_4way -.type serpent_dec_blk_4way,@function; - -serpent_dec_blk_4way: +ENTRY(serpent_dec_blk_4way) /* input: * arg_ctx(%esp): ctx, CTX * arg_dst(%esp): dst @@ -633,3 +628,4 @@ serpent_dec_blk_4way: write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); ret; +ENDPROC(serpent_dec_blk_4way) diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S index 3ee1ff04d3e..acc066c7c6b 100644 --- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -24,6 +24,8 @@ * */ +#include <linux/linkage.h> + .file "serpent-sse2-x86_64-asm_64.S" .text @@ -632,11 +634,7 @@ pxor t0, x3; \ movdqu x3, (3*4*4)(out); -.align 8 -.global __serpent_enc_blk_8way -.type __serpent_enc_blk_8way,@function; - -__serpent_enc_blk_8way: +ENTRY(__serpent_enc_blk_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -687,24 +685,21 @@ __serpent_enc_blk_8way: leaq (4*4*4)(%rsi), %rax; testb %cl, %cl; - jnz __enc_xor8; + jnz .L__enc_xor8; write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; -__enc_xor8: +.L__enc_xor8: xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; +ENDPROC(__serpent_enc_blk_8way) -.align 8 -.global serpent_dec_blk_8way -.type serpent_dec_blk_8way,@function; - -serpent_dec_blk_8way: +ENTRY(serpent_dec_blk_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -756,3 +751,4 @@ serpent_dec_blk_8way: write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; +ENDPROC(serpent_dec_blk_8way) diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c new file mode 100644 index 00000000000..2fae489b152 --- /dev/null +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -0,0 +1,562 @@ +/* + * Glue Code for x86_64/AVX2 assembler optimized version of Serpent + * + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/ablk_helper.h> +#include <crypto/algapi.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <crypto/serpent.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <asm/crypto/serpent-avx.h> +#include <asm/crypto/glue_helper.h> + +#define SERPENT_AVX2_PARALLEL_BLOCKS 16 + +/* 16-way AVX2 parallel cipher functions */ +asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); + +asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src, + le128 *iv); +asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + +static const struct common_glue_ctx serpent_enc = { + .num_funcs = 3, + .fpu_blocks_limit = 8, + + .funcs = { { + .num_blocks = 16, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) } + }, { + .num_blocks = 8, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + } } +}; + +static const struct common_glue_ctx serpent_ctr = { + .num_funcs = 3, + .fpu_blocks_limit = 8, + + .funcs = { { + .num_blocks = 16, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) } + }, { + .num_blocks = 8, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } + } } +}; + +static const struct common_glue_ctx serpent_enc_xts = { + .num_funcs = 3, + .fpu_blocks_limit = 8, + + .funcs = { { + .num_blocks = 16, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) } + }, { + .num_blocks = 8, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } + } } +}; + +static const struct common_glue_ctx serpent_dec = { + .num_funcs = 3, + .fpu_blocks_limit = 8, + + .funcs = { { + .num_blocks = 16, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) } + }, { + .num_blocks = 8, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + } } +}; + +static const struct common_glue_ctx serpent_dec_cbc = { + .num_funcs = 3, + .fpu_blocks_limit = 8, + + .funcs = { { + .num_blocks = 16, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) } + }, { + .num_blocks = 8, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + } } +}; + +static const struct common_glue_ctx serpent_dec_xts = { + .num_funcs = 3, + .fpu_blocks_limit = 8, + + .funcs = { { + .num_blocks = 16, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) } + }, { + .num_blocks = 8, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } + } } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, + dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, + nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); +} + +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + /* since reusing AVX functions, starts using FPU at 8 parallel blocks */ + return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); +} + +static inline void serpent_fpu_end(bool fpu_enabled) +{ + glue_fpu_end(fpu_enabled); +} + +struct crypt_priv { + struct serpent_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { + serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; + } + + while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { + serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); + srcdst += bsize * SERPENT_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_encrypt(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { + serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; + } + + while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { + serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); + srcdst += bsize * SERPENT_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_decrypt(ctx->ctx, srcdst, srcdst); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(__serpent_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(__serpent_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); +} + +static struct crypto_alg srp_algs[10] = { { + .cra_name = "__ecb-serpent-avx2", + .cra_driver_name = "__driver-ecb-serpent-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[0].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-serpent-avx2", + .cra_driver_name = "__driver-cbc-serpent-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-serpent-avx2", + .cra_driver_name = "__driver-ctr-serpent-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-serpent-avx2", + .cra_driver_name = "__driver-lrw-serpent-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[3].cra_list), + .cra_exit = lrw_serpent_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = lrw_serpent_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-serpent-avx2", + .cra_driver_name = "__driver-xts-serpent-avx2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[4].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = xts_serpent_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(serpent)", + .cra_driver_name = "ecb-serpent-avx2", + .cra_priority = 600, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[5].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(serpent)", + .cra_driver_name = "cbc-serpent-avx2", + .cra_priority = 600, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[6].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(serpent)", + .cra_driver_name = "ctr-serpent-avx2", + .cra_priority = 600, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[7].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(serpent)", + .cra_driver_name = "lrw-serpent-avx2", + .cra_priority = 600, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[8].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(serpent)", + .cra_driver_name = "xts-serpent-avx2", + .cra_priority = 600, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(srp_algs[9].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init init(void) +{ + u64 xcr0; + + if (!cpu_has_avx2 || !cpu_has_osxsave) { + pr_info("AVX2 instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs)); +} + +static void __exit fini(void) +{ + crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs)); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); +MODULE_ALIAS("serpent"); +MODULE_ALIAS("serpent-asm"); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 52abaaf28e7..ff487087097 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -4,8 +4,7 @@ * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> * - * Glue code based on serpent_sse2_glue.c by: - * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -29,6 +28,7 @@ #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> +#include <crypto/ablk_helper.h> #include <crypto/algapi.h> #include <crypto/serpent.h> #include <crypto/cryptd.h> @@ -39,10 +39,34 @@ #include <asm/xcr.h> #include <asm/xsave.h> #include <asm/crypto/serpent-avx.h> -#include <asm/crypto/ablk_helper.h> #include <asm/crypto/glue_helper.h> -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) +/* 8-way parallel cipher functions */ +asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx); + +asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx); + +asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); + +asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); + +asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); + +asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); + +void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) { be128 ctrblk; @@ -52,6 +76,22 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); u128_xor(dst, src, (u128 *)&ctrblk); } +EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); + +void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, + GLUE_FUNC_CAST(__serpent_encrypt)); +} +EXPORT_SYMBOL_GPL(serpent_xts_enc); + +void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, + GLUE_FUNC_CAST(__serpent_decrypt)); +} +EXPORT_SYMBOL_GPL(serpent_xts_dec); + static const struct common_glue_ctx serpent_enc = { .num_funcs = 2, @@ -75,7 +115,20 @@ static const struct common_glue_ctx serpent_ctr = { .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } }, { .num_blocks = 1, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } + } } +}; + +static const struct common_glue_ctx serpent_enc_xts = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } } } }; @@ -105,6 +158,19 @@ static const struct common_glue_ctx serpent_dec_cbc = { } } }; +static const struct common_glue_ctx serpent_dec_xts = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } + } } +}; + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -187,13 +253,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) __serpent_decrypt(ctx->ctx, srcdst, srcdst); } -struct serpent_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct serpent_ctx serpent_ctx; -}; - -static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); int err; @@ -206,6 +267,7 @@ static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, return lrw_init_table(&ctx->lrw_table, key + keylen - SERPENT_BLOCK_SIZE); } +EXPORT_SYMBOL_GPL(lrw_serpent_setkey); static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -259,20 +321,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } -static void lrw_exit_tfm(struct crypto_tfm *tfm) +void lrw_serpent_exit_tfm(struct crypto_tfm *tfm) { struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); lrw_free_table(&ctx->lrw_table); } +EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm); -struct serpent_xts_ctx { - struct serpent_ctx tweak_ctx; - struct serpent_ctx crypt_ctx; -}; - -static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; @@ -294,59 +352,26 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, /* second half of xts-key is for tweak */ return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); } +EXPORT_SYMBOL_GPL(xts_serpent_setkey); static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - return ret; + return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(__serpent_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(__serpent_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } static struct crypto_alg serpent_algs[10] = { { @@ -417,7 +442,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_exit = lrw_exit_tfm, + .cra_exit = lrw_serpent_exit_tfm, .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE + diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 97a356ece24..8c95f863730 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -34,6 +34,7 @@ #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> +#include <crypto/ablk_helper.h> #include <crypto/algapi.h> #include <crypto/serpent.h> #include <crypto/cryptd.h> @@ -42,7 +43,6 @@ #include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/crypto/serpent-sse2.h> -#include <asm/crypto/ablk_helper.h> #include <asm/crypto/glue_helper.h> static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S new file mode 100644 index 00000000000..1cd792db15e --- /dev/null +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -0,0 +1,708 @@ +/* + * Implement fast SHA-1 with AVX2 instructions. (x86_64) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Ilya Albrekht <ilya.albrekht@intel.com> + * Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. + * + *This implementation is based on the previous SSSE3 release: + *Visit http://software.intel.com/en-us/articles/ + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ + * + *Updates 20-byte SHA-1 record in 'hash' for even number of + *'num_blocks' consecutive 64-byte blocks + * + *extern "C" void sha1_transform_avx2( + * int *hash, const char* input, size_t num_blocks ); + */ + +#include <linux/linkage.h> + +#define CTX %rdi /* arg1 */ +#define BUF %rsi /* arg2 */ +#define CNT %rdx /* arg3 */ + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %eax +#define REG_E %edx +#define REG_TB %ebx +#define REG_TA %r12d +#define REG_RA %rcx +#define REG_RB %rsi +#define REG_RC %rdi +#define REG_RD %rax +#define REG_RE %rdx +#define REG_RTA %r12 +#define REG_RTB %rbx +#define REG_T1 %ebp +#define xmm_mov vmovups +#define avx2_zeroupper vzeroupper +#define RND_F1 1 +#define RND_F2 2 +#define RND_F3 3 + +.macro REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set TB, REG_TB + .set TA, REG_TA + + .set RA, REG_RA + .set RB, REG_RB + .set RC, REG_RC + .set RD, REG_RD + .set RE, REG_RE + + .set RTA, REG_RTA + .set RTB, REG_RTB + + .set T1, REG_T1 +.endm + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_PTR2 %r13 +#define BUFFER_END %r11 + +#define PRECALC_BUF %r14 +#define WK_BUF %r15 + +#define W_TMP %xmm0 +#define WY_TMP %ymm0 +#define WY_TMP2 %ymm9 + +# AVX2 variables +#define WY0 %ymm3 +#define WY4 %ymm5 +#define WY08 %ymm7 +#define WY12 %ymm8 +#define WY16 %ymm12 +#define WY20 %ymm13 +#define WY24 %ymm14 +#define WY28 %ymm15 + +#define YMM_SHUFB_BSWAP %ymm10 + +/* + * Keep 2 iterations precalculated at a time: + * - 80 DWORDs per iteration * 2 + */ +#define W_SIZE (80*2*2 +16) + +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) + + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +.macro PRECALC_RESET_WY + .set WY_00, WY0 + .set WY_04, WY4 + .set WY_08, WY08 + .set WY_12, WY12 + .set WY_16, WY16 + .set WY_20, WY20 + .set WY_24, WY24 + .set WY_28, WY28 + .set WY_32, WY_00 +.endm + +.macro PRECALC_ROTATE_WY + /* Rotate macros */ + .set WY_32, WY_28 + .set WY_28, WY_24 + .set WY_24, WY_20 + .set WY_20, WY_16 + .set WY_16, WY_12 + .set WY_12, WY_08 + .set WY_08, WY_04 + .set WY_04, WY_00 + .set WY_00, WY_32 + + /* Define register aliases */ + .set WY, WY_00 + .set WY_minus_04, WY_04 + .set WY_minus_08, WY_08 + .set WY_minus_12, WY_12 + .set WY_minus_16, WY_16 + .set WY_minus_20, WY_20 + .set WY_minus_24, WY_24 + .set WY_minus_28, WY_28 + .set WY_minus_32, WY +.endm + +.macro PRECALC_00_15 + .if (i == 0) # Initialize and rotate registers + PRECALC_RESET_WY + PRECALC_ROTATE_WY + .endif + + /* message scheduling pre-compute for rounds 0-15 */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + .elseif ((i & 7) == 1) + vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY + .elseif ((i & 7) == 4) + vpaddd K_XMM(K_BASE), WY, WY_TMP + .elseif ((i & 7) == 7) + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_16_31 + /* + * message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction + * + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency + */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + /* w[i-14] */ + vpalignr $8, WY_minus_16, WY_minus_12, WY + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ + .elseif ((i & 7) == 1) + vpxor WY_minus_08, WY, WY + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpxor WY_TMP, WY, WY + vpslldq $12, WY, WY_TMP2 + .elseif ((i & 7) == 3) + vpslld $1, WY, WY_TMP + vpsrld $31, WY, WY + .elseif ((i & 7) == 4) + vpor WY, WY_TMP, WY_TMP + vpslld $2, WY_TMP2, WY + .elseif ((i & 7) == 5) + vpsrld $30, WY_TMP2, WY_TMP2 + vpxor WY, WY_TMP, WY_TMP + .elseif ((i & 7) == 7) + vpxor WY_TMP2, WY_TMP, WY + vpaddd K_XMM(K_BASE), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_32_79 + /* + * in SHA-1 specification: + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization + * since w[i]=>w[i-3] dependency is broken + */ + + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP + .elseif ((i & 7) == 1) + /* W is W_minus_32 before xor */ + vpxor WY_minus_28, WY, WY + .elseif ((i & 7) == 2) + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 3) + vpxor WY_TMP, WY, WY + .elseif ((i & 7) == 4) + vpslld $2, WY, WY_TMP + .elseif ((i & 7) == 5) + vpsrld $30, WY, WY + vpor WY, WY_TMP, WY + .elseif ((i & 7) == 7) + vpaddd K_XMM(K_BASE), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC r, s + .set i, \r + + .if (i < 40) + .set K_XMM, 32*0 + .elseif (i < 80) + .set K_XMM, 32*1 + .elseif (i < 120) + .set K_XMM, 32*2 + .else + .set K_XMM, 32*3 + .endif + + .if (i<32) + PRECALC_00_15 \s + .elseif (i<64) + PRECALC_16_31 \s + .elseif (i < 160) + PRECALC_32_79 \s + .endif +.endm + +.macro ROTATE_STATE + .set T_REG, E + .set E, D + .set D, C + .set C, B + .set B, TB + .set TB, A + .set A, T_REG + + .set T_REG, RE + .set RE, RD + .set RD, RC + .set RC, RB + .set RB, RTB + .set RTB, RA + .set RA, T_REG +.endm + +/* Macro relies on saved ROUND_Fx */ + +.macro RND_FUN f, r + .if (\f == RND_F1) + ROUND_F1 \r + .elseif (\f == RND_F2) + ROUND_F2 \r + .elseif (\f == RND_F3) + ROUND_F3 \r + .endif +.endm + +.macro RR r + .set round_id, (\r % 80) + + .if (round_id == 0) /* Precalculate F for first round */ + .set ROUND_FUNC, RND_F1 + mov B, TB + + rorx $(32-30), B, B /* b>>>2 */ + andn D, TB, T1 + and C, TB + xor T1, TB + .endif + + RND_FUN ROUND_FUNC, \r + ROTATE_STATE + + .if (round_id == 18) + .set ROUND_FUNC, RND_F2 + .elseif (round_id == 38) + .set ROUND_FUNC, RND_F3 + .elseif (round_id == 58) + .set ROUND_FUNC, RND_F2 + .endif + + .set round_id, ( (\r+1) % 80) + + RND_FUN ROUND_FUNC, (\r+1) + ROTATE_STATE +.endm + +.macro ROUND_F1 r + add WK(\r), E + + andn C, A, T1 /* ~b&d */ + lea (RE,RTB), E /* Add F from the previous round */ + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30),A, TB /* b>>>2 for next round */ + + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + /* + * Calculate F for the next round + * (b & c) ^ andn[b, d] + */ + and B, A /* b&c */ + xor T1, A /* F1 = (b&c) ^ (~b&d) */ + + lea (RE,RTA), E /* E += A >>> 5 */ +.endm + +.macro ROUND_F2 r + add WK(\r), E + lea (RE,RTB), E /* Add F from the previous round */ + + /* Calculate F for the next round */ + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + .if ((round_id) < 79) + rorx $(32-30), A, TB /* b>>>2 for next round */ + .endif + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + .if ((round_id) < 79) + xor B, A + .endif + + add TA, E /* E += A >>> 5 */ + + .if ((round_id) < 79) + xor C, A + .endif +.endm + +.macro ROUND_F3 r + add WK(\r), E + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + lea (RE,RTB), E /* Add F from the previous round */ + + mov B, T1 + or A, T1 + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30), A, TB /* b>>>2 for next round */ + + /* Calculate F for the next round + * (b and c) or (d and (b or c)) + */ + and C, T1 + and B, A + or T1, A + + add TA, E /* E += A >>> 5 */ + +.endm + +/* + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining + */ +.macro SHA1_PIPELINED_MAIN_BODY + + REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + mov %rsp, PRECALC_BUF + lea (2*4*80+32)(%rsp), WK_BUF + + # Precalc WK for first 2 blocks + PRECALC_OFFSET = 0 + .set i, 0 + .rept 160 + PRECALC i + .set i, i + 1 + .endr + PRECALC_OFFSET = 128 + xchg WK_BUF, PRECALC_BUF + + .align 32 +_loop: + /* + * code loops through more than one block + * we use K_BASE value as a signal of a last block, + * it is set below by: cmovae BUFFER_PTR, K_BASE + */ + cmp K_BASE, BUFFER_PTR + jne _begin + .align 32 + jmp _end + .align 32 +_begin: + + /* + * Do first block + * rounds: 0,2,4,6,8 + */ + .set j, 0 + .rept 5 + RR j + .set j, j+2 + .endr + + jmp _loop0 +_loop0: + + /* + * rounds: + * 10,12,14,16,18 + * 20,22,24,26,28 + * 30,32,34,36,38 + * 40,42,44,46,48 + * 50,52,54,56,58 + */ + .rept 25 + RR j + .set j, j+2 + .endr + + add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ + cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + + /* + * rounds + * 60,62,64,66,68 + * 70,72,74,76,78 + */ + .rept 10 + RR j + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + cmp K_BASE, BUFFER_PTR /* is current block the last one? */ + je _loop + + mov TB, B + + /* Process second block */ + /* + * rounds + * 0+80, 2+80, 4+80, 6+80, 8+80 + * 10+80,12+80,14+80,16+80,18+80 + */ + + .set j, 0 + .rept 10 + RR j+80 + .set j, j+2 + .endr + + jmp _loop1 +_loop1: + /* + * rounds + * 20+80,22+80,24+80,26+80,28+80 + * 30+80,32+80,34+80,36+80,38+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + jmp _loop2 +_loop2: + + /* + * rounds + * 40+80,42+80,44+80,46+80,48+80 + * 50+80,52+80,54+80,56+80,58+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ + + cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + + jmp _loop3 +_loop3: + + /* + * rounds + * 60+80,62+80,64+80,66+80,68+80 + * 70+80,72+80,74+80,76+80,78+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + /* Reset state for AVX2 reg permutation */ + mov A, TA + mov TB, A + mov C, TB + mov E, C + mov D, B + mov TA, D + + REGALLOC + + xchg WK_BUF, PRECALC_BUF + + jmp _loop + + .align 32 + _end: + +.endm +/* + * macro implements SHA-1 function's body for several 64-byte blocks + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + ENTRY(\name) + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + RESERVE_STACK = (W_SIZE*4 + 8+24) + + /* Align stack */ + mov %rsp, %rbx + and $~(0x20-1), %rsp + push %rbx + sub $RESERVE_STACK, %rsp + + avx2_zeroupper + + lea K_XMM_AR(%rip), K_BASE + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + lea 64(BUF), BUFFER_PTR2 + + shl $6, CNT /* mul by 64 */ + add BUF, CNT + add $64, CNT + mov CNT, BUFFER_END + + cmp BUFFER_END, BUFFER_PTR2 + cmovae K_BASE, BUFFER_PTR2 + + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + avx2_zeroupper + + add $RESERVE_STACK, %rsp + pop %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + + ENDPROC(\name) +.endm + +.section .rodata + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.align 128 +K_XMM_AR: + .long K1, K1, K1, K1 + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f +.text + +SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 49d6987a73d..a4109506a5e 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -28,6 +28,8 @@ * (at your option) any later version. */ +#include <linux/linkage.h> + #define CTX %rdi // arg1 #define BUF %rsi // arg2 #define CNT %rdx // arg3 @@ -69,10 +71,8 @@ * param: function's name */ .macro SHA1_VECTOR_ASM name - .global \name - .type \name, @function - .align 32 -\name: + ENTRY(\name) + push %rbx push %rbp push %r12 @@ -106,7 +106,7 @@ pop %rbx ret - .size \name, .-\name + ENDPROC(\name) .endm /* diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 4a11a9d7245..74d16ef707c 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -10,6 +10,7 @@ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> * Copyright (c) Mathias Krause <minipli@googlemail.com> + * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -39,6 +40,12 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, asmlinkage void sha1_transform_avx(u32 *digest, const char *data, unsigned int rounds); #endif +#ifdef CONFIG_AS_AVX2 +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, + unsigned int rounds); +#endif static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); @@ -165,6 +172,18 @@ static int sha1_ssse3_import(struct shash_desc *desc, const void *in) return 0; } +#ifdef CONFIG_AS_AVX2 +static void sha1_apply_transform_avx2(u32 *digest, const char *data, + unsigned int rounds) +{ + /* Select the optimal transform based on data block size */ + if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(digest, data, rounds); + else + sha1_transform_avx(digest, data, rounds); +} +#endif + static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_ssse3_init, @@ -201,27 +220,49 @@ static bool __init avx_usable(void) return true; } + +#ifdef CONFIG_AS_AVX2 +static bool __init avx2_usable(void) +{ + if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} +#endif #endif static int __init sha1_ssse3_mod_init(void) { + char *algo_name; + /* test for SSSE3 first */ - if (cpu_has_ssse3) + if (cpu_has_ssse3) { sha1_transform_asm = sha1_transform_ssse3; + algo_name = "SSSE3"; + } #ifdef CONFIG_AS_AVX /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) + if (avx_usable()) { sha1_transform_asm = sha1_transform_avx; + algo_name = "AVX"; +#ifdef CONFIG_AS_AVX2 + /* allow AVX2 to override AVX, it's a little faster */ + if (avx2_usable()) { + sha1_transform_asm = sha1_apply_transform_avx2; + algo_name = "AVX2"; + } +#endif + } #endif if (sha1_transform_asm) { - pr_info("Using %s optimized SHA-1 implementation\n", - sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3" - : "AVX"); + pr_info("Using %s optimized SHA-1 implementation\n", algo_name); return crypto_register_shash(&alg); } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S new file mode 100644 index 00000000000..642f15687a0 --- /dev/null +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -0,0 +1,496 @@ +######################################################################## +# Implement fast SHA-256 with AVX1 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 block at a time, with 4 lanes per block +######################################################################## + +#ifdef CONFIG_AS_AVX +#include <linux/linkage.h> + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +.macro MY_ROR p1 p2 + shld $(32-(\p1)), \p2, \p2 +.endm + +################################ + +# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +# Load xmm with mem and byte swap each dword +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm + +################################ + +X0 = %xmm4 +X1 = %xmm5 +X2 = %xmm6 +X3 = %xmm7 + +XTMP0 = %xmm0 +XTMP1 = %xmm1 +XTMP2 = %xmm2 +XTMP3 = %xmm3 +XTMP4 = %xmm8 +XFER = %xmm9 +XTMP5 = %xmm11 + +SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %xmm13 + +NUM_BLKS = %rdx # 3rd arg +CTX = %rsi # 2nd arg +INP = %rdi # 1st arg + +SRND = %rdi # clobbers INP +c = %ecx +d = %r8d +e = %edx +TBL = %rbp +a = %eax +b = %ebx + +f = %r9d +g = %r10d +h = %r11d + +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_XFER_SIZE = 16 +_XMM_SAVE_SIZE = 0 + +_INP_END = 0 +_INP = _INP_END + _INP_END_SIZE +_XFER = _INP + _INP_SIZE +_XMM_SAVE = _XFER + _XFER_SIZE +STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + ## compute s0 four at a time and s1 two at a time + ## compute W[-16] + W[-7] 4 at a time + + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + MY_ROR (22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + ## compute s0 + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add _XFER(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpsrld $7, XTMP1, XTMP2 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpslld $(32-7), XTMP1, XTMP3 + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + mov a, y1 # y1 = a + MY_ROR (25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (22-13), y1 # y1 = a >> (22-13) + vpsrld $18, XTMP1, XTMP2 # + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpslld $(32-18), XTMP1, XTMP1 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + vpxor XTMP1, XTMP3, XTMP3 # + add y0, y2 # y2 = S1 + CH + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + ## compute low s1 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + mov a, y1 # y1 = a + MY_ROR (25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + MY_ROR (22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + xor g, y2 # y2 = f^g + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpxor XTMP3, XTMP2, XTMP2 # + add y0, y2 # y2 = S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + ## compute high s1 + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + MY_ROR (22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + vpxor XTMP3, XTMP2, XTMP2 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + rotate_Xs +.endm + +## input is [rsp + _XFER + %1 * 4] +.macro DO_ROUND round + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + xor e, y0 # y0 = e ^ (e >> (25-11)) + MY_ROR (22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + and e, y2 # y2 = (f^g)&e + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + add y0, y2 # y2 = S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + offset = \round * 4 + _XFER # + add offset(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +.endm + +######################################################################## +## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) +## arg 1 : pointer to input data +## arg 2 : pointer to digest +## arg 3 : Num blocks +######################################################################## +.text +ENTRY(sha256_transform_avx) +.align 32 + pushq %rbx + pushq %rbp + pushq %r13 + pushq %r14 + pushq %r15 + pushq %r12 + + mov %rsp, %r12 + subq $STACK_SIZE, %rsp # allocate stack space + and $~15, %rsp # align stack pointer + + shl $6, NUM_BLKS # convert to bytes + jz done_hash + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, _INP_END(%rsp) + + ## load initial digest + mov 4*0(CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 +loop0: + lea K256(%rip), TBL + + ## byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK + + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 16 each + mov $3, SRND +.align 16 +loop1: + vpaddd (TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 1*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 2*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 3*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + add $4*16, TBL + FOUR_ROUNDS_AND_SCHED + + sub $1, SRND + jne loop1 + + mov $2, SRND +loop2: + vpaddd (TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vpaddd 1*16(TBL), X1, XFER + vmovdqa XFER, _XFER(%rsp) + add $2*16, TBL + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vmovdqa X2, X0 + vmovdqa X3, X1 + + sub $1, SRND + jne loop2 + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + mov _INP(%rsp), INP + add $64, INP + cmp _INP_END(%rsp), INP + jne loop0 + +done_hash: + + mov %r12, %rsp + + popq %r12 + popq %r15 + popq %r14 + popq %r13 + popq %rbp + popq %rbx + ret +ENDPROC(sha256_transform_avx) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF +#endif diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S new file mode 100644 index 00000000000..9e86944c539 --- /dev/null +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -0,0 +1,772 @@ +######################################################################## +# Implement fast SHA-256 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 2 blocks at a time, with 4 lanes per block +######################################################################## + +#ifdef CONFIG_AS_AVX2 +#include <linux/linkage.h> + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +X0 = %ymm4 +X1 = %ymm5 +X2 = %ymm6 +X3 = %ymm7 + +# XMM versions of above +XWORD0 = %xmm4 +XWORD1 = %xmm5 +XWORD2 = %xmm6 +XWORD3 = %xmm7 + +XTMP0 = %ymm0 +XTMP1 = %ymm1 +XTMP2 = %ymm2 +XTMP3 = %ymm3 +XTMP4 = %ymm8 +XFER = %ymm9 +XTMP5 = %ymm11 + +SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %ymm13 + +X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK + +NUM_BLKS = %rdx # 3rd arg +CTX = %rsi # 2nd arg +INP = %rdi # 1st arg +c = %ecx +d = %r8d +e = %edx # clobbers NUM_BLKS +y3 = %edi # clobbers INP + + +TBL = %rbp +SRND = CTX # SRND is same register as CTX + +a = %eax +b = %ebx +f = %r9d +g = %r10d +h = %r11d +old_h = %r11d + +T1 = %r12d +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round +_XMM_SAVE_SIZE = 0 +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_CTX_SIZE = 8 +_RSP_SIZE = 8 + +_XFER = 0 +_XMM_SAVE = _XFER + _XFER_SIZE +_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE +_INP = _INP_END + _INP_END_SIZE +_CTX = _INP + _INP_SIZE +_RSP = _CTX + _CTX_SIZE +STACK_SIZE = _RSP + _RSP_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs + X_ = X0 + X0 = X1 + X1 = X2 + X2 = X3 + X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED disp +################################### RND N + 0 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + vpsrld $7, XTMP1, XTMP2 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + vpslld $(32-7), XTMP1, XTMP3 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 + + vpsrld $18, XTMP1, XTMP2 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 1 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 1*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + vpslld $(32-18), XTMP1, XTMP1 + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + + vpxor XTMP1, XTMP3, XTMP3 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + + + ROTATE_ARGS + +################################### RND N + 2 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + offset = \disp + 2*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + rorx $11, e, y1 # y1 = e >> 11 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + and e, y2 # y2 = (f^g)&e # CH + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + vpxor XTMP3, XTMP2, XTMP2 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a ,T1 # T1 = (a >> 2) # S0 + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1,h # h = k + w + h + S0 # -- + add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3,h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 3 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 3*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP3, XTMP2, XTMP2 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $2, a, T1 # T1 = (a >> 2) # S0 + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + rotate_Xs +.endm + +.macro DO_4ROUNDS disp +################################### RND N + 0 ########################### + + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 1 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*1 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 2 ############################## + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*2 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 3 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*3 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + +.endm + +######################################################################## +## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) +## arg 1 : pointer to input data +## arg 2 : pointer to digest +## arg 3 : Num blocks +######################################################################## +.text +ENTRY(sha256_transform_rorx) +.align 32 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + mov %rsp, %rax + subq $STACK_SIZE, %rsp + and $-32, %rsp # align rsp to 32 byte boundary + mov %rax, _RSP(%rsp) + + + shl $6, NUM_BLKS # convert to bytes + jz done_hash + lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block + mov NUM_BLKS, _INP_END(%rsp) + + cmp NUM_BLKS, INP + je only_one_block + + ## load initial digest + mov (CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + +loop0: + lea K256(%rip), TBL + + ## Load first 16 dwords from two blocks + VMOVDQ 0*32(INP),XTMP0 + VMOVDQ 1*32(INP),XTMP1 + VMOVDQ 2*32(INP),XTMP2 + VMOVDQ 3*32(INP),XTMP3 + + ## byte swap data + vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 + vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 + vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 + vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 + + ## transpose data into high/low halves + vperm2i128 $0x20, XTMP2, XTMP0, X0 + vperm2i128 $0x31, XTMP2, XTMP0, X1 + vperm2i128 $0x20, XTMP3, XTMP1, X2 + vperm2i128 $0x31, XTMP3, XTMP1, X3 + +last_block_enter: + add $64, INP + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 12 each + xor SRND, SRND + +.align 16 +loop1: + vpaddd 0*32(TBL, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 0*32 + + vpaddd 1*32(TBL, SRND), X0, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 1*32 + + vpaddd 2*32(TBL, SRND), X0, XFER + vmovdqa XFER, 2*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 2*32 + + vpaddd 3*32(TBL, SRND), X0, XFER + vmovdqa XFER, 3*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED _XFER + 3*32 + + add $4*32, SRND + cmp $3*4*32, SRND + jb loop1 + +loop2: + ## Do last 16 rounds with no scheduling + vpaddd 0*32(TBL, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + DO_4ROUNDS _XFER + 0*32 + vpaddd 1*32(TBL, SRND), X1, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + DO_4ROUNDS _XFER + 1*32 + add $2*32, SRND + + vmovdqa X2, X0 + vmovdqa X3, X1 + + cmp $4*4*32, SRND + jb loop2 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + ja done_hash + + #### Do second block using previously scheduled results + xor SRND, SRND +.align 16 +loop3: + DO_4ROUNDS _XFER + 0*32 + 16 + DO_4ROUNDS _XFER + 1*32 + 16 + add $2*32, SRND + cmp $4*4*32, SRND + jb loop3 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + add $64, INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + jb loop0 + ja done_hash + +do_last_block: + #### do last block + lea K256(%rip), TBL + + VMOVDQ 0*16(INP),XWORD0 + VMOVDQ 1*16(INP),XWORD1 + VMOVDQ 2*16(INP),XWORD2 + VMOVDQ 3*16(INP),XWORD3 + + vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 + vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 + vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 + vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 + + jmp last_block_enter + +only_one_block: + + ## load initial digest + mov (4*0)(CTX),a + mov (4*1)(CTX),b + mov (4*2)(CTX),c + mov (4*3)(CTX),d + mov (4*4)(CTX),e + mov (4*5)(CTX),f + mov (4*6)(CTX),g + mov (4*7)(CTX),h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + jmp do_last_block + +done_hash: + + mov _RSP(%rsp), %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret +ENDPROC(sha256_transform_rorx) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 + +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 + +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF +#endif diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S new file mode 100644 index 00000000000..f833b74d902 --- /dev/null +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -0,0 +1,506 @@ +######################################################################## +# Implement fast SHA-256 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +## assume buffers not aligned +#define MOVDQ movdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +# Load xmm with mem and byte swap each dword +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + MOVDQ \p2, \p1 + pshufb \p3, \p1 +.endm + +################################ + +X0 = %xmm4 +X1 = %xmm5 +X2 = %xmm6 +X3 = %xmm7 + +XTMP0 = %xmm0 +XTMP1 = %xmm1 +XTMP2 = %xmm2 +XTMP3 = %xmm3 +XTMP4 = %xmm8 +XFER = %xmm9 + +SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %xmm12 + +NUM_BLKS = %rdx # 3rd arg +CTX = %rsi # 2nd arg +INP = %rdi # 1st arg + +SRND = %rdi # clobbers INP +c = %ecx +d = %r8d +e = %edx +TBL = %rbp +a = %eax +b = %ebx + +f = %r9d +g = %r10d +h = %r11d + +y0 = %r13d +y1 = %r14d +y2 = %r15d + + + +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_XFER_SIZE = 16 +_XMM_SAVE_SIZE = 0 + +_INP_END = 0 +_INP = _INP_END + _INP_END_SIZE +_XFER = _INP + _INP_SIZE +_XMM_SAVE = _XFER + _XFER_SIZE +STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + ## compute s0 four at a time and s1 two at a time + ## compute W[-16] + W[-7] 4 at a time + movdqa X3, XTMP0 + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + palignr $4, X2, XTMP0 # XTMP0 = W[-7] + ror $(22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa X1, XTMP1 + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + ## compute s0 + palignr $4, X0, XTMP1 # XTMP1 = W[-15] + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + movdqa XTMP1, XTMP2 # XTMP2 = W[-15] + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH + movdqa XTMP1, XTMP3 # XTMP3 = W[-15] + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pslld $(32-7), XTMP1 # + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + psrld $7, XTMP2 # + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + # + ROTATE_ARGS # + movdqa XTMP3, XTMP2 # XTMP2 = W[-15] + mov e, y0 # y0 = e + mov a, y1 # y1 = a + movdqa XTMP3, XTMP4 # XTMP4 = W[-15] + ror $(25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(22-13), y1 # y1 = a >> (22-13) + pslld $(32-18), XTMP3 # + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + psrld $18, XTMP2 # + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP3, XTMP1 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 + add y0, y2 # y2 = S1 + CH + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pxor XTMP4, XTMP1 # XTMP1 = s0 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + ## compute low s1 + pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} + mov e, y0 # y0 = e + mov a, y1 # y1 = a + ror $(25-11), y0 # y0 = e >> (25-11) + movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} + xor e, y0 # y0 = e ^ (e >> (25-11)) + ror $(22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + xor g, y2 # y2 = f^g + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP3, XTMP2 + add y0, y2 # y2 = S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + ## compute high s1 + pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + # + ROTATE_ARGS # + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + movdqa XTMP2, X0 # X0 = W[-2] {DDCC} + ror $(22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 + and e, y2 # y2 = (f^g)&e + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 + xor g, y2 # y2 = CH = ((f^g)&e)^g + pxor XTMP3, XTMP2 # + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 + add y0, y2 # y2 = S1 + CH + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + pxor XTMP2, X0 # X0 = s1 {xDxC} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pshufb SHUF_DC00, X0 # X0 = s1 {DC00} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + rotate_Xs +.endm + +## input is [rsp + _XFER + %1 * 4] +.macro DO_ROUND round + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + xor e, y0 # y0 = e ^ (e >> (25-11)) + ror $(22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + and e, y2 # y2 = (f^g)&e + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + add y0, y2 # y2 = S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + offset = \round * 4 + _XFER + add offset(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +.endm + +######################################################################## +## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) +## arg 1 : pointer to input data +## arg 2 : pointer to digest +## arg 3 : Num blocks +######################################################################## +.text +ENTRY(sha256_transform_ssse3) +.align 32 + pushq %rbx + pushq %rbp + pushq %r13 + pushq %r14 + pushq %r15 + pushq %r12 + + mov %rsp, %r12 + subq $STACK_SIZE, %rsp + and $~15, %rsp + + shl $6, NUM_BLKS # convert to bytes + jz done_hash + add INP, NUM_BLKS + mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data + + ## load initial digest + mov 4*0(CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + movdqa _SHUF_00BA(%rip), SHUF_00BA + movdqa _SHUF_DC00(%rip), SHUF_DC00 + +loop0: + lea K256(%rip), TBL + + ## byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK + + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 16 each + mov $3, SRND +.align 16 +loop1: + movdqa (TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 1*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 2*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 3*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + add $4*16, TBL + FOUR_ROUNDS_AND_SCHED + + sub $1, SRND + jne loop1 + + mov $2, SRND +loop2: + paddd (TBL), X0 + movdqa X0, _XFER(%rsp) + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd 1*16(TBL), X1 + movdqa X1, _XFER(%rsp) + add $2*16, TBL + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X2, X0 + movdqa X3, X1 + + sub $1, SRND + jne loop2 + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + mov _INP(%rsp), INP + add $64, INP + cmp _INP_END(%rsp), INP + jne loop0 + +done_hash: + + mov %r12, %rsp + + popq %r12 + popq %r15 + popq %r14 + popq %r13 + popq %rbp + popq %rbx + + ret +ENDPROC(sha256_transform_ssse3) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c new file mode 100644 index 00000000000..f248546da1c --- /dev/null +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -0,0 +1,322 @@ +/* + * Cryptographic API. + * + * Glue code for the SHA256 Secure Hash Algorithm assembler + * implementation using supplemental SSE3 / AVX / AVX2 instructions. + * + * This file is based on sha256_generic.c + * + * Copyright (C) 2013 Intel Corporation. + * + * Author: + * Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/sha.h> +#include <asm/byteorder.h> +#include <asm/i387.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <linux/string.h> + +asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, + u64 rounds); +#ifdef CONFIG_AS_AVX +asmlinkage void sha256_transform_avx(const char *data, u32 *digest, + u64 rounds); +#endif +#ifdef CONFIG_AS_AVX2 +asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, + u64 rounds); +#endif + +static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); + + +static int sha256_ssse3_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len, unsigned int partial) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int done = 0; + + sctx->count += len; + + if (partial) { + done = SHA256_BLOCK_SIZE - partial; + memcpy(sctx->buf + partial, data, done); + sha256_transform_asm(sctx->buf, sctx->state, 1); + } + + if (len - done >= SHA256_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; + + sha256_transform_asm(data + done, sctx->state, (u64) rounds); + + done += rounds * SHA256_BLOCK_SIZE; + } + + memcpy(sctx->buf, data + done, len - done); + + return 0; +} + +static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; + int res; + + /* Handle the fast case right here */ + if (partial + len < SHA256_BLOCK_SIZE) { + sctx->count += len; + memcpy(sctx->buf + partial, data, len); + + return 0; + } + + if (!irq_fpu_usable()) { + res = crypto_sha256_update(desc, data, len); + } else { + kernel_fpu_begin(); + res = __sha256_ssse3_update(desc, data, len, partial); + kernel_fpu_end(); + } + + return res; +} + + +/* Add padding and return the message digest. */ +static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be32 *dst = (__be32 *)out; + __be64 bits; + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; + + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->count % SHA256_BLOCK_SIZE; + padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index); + + if (!irq_fpu_usable()) { + crypto_sha256_update(desc, padding, padlen); + crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); + } else { + kernel_fpu_begin(); + /* We need to fill a whole block for __sha256_ssse3_update() */ + if (padlen <= 56) { + sctx->count += padlen; + memcpy(sctx->buf + index, padding, padlen); + } else { + __sha256_ssse3_update(desc, padding, padlen, index); + } + __sha256_ssse3_update(desc, (const u8 *)&bits, + sizeof(bits), 56); + kernel_fpu_end(); + } + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha256_ssse3_export(struct shash_desc *desc, void *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha256_ssse3_import(struct shash_desc *desc, const void *in) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static int sha224_ssse3_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; + + return 0; +} + +static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA256_DIGEST_SIZE]; + + sha256_ssse3_final(desc, D); + + memcpy(hash, D, SHA224_DIGEST_SIZE); + memset(D, 0, SHA256_DIGEST_SIZE); + + return 0; +} + +static struct shash_alg algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_ssse3_init, + .update = sha256_ssse3_update, + .final = sha256_ssse3_final, + .export = sha256_ssse3_export, + .import = sha256_ssse3_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_ssse3_init, + .update = sha256_ssse3_update, + .final = sha224_ssse3_final, + .export = sha256_ssse3_export, + .import = sha256_ssse3_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +#ifdef CONFIG_AS_AVX +static bool __init avx_usable(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) + return false; + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + + return false; + } + + return true; +} +#endif + +static int __init sha256_ssse3_mod_init(void) +{ + /* test for SSSE3 first */ + if (cpu_has_ssse3) + sha256_transform_asm = sha256_transform_ssse3; + +#ifdef CONFIG_AS_AVX + /* allow AVX to override SSSE3, it's a little faster */ + if (avx_usable()) { +#ifdef CONFIG_AS_AVX2 + if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) + sha256_transform_asm = sha256_transform_rorx; + else +#endif + sha256_transform_asm = sha256_transform_avx; + } +#endif + + if (sha256_transform_asm) { +#ifdef CONFIG_AS_AVX + if (sha256_transform_asm == sha256_transform_avx) + pr_info("Using AVX optimized SHA-256 implementation\n"); +#ifdef CONFIG_AS_AVX2 + else if (sha256_transform_asm == sha256_transform_rorx) + pr_info("Using AVX2 optimized SHA-256 implementation\n"); +#endif + else +#endif + pr_info("Using SSSE3 optimized SHA-256 implementation\n"); + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + } + pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + + return -ENODEV; +} + +static void __exit sha256_ssse3_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(sha256_ssse3_mod_init); +module_exit(sha256_ssse3_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); + +MODULE_ALIAS("sha256"); +MODULE_ALIAS("sha224"); diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S new file mode 100644 index 00000000000..974dde9bc6c --- /dev/null +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -0,0 +1,423 @@ +######################################################################## +# Implement fast SHA-512 with AVX instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#ifdef CONFIG_AS_AVX +#include <linux/linkage.h> + +.text + +# Virtual Registers +# ARG1 +msg = %rdi +# ARG2 +digest = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +# Message Schedule +W_SIZE = 80*8 +# W[t] + K[t] | W[t+1] + K[t+1] +WK_SIZE = 2*8 +RSPSAVE_SIZE = 1*8 +GPRSAVE_SIZE = 5*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_RSPSAVE = frame_WK + WK_SIZE +frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE +frame_size = frame_GPRSAVE + GPRSAVE_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro RORQ p1 p2 + # shld is faster than ror on Sandybridge + shld $(64-\p2), \p1, \p1 +.endm + +.macro SHA512_Round rnd + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + RORQ tmp0, 23 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + RORQ tmp0, 5 # 39 # tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_avx rnd + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + + idx = \rnd - 2 + vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] + idx = \rnd - 15 + vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov f_64, T1 + vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 + mov e_64, tmp0 + vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 + xor g_64, T1 + RORQ tmp0, 23 # 41 + vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 + and e_64, T1 + xor e_64, tmp0 + vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1# + vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 + RORQ tmp0, 4 # 18 + vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 + xor e_64, tmp0 + mov a_64, T2 + add h_64, T1 + vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 + RORQ tmp0, 14 # 14 + add tmp0, T1 + vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 + mov a_64, tmp0 + xor c_64, T2 + vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 + and c_64, tmp0 + and b_64, T2 + vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 + xor tmp0, T2 + mov a_64, tmp0 + vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 + RORQ tmp0, 5 # 39 + vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ + # W[t-15]>>7 ^ W[t-15]<<63 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25 + add tmp0, h_64 + RotateState + vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ + # W[t-2]<<25 + mov f_64, T1 + vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + mov e_64, tmp0 + xor g_64, T1 + idx = \rnd - 16 + vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + idx = \rnd - 7 + vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + RORQ tmp0, 23 # 41 + and e_64, T1 + xor e_64, tmp0 + xor g_64, T1 + vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 + idx = \rnd + 1 + add WK_2(idx), T1 + vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) + RORQ tmp0, 4 # 18 + vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) + xor e_64, tmp0 + vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + + # s0(W[t-15]) + W[t-16] + mov a_64, T2 + add h_64, T1 + RORQ tmp0, 14 # 14 + add tmp0, T1 + idx = \rnd + vmovdqa %xmm0, W_t(idx) # Store W[t] + vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + mov a_64, tmp0 + xor c_64, T2 + and c_64, tmp0 + and b_64, T2 + xor tmp0, T2 + mov a_64, tmp0 + RORQ tmp0, 5 # 39 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + add tmp0, h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_avx(const void* M, void* D, u64 L) +# Purpose: Updates the SHA512 digest stored at D with the message stored in M. +# The size of the message pointed to by M must be an integer multiple of SHA512 +# message blocks. +# L is the message length in SHA512 blocks +######################################################################## +ENTRY(sha512_transform_avx) + cmp $0, msglen + je nowork + + # Allocate Stack Space + mov %rsp, %rax + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + mov %rax, frame_RSPSAVE(%rsp) + + # Save GPRs + mov %rbx, frame_GPRSAVE(%rsp) + mov %r12, frame_GPRSAVE +8*1(%rsp) + mov %r13, frame_GPRSAVE +8*2(%rsp) + mov %r14, frame_GPRSAVE +8*3(%rsp) + mov %r15, frame_GPRSAVE +8*4(%rsp) + +updateblock: + + # Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_avx t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz updateblock + + # Restore GPRs + mov frame_GPRSAVE(%rsp), %rbx + mov frame_GPRSAVE +8*1(%rsp), %r12 + mov frame_GPRSAVE +8*2(%rsp), %r13 + mov frame_GPRSAVE +8*3(%rsp), %r14 + mov frame_GPRSAVE +8*4(%rsp), %r15 + + # Restore Stack Pointer + mov frame_RSPSAVE(%rsp), %rsp + +nowork: + ret +ENDPROC(sha512_transform_avx) + +######################################################################## +### Binary Data + +.data + +.align 16 + +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +#endif diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S new file mode 100644 index 00000000000..568b96105f5 --- /dev/null +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -0,0 +1,743 @@ +######################################################################## +# Implement fast SHA-512 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 blocks at a time, with 4 lanes per block +######################################################################## + +#ifdef CONFIG_AS_AVX2 +#include <linux/linkage.h> + +.text + +# Virtual Registers +Y_0 = %ymm4 +Y_1 = %ymm5 +Y_2 = %ymm6 +Y_3 = %ymm7 + +YTMP0 = %ymm0 +YTMP1 = %ymm1 +YTMP2 = %ymm2 +YTMP3 = %ymm3 +YTMP4 = %ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = %ymm9 + +# 1st arg +INP = %rdi +# 2nd arg +CTX = %rsi +# 3rd arg +NUM_BLKS = %rdx + +c = %rcx +d = %r8 +e = %rdx +y3 = %rdi + +TBL = %rbp + +a = %rax +b = %rbx + +f = %r9 +g = %r10 +h = %r11 +old_h = %r11 + +T1 = %r12 +y0 = %r13 +y1 = %r14 +y2 = %r15 + +y4 = %r12 + +# Local variables (stack frame) +XFER_SIZE = 4*8 +SRND_SIZE = 1*8 +INP_SIZE = 1*8 +INPEND_SIZE = 1*8 +RSPSAVE_SIZE = 1*8 +GPRSAVE_SIZE = 6*8 + +frame_XFER = 0 +frame_SRND = frame_XFER + XFER_SIZE +frame_INP = frame_SRND + SRND_SIZE +frame_INPEND = frame_INP + INP_SIZE +frame_RSPSAVE = frame_INPEND + INPEND_SIZE +frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE +frame_size = frame_GPRSAVE + GPRSAVE_SIZE + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask +# Load ymm with mem and byte swap each dword +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm +# rotate_Ys +# Rotate values of symbols Y0...Y3 +.macro rotate_Ys + Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = Y_ +.endm + +# RotateState +.macro RotateState + # Rotate symbols a..h right + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL +# YDST = {YSRC1, YSRC2} >> RVAL*8 +.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL + vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} + vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 +.endm + +.macro FOUR_ROUNDS_AND_SCHED +################################### RND N + 0 ######################################### + + # Extract w[t-7] + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] + # Calculate w[t-16] + w[t-7] + vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] + # Extract w[t-15] + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] + + # Calculate sigma0 + + # Calculate w[t-15] ror 1 + vpsrlq $1, YTMP1, YTMP2 + vpsllq $(64-1), YTMP1, YTMP3 + vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 + # Calculate w[t-15] shr 7 + vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add frame_XFER(%rsp),h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + rorx $14, e, y1 # y1 = (e >> 14) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 1 ######################################### + + # Calculate w[t-15] ror 8 + vpsrlq $8, YTMP1, YTMP2 + vpsllq $(64-8), YTMP1, YTMP1 + vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 + # XOR the three components + vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 + vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 + + + # Add three components, w[t-16], w[t-7] and sigma0 + vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 + # Move to appropriate lanes for calculating w[16] and w[17] + vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} + # Move to appropriate lanes for calculating w[18] and w[19] + vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} + + # Calculate w[16] and w[17] in both 128 bit lanes + + # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes + vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} + vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} + + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + +################################### RND N + 2 ######################################### + + vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} + vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} + vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} + vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} + + # Add sigma1 to the other compunents to get w[16] and w[17] + vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} + + # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane + vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- + + rorx $18, e, y1 # y1 = e >> 18 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + and e, y2 # y2 = (f^g)&e # CH + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 3 ######################################### + + vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} + vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} + vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} + vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} + + # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] + # to newly calculated sigma1 to get w[18] and w[19] + vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} + + # Form w[19, w[18], w17], w[16] + vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + rorx $39, a, y1 # y1 = a >> 39 # S0A + add y0, y2 # y2 = S1 + CH # -- + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + rotate_Ys +.endm + +.macro DO_4ROUNDS + +################################### RND N + 0 ######################################### + + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 1 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 2 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 3 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +.endm + +######################################################################## +# void sha512_transform_rorx(const void* M, void* D, uint64_t L)# +# Purpose: Updates the SHA512 digest stored at D with the message stored in M. +# The size of the message pointed to by M must be an integer multiple of SHA512 +# message blocks. +# L is the message length in SHA512 blocks +######################################################################## +ENTRY(sha512_transform_rorx) + # Allocate Stack Space + mov %rsp, %rax + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + mov %rax, frame_RSPSAVE(%rsp) + + # Save GPRs + mov %rbp, frame_GPRSAVE(%rsp) + mov %rbx, 8*1+frame_GPRSAVE(%rsp) + mov %r12, 8*2+frame_GPRSAVE(%rsp) + mov %r13, 8*3+frame_GPRSAVE(%rsp) + mov %r14, 8*4+frame_GPRSAVE(%rsp) + mov %r15, 8*5+frame_GPRSAVE(%rsp) + + shl $7, NUM_BLKS # convert to bytes + jz done_hash + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, frame_INPEND(%rsp) + + ## load initial digest + mov 8*0(CTX),a + mov 8*1(CTX),b + mov 8*2(CTX),c + mov 8*3(CTX),d + mov 8*4(CTX),e + mov 8*5(CTX),f + mov 8*6(CTX),g + mov 8*7(CTX),h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + +loop0: + lea K512(%rip), TBL + + ## byte swap first 16 dwords + COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK + + mov INP, frame_INP(%rsp) + + ## schedule 64 input dwords, by doing 12 rounds of 4 each + movq $4, frame_SRND(%rsp) + +.align 16 +loop1: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 1*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 2*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 3*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(4*32), TBL + FOUR_ROUNDS_AND_SCHED + + subq $1, frame_SRND(%rsp) + jne loop1 + + movq $2, frame_SRND(%rsp) +loop2: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + DO_4ROUNDS + vpaddq 1*32(TBL), Y_1, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(2*32), TBL + DO_4ROUNDS + + vmovdqa Y_2, Y_0 + vmovdqa Y_3, Y_1 + + subq $1, frame_SRND(%rsp) + jne loop2 + + addm 8*0(CTX),a + addm 8*1(CTX),b + addm 8*2(CTX),c + addm 8*3(CTX),d + addm 8*4(CTX),e + addm 8*5(CTX),f + addm 8*6(CTX),g + addm 8*7(CTX),h + + mov frame_INP(%rsp), INP + add $128, INP + cmp frame_INPEND(%rsp), INP + jne loop0 + +done_hash: + +# Restore GPRs + mov frame_GPRSAVE(%rsp) ,%rbp + mov 8*1+frame_GPRSAVE(%rsp) ,%rbx + mov 8*2+frame_GPRSAVE(%rsp) ,%r12 + mov 8*3+frame_GPRSAVE(%rsp) ,%r13 + mov 8*4+frame_GPRSAVE(%rsp) ,%r14 + mov 8*5+frame_GPRSAVE(%rsp) ,%r15 + + # Restore Stack Pointer + mov frame_RSPSAVE(%rsp), %rsp + ret +ENDPROC(sha512_transform_rorx) + +######################################################################## +### Binary Data + +.data + +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.align 32 + +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +MASK_YMM_LO: + .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF +#endif diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S new file mode 100644 index 00000000000..fb56855d51f --- /dev/null +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -0,0 +1,421 @@ +######################################################################## +# Implement fast SHA-512 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +# ARG1 +msg = %rdi +# ARG2 +digest = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +W_SIZE = 80*8 +WK_SIZE = 2*8 +RSPSAVE_SIZE = 1*8 +GPRSAVE_SIZE = 5*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_RSPSAVE = frame_WK + WK_SIZE +frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE +frame_size = frame_GPRSAVE + GPRSAVE_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro SHA512_Round rnd + + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + ror $23, tmp0 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + ror $5, tmp0 # 39 # tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_sse rnd + + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + # For clarity, integer instructions (for the rounds calculation) are indented + # by one tab. Vectored instructions (for the message scheduler) are indented + # by two tabs. + + mov f_64, T1 + idx = \rnd -2 + movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] + xor g_64, T1 + and e_64, T1 + movdqa %xmm2, %xmm0 # XMM0 = W[t-2] + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1 + idx = \rnd - 15 + movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov e_64, tmp0 + ror $23, tmp0 # 41 + movdqa %xmm5, %xmm3 # XMM3 = W[t-15] + xor e_64, tmp0 + ror $4, tmp0 # 18 + psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 + xor e_64, tmp0 + ror $14, tmp0 # 14 + psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 + add tmp0, T1 + add h_64, T1 + pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] + mov a_64, T2 + xor c_64, T2 + pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] + and b_64, T2 + mov a_64, tmp0 + psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 + and c_64, tmp0 + xor tmp0, T2 + psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 + mov a_64, tmp0 + ror $5, tmp0 # 39 + pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] + xor a_64, tmp0 + ror $6, tmp0 # 34 + pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] + xor a_64, tmp0 + ror $28, tmp0 # 28 + psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 + add tmp0, T2 + add T1, d_64 + psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 + lea (T1, T2), h_64 + RotateState + movdqa %xmm2, %xmm1 # XMM1 = W[t-2] + mov f_64, T1 + xor g_64, T1 + movdqa %xmm5, %xmm4 # XMM4 = W[t-15] + and e_64, T1 + xor g_64, T1 + psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 + idx = \rnd + 1 + add WK_2(idx), T1 + mov e_64, tmp0 + psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 + ror $23, tmp0 # 41 + xor e_64, tmp0 + pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] + ror $4, tmp0 # 18 + xor e_64, tmp0 + pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] + ror $14, tmp0 # 14 + add tmp0, T1 + psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 + add h_64, T1 + mov a_64, T2 + psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 + xor c_64, T2 + and b_64, T2 + pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + mov a_64, tmp0 + and c_64, tmp0 + idx = \rnd - 7 + movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + xor tmp0, T2 + pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) + mov a_64, tmp0 + paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + ror $5, tmp0 # 39 + idx =\rnd-16 + paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] + xor a_64, tmp0 + paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + ror $6, tmp0 # 34 + movdqa %xmm0, W_t(\rnd) # Store scheduled qwords + xor a_64, tmp0 + paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] + ror $28, tmp0 # 28 + idx = \rnd + movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + add tmp0, T2 + add T1, d_64 + lea (T1, T2), h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_ssse3(const void* M, void* D, u64 L)# +# Purpose: Updates the SHA512 digest stored at D with the message stored in M. +# The size of the message pointed to by M must be an integer multiple of SHA512 +# message blocks. +# L is the message length in SHA512 blocks. +######################################################################## +ENTRY(sha512_transform_ssse3) + + cmp $0, msglen + je nowork + + # Allocate Stack Space + mov %rsp, %rax + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + mov %rax, frame_RSPSAVE(%rsp) + + # Save GPRs + mov %rbx, frame_GPRSAVE(%rsp) + mov %r12, frame_GPRSAVE +8*1(%rsp) + mov %r13, frame_GPRSAVE +8*2(%rsp) + mov %r14, frame_GPRSAVE +8*3(%rsp) + mov %r15, frame_GPRSAVE +8*4(%rsp) + +updateblock: + +# Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + movdqa XMM_QWORD_BSWAP(%rip), %xmm1 + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + movdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_sse t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz updateblock + + # Restore GPRs + mov frame_GPRSAVE(%rsp), %rbx + mov frame_GPRSAVE +8*1(%rsp), %r12 + mov frame_GPRSAVE +8*2(%rsp), %r13 + mov frame_GPRSAVE +8*3(%rsp), %r14 + mov frame_GPRSAVE +8*4(%rsp), %r15 + + # Restore Stack Pointer + mov frame_RSPSAVE(%rsp), %rsp + +nowork: + ret +ENDPROC(sha512_transform_ssse3) + +######################################################################## +### Binary Data + +.data + +.align 16 + +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c new file mode 100644 index 00000000000..8626b03e83b --- /dev/null +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -0,0 +1,330 @@ +/* + * Cryptographic API. + * + * Glue code for the SHA512 Secure Hash Algorithm assembler + * implementation using supplemental SSE3 / AVX / AVX2 instructions. + * + * This file is based on sha512_generic.c + * + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/sha.h> +#include <asm/byteorder.h> +#include <asm/i387.h> +#include <asm/xcr.h> +#include <asm/xsave.h> + +#include <linux/string.h> + +asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, + u64 rounds); +#ifdef CONFIG_AS_AVX +asmlinkage void sha512_transform_avx(const char *data, u64 *digest, + u64 rounds); +#endif +#ifdef CONFIG_AS_AVX2 +asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, + u64 rounds); +#endif + +static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); + + +static int sha512_ssse3_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA512_H0; + sctx->state[1] = SHA512_H1; + sctx->state[2] = SHA512_H2; + sctx->state[3] = SHA512_H3; + sctx->state[4] = SHA512_H4; + sctx->state[5] = SHA512_H5; + sctx->state[6] = SHA512_H6; + sctx->state[7] = SHA512_H7; + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len, unsigned int partial) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + unsigned int done = 0; + + sctx->count[0] += len; + if (sctx->count[0] < len) + sctx->count[1]++; + + if (partial) { + done = SHA512_BLOCK_SIZE - partial; + memcpy(sctx->buf + partial, data, done); + sha512_transform_asm(sctx->buf, sctx->state, 1); + } + + if (len - done >= SHA512_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE; + + sha512_transform_asm(data + done, sctx->state, (u64) rounds); + + done += rounds * SHA512_BLOCK_SIZE; + } + + memcpy(sctx->buf, data + done, len - done); + + return 0; +} + +static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; + int res; + + /* Handle the fast case right here */ + if (partial + len < SHA512_BLOCK_SIZE) { + sctx->count[0] += len; + if (sctx->count[0] < len) + sctx->count[1]++; + memcpy(sctx->buf + partial, data, len); + + return 0; + } + + if (!irq_fpu_usable()) { + res = crypto_sha512_update(desc, data, len); + } else { + kernel_fpu_begin(); + res = __sha512_ssse3_update(desc, data, len, partial); + kernel_fpu_end(); + } + + return res; +} + + +/* Add padding and return the message digest. */ +static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be64 *dst = (__be64 *)out; + __be64 bits[2]; + static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, }; + + /* save number of bits */ + bits[1] = cpu_to_be64(sctx->count[0] << 3); + bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); + + /* Pad out to 112 mod 128 and append length */ + index = sctx->count[0] & 0x7f; + padlen = (index < 112) ? (112 - index) : ((128+112) - index); + + if (!irq_fpu_usable()) { + crypto_sha512_update(desc, padding, padlen); + crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits)); + } else { + kernel_fpu_begin(); + /* We need to fill a whole block for __sha512_ssse3_update() */ + if (padlen <= 112) { + sctx->count[0] += padlen; + if (sctx->count[0] < padlen) + sctx->count[1]++; + memcpy(sctx->buf + index, padding, padlen); + } else { + __sha512_ssse3_update(desc, padding, padlen, index); + } + __sha512_ssse3_update(desc, (const u8 *)&bits, + sizeof(bits), 112); + kernel_fpu_end(); + } + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be64(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha512_ssse3_export(struct shash_desc *desc, void *out) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha512_ssse3_import(struct shash_desc *desc, const void *in) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static int sha384_ssse3_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA384_H0; + sctx->state[1] = SHA384_H1; + sctx->state[2] = SHA384_H2; + sctx->state[3] = SHA384_H3; + sctx->state[4] = SHA384_H4; + sctx->state[5] = SHA384_H5; + sctx->state[6] = SHA384_H6; + sctx->state[7] = SHA384_H7; + + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA512_DIGEST_SIZE]; + + sha512_ssse3_final(desc, D); + + memcpy(hash, D, SHA384_DIGEST_SIZE); + memset(D, 0, SHA512_DIGEST_SIZE); + + return 0; +} + +static struct shash_alg algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_ssse3_init, + .update = sha512_ssse3_update, + .final = sha512_ssse3_final, + .export = sha512_ssse3_export, + .import = sha512_ssse3_import, + .descsize = sizeof(struct sha512_state), + .statesize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_ssse3_init, + .update = sha512_ssse3_update, + .final = sha384_ssse3_final, + .export = sha512_ssse3_export, + .import = sha512_ssse3_import, + .descsize = sizeof(struct sha512_state), + .statesize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +#ifdef CONFIG_AS_AVX +static bool __init avx_usable(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) + return false; + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + + return false; + } + + return true; +} +#endif + +static int __init sha512_ssse3_mod_init(void) +{ + /* test for SSSE3 first */ + if (cpu_has_ssse3) + sha512_transform_asm = sha512_transform_ssse3; + +#ifdef CONFIG_AS_AVX + /* allow AVX to override SSSE3, it's a little faster */ + if (avx_usable()) { +#ifdef CONFIG_AS_AVX2 + if (boot_cpu_has(X86_FEATURE_AVX2)) + sha512_transform_asm = sha512_transform_rorx; + else +#endif + sha512_transform_asm = sha512_transform_avx; + } +#endif + + if (sha512_transform_asm) { +#ifdef CONFIG_AS_AVX + if (sha512_transform_asm == sha512_transform_avx) + pr_info("Using AVX optimized SHA-512 implementation\n"); +#ifdef CONFIG_AS_AVX2 + else if (sha512_transform_asm == sha512_transform_rorx) + pr_info("Using AVX2 optimized SHA-512 implementation\n"); +#endif + else +#endif + pr_info("Using SSSE3 optimized SHA-512 implementation\n"); + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + } + pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + + return -ENODEV; +} + +static void __exit sha512_ssse3_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(sha512_ssse3_mod_init); +module_exit(sha512_ssse3_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); + +MODULE_ALIAS("sha512"); +MODULE_ALIAS("sha384"); diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index ebac16bfa83..05058134c44 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -4,7 +4,7 @@ * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> * - * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,6 +23,7 @@ * */ +#include <linux/linkage.h> #include "glue_helper-asm-avx.S" .file "twofish-avx-x86_64-asm_64.S" @@ -32,6 +33,8 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lxts_gf128mul_and_shl1_mask: + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 .text @@ -243,8 +246,6 @@ vpxor x3, wkey, x3; .align 8 -.type __twofish_enc_blk8,@function; - __twofish_enc_blk8: /* input: * %rdi: ctx, CTX @@ -284,10 +285,9 @@ __twofish_enc_blk8: outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); ret; +ENDPROC(__twofish_enc_blk8) .align 8 -.type __twofish_dec_blk8,@function; - __twofish_dec_blk8: /* input: * %rdi: ctx, CTX @@ -325,12 +325,9 @@ __twofish_dec_blk8: outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); ret; +ENDPROC(__twofish_dec_blk8) -.align 8 -.global twofish_ecb_enc_8way -.type twofish_ecb_enc_8way,@function; - -twofish_ecb_enc_8way: +ENTRY(twofish_ecb_enc_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -346,12 +343,9 @@ twofish_ecb_enc_8way: store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); ret; +ENDPROC(twofish_ecb_enc_8way) -.align 8 -.global twofish_ecb_dec_8way -.type twofish_ecb_dec_8way,@function; - -twofish_ecb_dec_8way: +ENTRY(twofish_ecb_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -367,12 +361,9 @@ twofish_ecb_dec_8way: store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); ret; +ENDPROC(twofish_ecb_dec_8way) -.align 8 -.global twofish_cbc_dec_8way -.type twofish_cbc_dec_8way,@function; - -twofish_cbc_dec_8way: +ENTRY(twofish_cbc_dec_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -393,12 +384,9 @@ twofish_cbc_dec_8way: popq %r12; ret; +ENDPROC(twofish_cbc_dec_8way) -.align 8 -.global twofish_ctr_8way -.type twofish_ctr_8way,@function; - -twofish_ctr_8way: +ENTRY(twofish_ctr_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -421,3 +409,48 @@ twofish_ctr_8way: popq %r12; ret; +ENDPROC(twofish_ctr_8way) + +ENTRY(twofish_xts_enc_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + movq %rsi, %r11; + + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, + RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); + + call __twofish_enc_blk8; + + /* dst <= regs xor IVs(in dst) */ + store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + + ret; +ENDPROC(twofish_xts_enc_8way) + +ENTRY(twofish_xts_dec_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) + */ + + movq %rsi, %r11; + + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ + load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2, + RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); + + call __twofish_dec_blk8; + + /* dst <= regs xor IVs(in dst) */ + store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; +ENDPROC(twofish_xts_dec_8way) diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 658af4bb35c..694ea4587ba 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -20,6 +20,7 @@ .file "twofish-i586-asm.S" .text +#include <linux/linkage.h> #include <asm/asm-offsets.h> /* return address at 0 */ @@ -219,11 +220,7 @@ xor %esi, d ## D;\ ror $1, d ## D; -.align 4 -.global twofish_enc_blk -.global twofish_dec_blk - -twofish_enc_blk: +ENTRY(twofish_enc_blk) push %ebp /* save registers according to calling convention*/ push %ebx push %esi @@ -277,8 +274,9 @@ twofish_enc_blk: pop %ebp mov $1, %eax ret +ENDPROC(twofish_enc_blk) -twofish_dec_blk: +ENTRY(twofish_dec_blk) push %ebp /* save registers according to calling convention*/ push %ebx push %esi @@ -333,3 +331,4 @@ twofish_dec_blk: pop %ebp mov $1, %eax ret +ENDPROC(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index 5b012a2c511..1c3b7ceb36d 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -20,6 +20,8 @@ * */ +#include <linux/linkage.h> + .file "twofish-x86_64-asm-3way.S" .text @@ -214,11 +216,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -.align 8 -.global __twofish_enc_blk_3way -.type __twofish_enc_blk_3way,@function; - -__twofish_enc_blk_3way: +ENTRY(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -250,7 +248,7 @@ __twofish_enc_blk_3way: popq %rbp; /* bool xor */ testb %bpl, %bpl; - jnz __enc_xor3; + jnz .L__enc_xor3; outunpack_enc3(mov); @@ -262,7 +260,7 @@ __twofish_enc_blk_3way: popq %r15; ret; -__enc_xor3: +.L__enc_xor3: outunpack_enc3(xor); popq %rbx; @@ -272,11 +270,9 @@ __enc_xor3: popq %r14; popq %r15; ret; +ENDPROC(__twofish_enc_blk_3way) -.global twofish_dec_blk_3way -.type twofish_dec_blk_3way,@function; - -twofish_dec_blk_3way: +ENTRY(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -313,4 +309,4 @@ twofish_dec_blk_3way: popq %r14; popq %r15; ret; - +ENDPROC(twofish_dec_blk_3way) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 7bcf3fcc366..a039d21986a 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -20,6 +20,7 @@ .file "twofish-x86_64-asm.S" .text +#include <linux/linkage.h> #include <asm/asm-offsets.h> #define a_offset 0 @@ -214,11 +215,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -.align 8 -.global twofish_enc_blk -.global twofish_dec_blk - -twofish_enc_blk: +ENTRY(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -269,8 +266,9 @@ twofish_enc_blk: popq R1 movq $1,%rax ret +ENDPROC(twofish_enc_blk) -twofish_dec_blk: +ENTRY(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ @@ -320,3 +318,4 @@ twofish_dec_blk: popq R1 movq $1,%rax ret +ENDPROC(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 94ac91d26e4..4e3c665be12 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -4,6 +4,8 @@ * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> * + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -26,6 +28,7 @@ #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> +#include <crypto/ablk_helper.h> #include <crypto/algapi.h> #include <crypto/twofish.h> #include <crypto/cryptd.h> @@ -37,7 +40,6 @@ #include <asm/xcr.h> #include <asm/xsave.h> #include <asm/crypto/twofish.h> -#include <asm/crypto/ablk_helper.h> #include <asm/crypto/glue_helper.h> #include <crypto/scatterwalk.h> #include <linux/workqueue.h> @@ -56,12 +58,29 @@ asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); +asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); +asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) { __twofish_enc_blk_3way(ctx, dst, src, false); } +static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, + GLUE_FUNC_CAST(twofish_enc_blk)); +} + +static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, + GLUE_FUNC_CAST(twofish_dec_blk)); +} + static const struct common_glue_ctx twofish_enc = { .num_funcs = 3, @@ -95,6 +114,19 @@ static const struct common_glue_ctx twofish_ctr = { } } }; +static const struct common_glue_ctx twofish_enc_xts = { + .num_funcs = 2, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } + } } +}; + static const struct common_glue_ctx twofish_dec = { .num_funcs = 3, .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, @@ -127,6 +159,19 @@ static const struct common_glue_ctx twofish_dec_cbc = { } } }; +static const struct common_glue_ctx twofish_dec_xts = { + .num_funcs = 2, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } + } } +}; + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -275,54 +320,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TWOFISH_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(twofish_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TWOFISH_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(twofish_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } static struct crypto_alg twofish_algs[10] = { { |
